import gradio as gr
from huggingface_hub import from_pretrained_keras
from PIL import Image
import io
import matplotlib.pyplot as plt
import os
import re
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

# Collect up to 10 images from the local coco/images folder to use as Gradio examples.
coco_image = []
coco_dir = 'coco/images/'
for idx, images in enumerate(os.listdir(coco_dir)):
    image = os.path.join(coco_dir, images)
    if os.path.isfile(image) and idx < 10:
        coco_image.append(image)

# Load COCO 2017 metadata so label ids can be mapped to class names.
_, dataset_info = tfds.load(
    "coco/2017", split=["train", "validation", "test"], with_info=True, data_dir="data"
)
# test_dataset = tfds.load("coco/2017", split="test", data_dir="data")
int2str = dataset_info.features["objects"]["label"].int2str
class AnchorBox:
    """Generates anchor boxes.

    This class has operations to generate anchor boxes for feature maps at
    strides `[8, 16, 32, 64, 128]`. Each anchor box is of the format
    `[x, y, width, height]`.

    Attributes:
        aspect_ratios: A list of float values representing the aspect ratios of
            the anchor boxes at each location on the feature map.
        scales: A list of float values representing the scale of the anchor boxes
            at each location on the feature map.
        num_anchors: The number of anchor boxes at each location on the feature map.
        areas: A list of float values representing the areas of the anchor
            boxes for each feature map in the feature pyramid.
        strides: A list of float values representing the strides for each feature
            map in the feature pyramid.
    """

    def __init__(self):
        self.aspect_ratios = [0.5, 1.0, 2.0]
        self.scales = [2 ** x for x in [0, 1 / 3, 2 / 3]]
        self._num_anchors = len(self.aspect_ratios) * len(self.scales)
        self._strides = [2 ** i for i in range(3, 8)]
        self._areas = [x ** 2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
        self._anchor_dims = self._compute_dims()
    def _compute_dims(self):
        """Computes anchor box dimensions for all ratios and scales at all levels
        of the feature pyramid.
        """
        anchor_dims_all = []
        for area in self._areas:
            anchor_dims = []
            for ratio in self.aspect_ratios:
                anchor_height = tf.math.sqrt(area / ratio)
                anchor_width = area / anchor_height
                dims = tf.reshape(
                    tf.stack([anchor_width, anchor_height], axis=-1), [1, 1, 2]
                )
                for scale in self.scales:
                    anchor_dims.append(scale * dims)
            anchor_dims_all.append(tf.stack(anchor_dims, axis=-2))
        return anchor_dims_all
    def _get_anchors(self, feature_height, feature_width, level):
        """Generates anchor boxes for a given feature map size and level.

        Arguments:
            feature_height: An integer representing the height of the feature map.
            feature_width: An integer representing the width of the feature map.
            level: An integer representing the level of the feature map in the
                feature pyramid.

        Returns:
            anchor boxes with the shape
            `(feature_height * feature_width * num_anchors, 4)`
        """
        rx = tf.range(feature_width, dtype=tf.float32) + 0.5
        ry = tf.range(feature_height, dtype=tf.float32) + 0.5
        centers = tf.stack(tf.meshgrid(rx, ry), axis=-1) * self._strides[level - 3]
        centers = tf.expand_dims(centers, axis=-2)
        centers = tf.tile(centers, [1, 1, self._num_anchors, 1])
        dims = tf.tile(
            self._anchor_dims[level - 3], [feature_height, feature_width, 1, 1]
        )
        anchors = tf.concat([centers, dims], axis=-1)
        return tf.reshape(
            anchors, [feature_height * feature_width * self._num_anchors, 4]
        )
    def get_anchors(self, image_height, image_width):
        """Generates anchor boxes for all the feature maps of the feature pyramid.

        Arguments:
            image_height: Height of the input image.
            image_width: Width of the input image.

        Returns:
            anchor boxes for all the feature maps, stacked as a single tensor
            with shape `(total_anchors, 4)`
        """
        anchors = [
            self._get_anchors(
                tf.math.ceil(image_height / 2 ** i),
                tf.math.ceil(image_width / 2 ** i),
                i,
            )
            for i in range(3, 8)
        ]
        return tf.concat(anchors, axis=0)
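
# Illustrative note (added for clarity, not in the original Space code): with 3
# aspect ratios and 3 scales there are 9 anchors per location, so for a 640x640
# input the pyramid levels P3-P7 contribute
# (80*80 + 40*40 + 20*20 + 10*10 + 5*5) * 9 = 76725 anchors in total, e.g.
#   AnchorBox().get_anchors(640.0, 640.0)  # -> tensor of shape (76725, 4)
# where each row is in `[x_center, y_center, width, height]` format.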
class DecodePredictions(tf.keras.layers.Layer):
    """A Keras layer that decodes predictions of the RetinaNet model.

    Attributes:
        num_classes: Number of classes in the dataset.
        confidence_threshold: Minimum class probability, below which detections
            are pruned.
        nms_iou_threshold: IOU threshold for the NMS operation.
        max_detections_per_class: Maximum number of detections to retain per
            class.
        max_detections: Maximum number of detections to retain across all
            classes.
        box_variance: The scaling factors used to scale the bounding box
            predictions.
    """

    def __init__(
        self,
        num_classes=80,
        confidence_threshold=0.05,
        nms_iou_threshold=0.5,
        max_detections_per_class=100,
        max_detections=100,
        box_variance=[0.1, 0.1, 0.2, 0.2],
        **kwargs
    ):
        super(DecodePredictions, self).__init__(**kwargs)
        self.num_classes = num_classes
        self.confidence_threshold = confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.max_detections_per_class = max_detections_per_class
        self.max_detections = max_detections
        self._anchor_box = AnchorBox()
        # Use the box_variance argument rather than a hard-coded list, so the
        # parameter actually takes effect (defaults to [0.1, 0.1, 0.2, 0.2]).
        self._box_variance = tf.convert_to_tensor(
            box_variance, dtype=tf.float32
        )
    def _decode_box_predictions(self, anchor_boxes, box_predictions):
        boxes = box_predictions * self._box_variance
        boxes = tf.concat(
            [
                boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
                tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
            ],
            axis=-1,
        )
        boxes_transformed = convert_to_corners(boxes)
        return boxes_transformed

    def call(self, images, predictions):
        image_shape = tf.cast(tf.shape(images), dtype=tf.float32)
        anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2])
        box_predictions = predictions[:, :, :4]
        cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
        boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions)
        return tf.image.combined_non_max_suppression(
            tf.expand_dims(boxes, axis=2),
            cls_predictions,
            self.max_detections_per_class,
            self.max_detections,
            self.nms_iou_threshold,
            self.confidence_threshold,
            clip_boxes=False,
        )
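
# Note on the decoding above (added for clarity, not in the original code): the
# raw outputs [dx, dy, dw, dh] are first scaled by the box variances
# [0.1, 0.1, 0.2, 0.2]; an anchor [xa, ya, wa, ha] is then decoded as
#   x = dx * 0.1 * wa + xa,   y = dy * 0.1 * ha + ya,
#   w = exp(dw * 0.2) * wa,   h = exp(dh * 0.2) * ha,
# and the resulting center/size boxes are converted to corner format for NMS.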
def convert_to_corners(boxes):
    """Changes the box format to corner coordinates.

    Arguments:
        boxes: A tensor of rank 2 or higher with a shape of `(..., num_boxes, 4)`
            representing bounding boxes where each box is of the format
            `[x, y, width, height]`.

    Returns:
        converted boxes with shape same as that of boxes.
    """
    return tf.concat(
        [boxes[..., :2] - boxes[..., 2:] / 2.0, boxes[..., :2] + boxes[..., 2:] / 2.0],
        axis=-1,
    )
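
# Example (illustrative, not in the original code): a box [x=50, y=50, w=20, h=10]
# in center/size format becomes [40, 45, 60, 55] in [x_min, y_min, x_max, y_max].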
def resize_and_pad_image(
    image, min_side=800.0, max_side=1333.0, jitter=[640, 1024], stride=128.0
):
    """Resizes and pads image while preserving aspect ratio.

    1. Resizes images so that the shorter side is equal to `min_side`.
    2. If the longer side is greater than `max_side`, then resize the image
       with longer side equal to `max_side`.
    3. Pad with zeros on right and bottom to make the image shape divisible by
       `stride`.

    Arguments:
        image: A 3-D tensor of shape `(height, width, channels)` representing an
            image.
        min_side: The shorter side of the image is resized to this value, if
            `jitter` is set to None.
        max_side: If the longer side of the image exceeds this value after
            resizing, the image is resized such that the longer side now equals
            this value.
        jitter: A list of floats containing minimum and maximum size for scale
            jittering. If available, the shorter side of the image will be
            resized to a random value in this range.
        stride: The stride of the smallest feature map in the feature pyramid.
            Can be calculated using `image_size / feature_map_size`.

    Returns:
        image: Resized and padded image.
        image_shape: Shape of the image before padding.
        ratio: The scaling factor used to resize the image.
    """
    image_shape = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    if jitter is not None:
        min_side = tf.random.uniform((), jitter[0], jitter[1], dtype=tf.float32)
    ratio = min_side / tf.reduce_min(image_shape)
    if ratio * tf.reduce_max(image_shape) > max_side:
        ratio = max_side / tf.reduce_max(image_shape)
    image_shape = ratio * image_shape
    image = tf.image.resize(image, tf.cast(image_shape, dtype=tf.int32))
    padded_image_shape = tf.cast(
        tf.math.ceil(image_shape / stride) * stride, dtype=tf.int32
    )
    image = tf.image.pad_to_bounding_box(
        image, 0, 0, padded_image_shape[0], padded_image_shape[1]
    )
    return image, image_shape, ratio
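
# Worked example (illustrative, assuming jitter=None as used at inference time):
# a 600x800 image is scaled by 800/600 ≈ 1.33 to roughly 800x1067 (the longer
# side stays below max_side=1333), then zero-padded on the right and bottom to
# 896x1152 so both sides are divisible by the stride of 128.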
def visualize_detections(
    image, boxes, classes, scores, figsize=(7, 7), linewidth=1, color=[0, 0, 1]
):
    """Visualize Detections"""
    image = np.array(image, dtype=np.uint8)
    plt.figure(figsize=figsize)
    plt.axis("off")
    plt.imshow(image)
    ax = plt.gca()
    for box, _cls, score in zip(boxes, classes, scores):
        text = "{}: {:.2f}".format(_cls, score)
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        patch = plt.Rectangle(
            [x1, y1], w, h, fill=False, edgecolor=color, linewidth=linewidth
        )
        ax.add_patch(patch)
        ax.text(
            x1,
            y1,
            text,
            bbox={"facecolor": color, "alpha": 0.4},
            clip_box=ax.clipbox,
            clip_on=True,
        )
    plt.show()
    return ax
def prepare_image(image):
    image, _, ratio = resize_and_pad_image(image, jitter=None)
    image = tf.keras.applications.resnet.preprocess_input(image)
    return tf.expand_dims(image, axis=0), ratio


# Load the pre-trained RetinaNet from the Hugging Face Hub and wrap it with the
# decoding layer to build an end-to-end inference model.
model = from_pretrained_keras("keras-io/Object-Detection-RetinaNet")
img_input = tf.keras.Input(shape=[None, None, 3], name="image")
predictions = model(img_input, training=False)
detections = DecodePredictions(confidence_threshold=0.5)(img_input, predictions)
inference_model = tf.keras.Model(inputs=img_input, outputs=detections)
def predict(image):
    input_image, ratio = prepare_image(image)
    detections = inference_model.predict(input_image)
    num_detections = detections.valid_detections[0]
    class_names = [
        int2str(int(x)) for x in detections.nmsed_classes[0][:num_detections]
    ]
    img_buf = io.BytesIO()
    # Divide the boxes by the resize ratio to map them back to the original
    # image coordinates before drawing.
    ax = visualize_detections(
        image,
        detections.nmsed_boxes[0][:num_detections] / ratio,
        class_names,
        detections.nmsed_scores[0][:num_detections],
    )
    # Render the annotated figure into an in-memory PNG and return it as a PIL image.
    ax.figure.savefig(img_buf)
    img_buf.seek(0)
    img = Image.open(img_buf)
    return img
# Input
input = gr.inputs.Image(image_mode="RGB", type="numpy", label="Enter Object Image")
# Output
output = gr.outputs.Image(type="pil", label="Detected Objects with Class Category")

title = "Object Detection With RetinaNet"
description = "Upload an image or pick one from the examples to localize the objects it contains and, at the same time, classify them into different categories."

gr.Interface(
    fn=predict,
    inputs=input,
    outputs=output,
    examples=coco_image,
    allow_flagging=False,
    analytics_enabled=False,
    title=title,
    description=description,
    article="<center>Space By: <u><a href='https://github.com/robotjellyzone'><b>Kavya Bisht</b></a></u> \n Based on <a href='https://keras.io/examples/vision/retinanet/'><b>this notebook</b></a></center>",
).launch(enable_queue=True, debug=True)