import gradio as gr
from huggingface_hub import from_pretrained_keras
from PIL import Image
import io
import matplotlib.pyplot as plt
import os
import re
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow import keras
coco_image = []
coco_dir = 'coco/images/test2017/'
for idx, images in enumerate(os.listdir(coco_dir)):
    image = os.path.join(coco_dir, images)
    if os.path.isfile(image) and idx < 10:
        coco_image.append(image)

class AnchorBox:
"""Generates anchor boxes.
This class has operations to generate anchor boxes for feature maps at
strides `[8, 16, 32, 64, 128]`. Where each anchor each box is of the
format `[x, y, width, height]`.
Attributes:
aspect_ratios: A list of float values representing the aspect ratios of
the anchor boxes at each location on the feature map
scales: A list of float values representing the scale of the anchor boxes
at each location on the feature map.
num_anchors: The number of anchor boxes at each location on feature map
areas: A list of float values representing the areas of the anchor
boxes for each feature map in the feature pyramid.
strides: A list of float value representing the strides for each feature
map in the feature pyramid.
"""
    def __init__(self):
        self.aspect_ratios = [0.5, 1.0, 2.0]
        self.scales = [2 ** x for x in [0, 1 / 3, 2 / 3]]
        self._num_anchors = len(self.aspect_ratios) * len(self.scales)
        self._strides = [2 ** i for i in range(3, 8)]
        self._areas = [x ** 2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
        self._anchor_dims = self._compute_dims()
    def _compute_dims(self):
        """Computes anchor box dimensions for all ratios and scales at all
        levels of the feature pyramid.
        """
        anchor_dims_all = []
        for area in self._areas:
            anchor_dims = []
            for ratio in self.aspect_ratios:
                anchor_height = tf.math.sqrt(area / ratio)
                anchor_width = area / anchor_height
                dims = tf.reshape(
                    tf.stack([anchor_width, anchor_height], axis=-1), [1, 1, 2]
                )
                for scale in self.scales:
                    anchor_dims.append(scale * dims)
            anchor_dims_all.append(tf.stack(anchor_dims, axis=-2))
        return anchor_dims_all
    def _get_anchors(self, feature_height, feature_width, level):
        """Generates anchor boxes for a given feature map size and level.

        Arguments:
          feature_height: An integer representing the height of the feature map.
          feature_width: An integer representing the width of the feature map.
          level: An integer representing the level of the feature map in the
            feature pyramid.

        Returns:
          anchor boxes with the shape
          `(feature_height * feature_width * num_anchors, 4)`
        """
        rx = tf.range(feature_width, dtype=tf.float32) + 0.5
        ry = tf.range(feature_height, dtype=tf.float32) + 0.5
        centers = tf.stack(tf.meshgrid(rx, ry), axis=-1) * self._strides[level - 3]
        centers = tf.expand_dims(centers, axis=-2)
        centers = tf.tile(centers, [1, 1, self._num_anchors, 1])
        dims = tf.tile(
            self._anchor_dims[level - 3], [feature_height, feature_width, 1, 1]
        )
        anchors = tf.concat([centers, dims], axis=-1)
        return tf.reshape(
            anchors, [feature_height * feature_width * self._num_anchors, 4]
        )
    def get_anchors(self, image_height, image_width):
        """Generates anchor boxes for all the feature maps of the feature pyramid.

        Arguments:
          image_height: Height of the input image.
          image_width: Width of the input image.

        Returns:
          anchor boxes for all the feature maps, stacked as a single tensor
          with shape `(total_anchors, 4)`
        """
        anchors = [
            self._get_anchors(
                tf.math.ceil(image_height / 2 ** i),
                tf.math.ceil(image_width / 2 ** i),
                i,
            )
            for i in range(3, 8)
        ]
        return tf.concat(anchors, axis=0)

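# Quick sanity check of the anchor arithmetic (comment-only sketch; the numbers
# assume a 640x640 input and are not used by the app): levels 3..7 give feature
# maps of 80, 40, 20, 10 and 5 cells per side, and each cell carries
# 3 ratios * 3 scales = 9 anchors, so
#   (80**2 + 40**2 + 20**2 + 10**2 + 5**2) * 9 = 8525 * 9 = 76725 anchors in total,
# i.e. AnchorBox().get_anchors(640.0, 640.0) would return a tensor of shape (76725, 4).
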
class DecodePredictions(tf.keras.layers.Layer):
"""A Keras layer that decodes predictions of the RetinaNet model.
Attributes:
num_classes: Number of classes in the dataset
confidence_threshold: Minimum class probability, below which detections
are pruned.
nms_iou_threshold: IOU threshold for the NMS operation
max_detections_per_class: Maximum number of detections to retain per
class.
max_detections: Maximum number of detections to retain across all
classes.
box_variance: The scaling factors used to scale the bounding box
predictions.
"""
    def __init__(
        self,
        num_classes=80,
        confidence_threshold=0.05,
        nms_iou_threshold=0.5,
        max_detections_per_class=100,
        max_detections=100,
        box_variance=[0.1, 0.1, 0.2, 0.2],
        **kwargs
    ):
        super(DecodePredictions, self).__init__(**kwargs)
        self.num_classes = num_classes
        self.confidence_threshold = confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.max_detections_per_class = max_detections_per_class
        self.max_detections = max_detections
        self._anchor_box = AnchorBox()
        # Use the passed-in variance instead of silently ignoring the argument.
        self._box_variance = tf.convert_to_tensor(box_variance, dtype=tf.float32)
    def _decode_box_predictions(self, anchor_boxes, box_predictions):
        boxes = box_predictions * self._box_variance
        boxes = tf.concat(
            [
                boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
                tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
            ],
            axis=-1,
        )
        boxes_transformed = convert_to_corners(boxes)
        return boxes_transformed
def call(self, images, predictions):
image_shape = tf.cast(tf.shape(images), dtype=tf.float32)
anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2])
box_predictions = predictions[:, :, :4]
cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions)
return tf.image.combined_non_max_suppression(
tf.expand_dims(boxes, axis=2),
cls_predictions,
self.max_detections_per_class,
self.max_detections,
self.nms_iou_threshold,
self.confidence_threshold,
clip_boxes=False,
)
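# Note on the decoding above: the raw network outputs are offsets relative to the
# anchors. After scaling by the box variance, the first two channels shift the anchor
# centers and the last two scale the anchor sizes through `exp`, so the predicted
# `[x, y, w, h]` box is
#   x = dx * anchor_w + anchor_x,  y = dy * anchor_h + anchor_y,
#   w = exp(dw) * anchor_w,        h = exp(dh) * anchor_h.
# `tf.image.combined_non_max_suppression` then returns a named tuple with fields
# `nmsed_boxes`, `nmsed_scores`, `nmsed_classes` and `valid_detections`, which is
# what `predict()` below unpacks.
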
def convert_to_corners(boxes):
    """Changes the box format to corner coordinates.

    Arguments:
      boxes: A tensor of rank 2 or higher with a shape of `(..., num_boxes, 4)`
        representing bounding boxes where each box is of the format
        `[x, y, width, height]`.

    Returns:
      converted boxes with the same shape as that of boxes.
    """
    return tf.concat(
        [boxes[..., :2] - boxes[..., 2:] / 2.0, boxes[..., :2] + boxes[..., 2:] / 2.0],
        axis=-1,
    )

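# Worked example (comment-only sketch): a center-format box [50.0, 60.0, 20.0, 10.0]
# (center x=50, center y=60, width=20, height=10) becomes the corner-format box
# [40.0, 55.0, 60.0, 65.0], i.e. [x_min, y_min, x_max, y_max].
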
def resize_and_pad_image(
    image, min_side=800.0, max_side=1333.0, jitter=[640, 1024], stride=128.0
):
    """Resizes and pads an image while preserving its aspect ratio.

    1. Resizes the image so that the shorter side is equal to `min_side`
    2. If the longer side is greater than `max_side`, resizes the image so
       that the longer side is equal to `max_side`
    3. Pads with zeros on the right and bottom to make the image shape
       divisible by `stride`

    Arguments:
      image: A 3-D tensor of shape `(height, width, channels)` representing an
        image.
      min_side: The shorter side of the image is resized to this value, if
        `jitter` is set to None.
      max_side: If the longer side of the image exceeds this value after
        resizing, the image is resized so that the longer side equals this
        value.
      jitter: A list of floats containing the minimum and maximum size for
        scale jittering. If provided, the shorter side of the image is resized
        to a random value in this range.
      stride: The stride of the smallest feature map in the feature pyramid.
        Can be calculated as `image_size / feature_map_size`.

    Returns:
      image: Resized and padded image.
      image_shape: Shape of the image before padding.
      ratio: The scaling factor used to resize the image.
    """
    image_shape = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    if jitter is not None:
        min_side = tf.random.uniform((), jitter[0], jitter[1], dtype=tf.float32)
    ratio = min_side / tf.reduce_min(image_shape)
    if ratio * tf.reduce_max(image_shape) > max_side:
        ratio = max_side / tf.reduce_max(image_shape)
    image_shape = ratio * image_shape
    image = tf.image.resize(image, tf.cast(image_shape, dtype=tf.int32))
    padded_image_shape = tf.cast(
        tf.math.ceil(image_shape / stride) * stride, dtype=tf.int32
    )
    image = tf.image.pad_to_bounding_box(
        image, 0, 0, padded_image_shape[0], padded_image_shape[1]
    )
    return image, image_shape, ratio

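# Worked example (comment-only sketch, assuming `jitter=None` as used by
# `prepare_image` below): a 480x640 image gets ratio = 800 / 480 = 1.667; the longer
# side becomes 640 * 1.667 = 1066.7 < 1333, so the image is resized to roughly
# 800x1067 and then zero-padded to 896x1152, the next multiples of stride=128.
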
def visualize_detections(
    image, boxes, classes, scores, figsize=(7, 7), linewidth=1, color=[0, 0, 1]
):
    """Visualizes detections by drawing labelled boxes on the image."""
    image = np.array(image, dtype=np.uint8)
    plt.figure(figsize=figsize)
    plt.axis("off")
    plt.imshow(image)
    ax = plt.gca()
    for box, _cls, score in zip(boxes, classes, scores):
        text = "{}: {:.2f}".format(_cls, score)
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        patch = plt.Rectangle(
            [x1, y1], w, h, fill=False, edgecolor=color, linewidth=linewidth
        )
        ax.add_patch(patch)
        ax.text(
            x1,
            y1,
            text,
            bbox={"facecolor": color, "alpha": 0.4},
            clip_box=ax.clipbox,
            clip_on=True,
        )
    plt.show()
    return ax

def prepare_image(image):
    image, _, ratio = resize_and_pad_image(image, jitter=None)
    image = tf.keras.applications.resnet.preprocess_input(image)
    return tf.expand_dims(image, axis=0), ratio

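# `tf.keras.applications.resnet.preprocess_input` applies the same preprocessing the
# ResNet backbone was trained with (channel reordering to BGR and zero-centering with
# the ImageNet channel means), so no extra normalization is needed before inference.
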
model = from_pretrained_keras("keras-io/Object-Detection-RetinaNet")
img_input = tf.keras.Input(shape=[None, None, 3], name="image")
predictions = model(img_input, training=False)
detections = DecodePredictions(confidence_threshold=0.5)(img_input, predictions)
inference_model = tf.keras.Model(inputs=img_input, outputs=detections)
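
# Minimal usage sketch (comment only; the shapes are illustrative): the wrapped model
# maps a preprocessed image batch directly to NMS-filtered detections, e.g.
#   dummy = tf.zeros([1, 896, 1152, 3])
#   out = inference_model.predict(dummy)
#   out.nmsed_boxes.shape      # (1, 100, 4)
#   out.valid_detections[0]    # number of detections kept after thresholding and NMS
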
def predict(image):
    input_image, ratio = prepare_image(image)
    detections = inference_model.predict(input_image)
    num_detections = detections.valid_detections[0]
    # int2str maps COCO label ids to human-readable class names.
    class_names = [
        int2str(int(x)) for x in detections.nmsed_classes[0][:num_detections]
    ]
    img_buf = io.BytesIO()
    ax = visualize_detections(
        image,
        detections.nmsed_boxes[0][:num_detections] / ratio,
        class_names,
        detections.nmsed_scores[0][:num_detections],
    )
    ax.figure.savefig(img_buf)
    img_buf.seek(0)
    img = Image.open(img_buf)
    return img

# Input
input = gr.inputs.Image(image_mode="RGB", type="numpy", label="Enter Object Image")
# Output
output = gr.outputs.Image(type="pil", label="Detected Objects with Class Category")

title = "Object Detection With RetinaNet"
description = "Upload an image or pick one from the examples to localize the objects present in it and, at the same time, classify them into different categories."
article = (
    'Space by: <u><a href="https://github.com/robotjellyzone"><b>Kavya Bisht</b></a></u>\n'
    'Based on the notebook <a href="https://keras.io/examples/vision/retinanet/"><b>https://keras.io/examples/vision/retinanet/</b></a>'
)

gr.Interface(
    fn=predict,
    inputs=input,
    outputs=output,
    examples=coco_image,
    allow_flagging=False,
    analytics_enabled=False,
    title=title,
    description=description,
    article=article,
).launch(enable_queue=True, debug=True)