Spaces:
Runtime error
Runtime error
import gradio as gr | |
from huggingface_hub import from_pretrained_keras | |
from PIL import Image | |
import io | |
import matplotlib.pyplot as plt | |
import os | |
import re | |
import zipfile | |
import numpy as np | |
import tensorflow as tf | |
from tensorflow import keras | |
# Example images offered in the demo UI: at most 10 regular files taken from
# the bundled COCO test directory.
coco_dir = 'coco/images/test2017/'
coco_image = []
# A missing example directory should not prevent the app from starting; the
# demo simply comes up without examples.
if os.path.isdir(coco_dir):
    # Sort so the chosen examples are deterministic (os.listdir order is
    # arbitrary), and count files rather than directory entries so that
    # sub-directories do not use up the 10-example budget.
    for entry in sorted(os.listdir(coco_dir)):
        candidate = os.path.join(coco_dir, entry)
        if os.path.isfile(candidate):
            coco_image.append(candidate)
        if len(coco_image) == 10:
            break
class AnchorBox:
    """Generates anchor boxes.

    Anchors are produced for the feature maps at strides
    `[8, 16, 32, 64, 128]` (pyramid levels 3 through 7). Each anchor box is
    of the format `[x, y, width, height]`.

    Attributes:
      aspect_ratios: A list of float values representing the aspect ratios of
        the anchor boxes at each location on the feature map.
      scales: A list of values representing the scale of the anchor boxes
        at each location on the feature map.
      _num_anchors: The number of anchor boxes at each location on the
        feature map (`len(aspect_ratios) * len(scales)`).
      _strides: The stride of each feature map in the feature pyramid.
      _areas: The base anchor area for each feature map in the feature
        pyramid.
      _anchor_dims: One tensor per pyramid level holding the
        `(width, height)` of every anchor at that level.
    """

    def __init__(self):
        self.aspect_ratios = [0.5, 1.0, 2.0]
        self.scales = [2 ** exponent for exponent in (0, 1 / 3, 2 / 3)]
        self._num_anchors = len(self.aspect_ratios) * len(self.scales)
        self._strides = [2 ** level for level in range(3, 8)]
        self._areas = [side ** 2 for side in (32.0, 64.0, 128.0, 256.0, 512.0)]
        self._anchor_dims = self._compute_dims()

    def _compute_dims(self):
        """Computes anchor box dimensions for all ratios and scales at all
        levels of the feature pyramid.

        Returns:
          A list with one tensor per entry of `self._areas`; each tensor has
          shape `(1, 1, num_anchors, 2)` and stores `(width, height)` pairs.
        """
        per_level = []
        for level_area in self._areas:
            shapes = []
            for aspect in self.aspect_ratios:
                # Pick height/width so that width * height == level_area for
                # the given aspect ratio.
                height = tf.math.sqrt(level_area / aspect)
                width = level_area / height
                base = tf.reshape(
                    tf.stack([width, height], axis=-1), [1, 1, 2]
                )
                shapes.extend(factor * base for factor in self.scales)
            per_level.append(tf.stack(shapes, axis=-2))
        return per_level

    def _get_anchors(self, feature_height, feature_width, level):
        """Generates anchor boxes for a given feature map size and level.

        Arguments:
          feature_height: The height of the feature map.
          feature_width: The width of the feature map.
          level: An integer representing the level of the feature map in the
            feature pyramid (3 is the first level).

        Returns:
          anchor boxes with the shape
          `(feature_height * feature_width * num_anchors, 4)`
        """
        level_index = level - 3
        # Cell centres in feature-map units (hence the +0.5), scaled to image
        # pixels by the level's stride.
        xs = tf.range(feature_width, dtype=tf.float32) + 0.5
        ys = tf.range(feature_height, dtype=tf.float32) + 0.5
        grid = tf.stack(tf.meshgrid(xs, ys), axis=-1) * self._strides[level_index]
        grid = tf.tile(
            tf.expand_dims(grid, axis=-2), [1, 1, self._num_anchors, 1]
        )
        # Repeat the per-level (width, height) table over every grid cell.
        sizes = tf.tile(
            self._anchor_dims[level_index], [feature_height, feature_width, 1, 1]
        )
        boxes = tf.concat([grid, sizes], axis=-1)
        return tf.reshape(
            boxes, [feature_height * feature_width * self._num_anchors, 4]
        )

    def get_anchors(self, image_height, image_width):
        """Generates anchor boxes for all the feature maps of the feature
        pyramid.

        Arguments:
          image_height: Height of the input image.
          image_width: Width of the input image.

        Returns:
          anchor boxes for all the feature maps, concatenated into a single
          tensor with shape `(total_anchors, 4)`
        """
        per_level = []
        for level in range(3, 8):
            downscale = 2 ** level
            per_level.append(
                self._get_anchors(
                    tf.math.ceil(image_height / downscale),
                    tf.math.ceil(image_width / downscale),
                    level,
                )
            )
        return tf.concat(per_level, axis=0)
class DecodePredictions(tf.keras.layers.Layer):
    """A Keras layer that decodes predictions of the RetinaNet model.

    Attributes:
      num_classes: Number of classes in the dataset
      confidence_threshold: Minimum class probability, below which detections
        are pruned.
      nms_iou_threshold: IOU threshold for the NMS operation
      max_detections_per_class: Maximum number of detections to retain per
        class.
      max_detections: Maximum number of detections to retain across all
        classes.
      box_variance: The scaling factors used to scale the bounding box
        predictions.
    """

    def __init__(
        self,
        num_classes=80,
        confidence_threshold=0.05,
        nms_iou_threshold=0.5,
        max_detections_per_class=100,
        max_detections=100,
        box_variance=(0.1, 0.1, 0.2, 0.2),
        **kwargs
    ):
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.confidence_threshold = confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.max_detections_per_class = max_detections_per_class
        self.max_detections = max_detections

        self._anchor_box = AnchorBox()
        # Honour the caller-supplied variance; previously the argument was
        # accepted but a hard-coded copy of the default was used instead.
        self._box_variance = tf.convert_to_tensor(box_variance, dtype=tf.float32)

    def _decode_box_predictions(self, anchor_boxes, box_predictions):
        """Turns box regression outputs into corner-format boxes.

        Arguments:
          anchor_boxes: Anchors in `[x, y, width, height]` format, indexed as
            `[batch, anchor, 4]`.
          box_predictions: Regression outputs, indexed as `[batch, anchor, 4]`.

        Returns:
          The decoded boxes converted by `convert_to_corners`.
        """
        boxes = box_predictions * self._box_variance
        boxes = tf.concat(
            [
                # Centre offsets are relative to the anchor size.
                boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
                # Width/height are predicted in log space.
                tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
            ],
            axis=-1,
        )
        return convert_to_corners(boxes)

    def call(self, images, predictions):
        """Decodes raw predictions and applies combined non-max suppression.

        Arguments:
          images: The batch of input images; only its height and width
            (axes 1 and 2) are used, to generate the anchors.
          predictions: Model output whose last axis holds 4 box regression
            values followed by the per-class logits.

        Returns:
          The result of `tf.image.combined_non_max_suppression`.
        """
        image_shape = tf.cast(tf.shape(images), dtype=tf.float32)
        anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2])
        box_predictions = predictions[:, :, :4]
        cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
        boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions)

        return tf.image.combined_non_max_suppression(
            tf.expand_dims(boxes, axis=2),
            cls_predictions,
            self.max_detections_per_class,
            self.max_detections,
            self.nms_iou_threshold,
            self.confidence_threshold,
            clip_boxes=False,
        )
def convert_to_corners(boxes):
    """Converts centre-format boxes to corner coordinates.

    Arguments:
      boxes: A tensor of rank 2 or higher with a shape of `(..., num_boxes, 4)`
        representing bounding boxes where each box is of the format
        `[x, y, width, height]`.

    Returns:
      A tensor with the same shape as `boxes` where each box is
      `[x - width / 2, y - height / 2, x + width / 2, y + height / 2]`.
    """
    centers = boxes[..., :2]
    half_extent = boxes[..., 2:] / 2.0
    top_left = centers - half_extent
    bottom_right = centers + half_extent
    return tf.concat([top_left, bottom_right], axis=-1)
def resize_and_pad_image(
    image, min_side=800.0, max_side=1333.0, jitter=(640, 1024), stride=128.0
):
    """Resizes and pads image while preserving aspect ratio.

    1. Resizes images so that the shorter side is equal to `min_side`
    2. If the longer side is greater than `max_side`, then resize the image
      with longer side equal to `max_side`
    3. Pad with zeros on right and bottom to make the image shape divisible by
    `stride`

    Arguments:
      image: A 3-D tensor of shape `(height, width, channels)` representing an
        image.
      min_side: The shorter side of the image is resized to this value, if
        `jitter` is set to None.
      max_side: If the longer side of the image exceeds this value after
        resizing, the image is resized such that the longer side now equals to
        this value.
      jitter: A pair `(minimum, maximum)` for scale jittering. If not None,
        the shorter side of the image will be resized to a random value in
        this range instead of `min_side`.
      stride: The stride of the smallest feature map in the feature pyramid.
        Can be calculated using `image_size / feature_map_size`.

    Returns:
      image: Resized and padded image.
      image_shape: Shape (height, width) of the image after resizing and
        before padding, as a float tensor.
      ratio: The scaling factor used to resize the image
    """
    image_shape = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    if jitter is not None:
        min_side = tf.random.uniform((), jitter[0], jitter[1], dtype=tf.float32)

    longest_side = tf.reduce_max(image_shape)
    ratio = min_side / tf.reduce_min(image_shape)
    # Fall back to fitting the longer side when the min-side scaling would
    # push it past `max_side`.
    if ratio * longest_side > max_side:
        ratio = max_side / longest_side

    image_shape = ratio * image_shape
    image = tf.image.resize(image, tf.cast(image_shape, dtype=tf.int32))

    # Round each dimension up to the next multiple of `stride`.
    padded_image_shape = tf.cast(
        tf.math.ceil(image_shape / stride) * stride, dtype=tf.int32
    )
    image = tf.image.pad_to_bounding_box(
        image, 0, 0, padded_image_shape[0], padded_image_shape[1]
    )
    return image, image_shape, ratio
def visualize_detections(
    image, boxes, classes, scores, figsize=(7, 7), linewidth=1, color=(0, 0, 1)
):
    """Draws detections on top of an image with Matplotlib.

    Arguments:
      image: The image to draw on; converted to a `uint8` NumPy array.
      boxes: Iterable of boxes, each unpacked as `(x1, y1, x2, y2)`.
      classes: Iterable of class labels, one per box.
      scores: Iterable of confidence scores, one per box.
      figsize: Figure size passed to `plt.figure`.
      linewidth: Line width of the box outlines.
      color: Colour used for the box outline and the label background.

    Returns:
      The Matplotlib axes the detections were drawn on. The figure is left
      open so the caller can save it; the caller is responsible for closing
      it.
    """
    image = np.array(image, dtype=np.uint8)
    plt.figure(figsize=figsize)
    plt.axis("off")
    plt.imshow(image)
    ax = plt.gca()
    for box, _cls, score in zip(boxes, classes, scores):
        label = f"{_cls}: {score:.2f}"
        x1, y1, x2, y2 = box
        width, height = x2 - x1, y2 - y1
        outline = plt.Rectangle(
            [x1, y1], width, height, fill=False, edgecolor=color, linewidth=linewidth
        )
        ax.add_patch(outline)
        ax.text(
            x1,
            y1,
            label,
            bbox={"facecolor": color, "alpha": 0.4},
            clip_box=ax.clipbox,
            clip_on=True,
        )
    plt.show()
    return ax
def prepare_image(image):
    """Prepares a single image for inference.

    The image is resized and padded without scale jitter, passed through the
    ResNet `preprocess_input`, and given a leading batch axis.

    Arguments:
      image: The image to prepare, as accepted by `resize_and_pad_image`.

    Returns:
      A tuple `(batched_image, ratio)` where `ratio` is the scaling factor
      reported by `resize_and_pad_image`.
    """
    resized, _resized_shape, ratio = resize_and_pad_image(image, jitter=None)
    normalized = tf.keras.applications.resnet.preprocess_input(resized)
    batched = tf.expand_dims(normalized, axis=0)
    return batched, ratio
# Load the pretrained RetinaNet from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/Object-Detection-RetinaNet")

# Wrap it in an inference graph: image -> raw predictions -> decoded and
# NMS-filtered detections. Only detections scoring at least 0.5 are kept.
img_input = tf.keras.Input(shape=[None, None, 3], name="image")
predictions = model(img_input, training=False)
_decode_layer = DecodePredictions(confidence_threshold=0.5)
detections = _decode_layer(img_input, predictions)
inference_model = tf.keras.Model(inputs=img_input, outputs=detections)
def predict(image):
    """Runs object detection on an image and returns the annotated picture.

    Arguments:
      image: The input image; it is passed to `prepare_image` for the model
        and drawn as the background of the result.

    Returns:
      A PIL image of the Matplotlib figure with the detected boxes, class
      labels and scores drawn on it.
    """
    input_image, ratio = prepare_image(image)
    detections = inference_model.predict(input_image)
    num_detections = detections.valid_detections[0]

    # NOTE(review): `int2str` (class id -> label name) is not defined in the
    # visible part of this file, so calling it unconditionally raised a
    # NameError on every request. Use it when it is provided at module level
    # and otherwise fall back to showing the numeric class id.
    label_for = globals().get("int2str", str)
    class_names = [
        label_for(int(x)) for x in detections.nmsed_classes[0][:num_detections]
    ]

    ax = visualize_detections(
        image,
        # Boxes are predicted on the resized image; map them back to the
        # original image scale.
        detections.nmsed_boxes[0][:num_detections] / ratio,
        class_names,
        detections.nmsed_scores[0][:num_detections],
    )

    img_buf = io.BytesIO()
    try:
        ax.figure.savefig(img_buf)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each request leaks one figure.
        plt.close(ax.figure)
    img_buf.seek(0)
    return Image.open(img_buf)
# Input
input = gr.inputs.Image(image_mode="RGB", type="numpy", label="Enter Object Image")
# Output
output = gr.outputs.Image(type="pil", label="Detected Objects with Class Category")

title = "Object Detection With RetinaNet"
description = "Upload an Image or take one from examples to localize objects present in an image, and at the same time, classify them into different categories"
# The HTML below contains double quotes, so the Python literal is
# single-quoted; nesting them inside a double-quoted string was a syntax
# error. The notebook link previously had an empty href and now points at the
# URL shown in its text.
article = (
    'Space By: <u><a href="https://github.com/robotjellyzone"><b>Kavya Bisht</b></a></u> \n'
    ' Based on notebook <a href="https://keras.io/examples/vision/retinanet/">'
    '<b>https://keras.io/examples/vision/retinanet/</b></a>'
)

gr.Interface(
    fn=predict,
    inputs=input,
    outputs=output,
    examples=coco_image,
    allow_flagging=False,
    analytics_enabled=False,
    title=title,
    description=description,
    article=article,
).launch(enable_queue=True, debug=True)