Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from ultralyticsplus import YOLO | |
import numpy as np | |
from PIL import Image | |
from base64 import b64encode | |
from io import BytesIO | |
from gtts import gTTS | |
from mtranslate import translate | |
from speech_recognition import AudioFile, Recognizer | |
import time | |
from sahi.prediction import ObjectPrediction, PredictionScore | |
from sahi.utils.cv import ( | |
get_bool_mask_from_coco_segmentation, | |
read_image_as_pil, | |
visualize_object_predictions, | |
) | |
# Load the YOLOv8 checkpoint once at import time so every request reuses it.
model = YOLO('ultralyticsplus/yolov8s')

# Class-id -> class-name lookup taken from the loaded model.
CLASS = model.model.names
def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio snippet.

    The text is synthesized with gTTS and embedded directly in the returned
    markup as a base64 data URI, so no file is written to disk.

    Args:
        text (str): text to be spoken (the bot's answer).
        language (str): language code forwarded to gTTS as ``lang``.

    Returns:
        str: HTML containing an ``<audio controls autoplay>`` element.
    """
    speech = gTTS(text=text, lang=language, slow=False)
    buffer = BytesIO()
    speech.write_to_fp(buffer)
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek is required before encoding.
    b64 = b64encode(buffer.getvalue()).decode()
    # gTTS produces MP3 data; label it audio/mpeg rather than audio/wav so the
    # browser does not have to guess the real format.
    html = f"""
    <audio controls autoplay>
    <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
def yolov8_inference(
    image,
    area_thres=0.35,
    defaul_bot_voice="おはいようございます"
):
    """Run YOLOv8 on a frame and zoom in on the largest class-0 detection.

    Only detections whose class id is 0 are considered (presumably "person"
    in the model's label set -- confirm against ``CLASS``). The detection
    whose box covers the largest fraction of the frame is cropped and resized
    back to the frame size. When that fraction reaches ``area_thres`` a spoken
    greeting is generated.

    Args:
        image: Input frame, passed to ``model.predict`` and
            ``read_image_as_pil``.
        area_thres: Minimum box-area / frame-area ratio that triggers the
            greeting.
        defaul_bot_voice: Text handed to ``tts`` (with language "ja") when the
            threshold is reached.
            NOTE(review): the default looks like a misspelling of
            "おはようございます" -- confirm before changing, since it is
            part of the public signature.

    Returns:
        tuple: ``(image, html)``. ``image`` is the zoomed crop of the largest
        class-0 box, or the unmodified frame when nothing matched (``None``
        when no frame was supplied). ``html`` is the autoplay audio markup,
        or ``""`` when no greeting was triggered.
    """
    # Nothing to analyse if no frame was supplied.
    if image is None:
        return None, ""

    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    area_image = image.width * image.height
    boxes = results.boxes

    best_box = None
    most_close = 0
    if boxes is not None:
        for xyxy, cls in zip(boxes.xyxy, boxes.cls):
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # ">=" means the later detection wins on ties.
            if area_rate >= most_close:
                best_box = box
                most_close = area_rate

    # No matching detection: return the plain frame instead of failing on an
    # unassigned output image.
    if best_box is None:
        return image, ""

    # Crop/resize once for the winning box rather than for every candidate.
    out_img = image.crop(tuple(best_box)).resize((image.width, image.height))

    # Synthesize the greeting at most once per frame; the largest box reaching
    # the threshold is equivalent to "any box reached the threshold".
    html_bot_voice = ""
    if most_close >= area_thres:
        html_bot_voice = tts(defaul_bot_voice, language="ja")

    return out_img, html_bot_voice
# Page title shown at the top of the demo.
title = "Detomo Aisatsu Robot"

# The handler returns (image, html): the zoomed view and the audio markup.
outputs = [
    gr.Image(type="filepath", label="Robot View"),
    gr.HTML(),
]

# Stream webcam frames into the detector and refresh outputs on every frame.
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)

demo_app.launch(debug=True)