|
"""Compute depth maps for images in the input folder. |
|
""" |
|
import os
import glob
import torch
import utils
import cv2
import argparse
import time

import numpy as np

from imutils.video import VideoStream
from midas.model_loader import default_models, load_model

first_execution = True
|

def process(device, model, model_type, image, input_size, target_size, optimize, use_camera):
    """
    Run the inference and interpolate.

    Args:
        device (torch.device): the torch device used
        model: the model used for inference
        model_type: the type of the model
        image: the image fed into the neural network
        input_size: the size (width, height) of the neural network input (for OpenVINO)
        target_size: the size (width, height) the neural network output is interpolated to
        optimize: optimize the model to half-floats on CUDA?
        use_camera: is the camera used?

    Returns:
        the prediction
    """
    global first_execution

    if "openvino" in model_type:
        # OpenVINO models take a NCHW numpy batch and return a dict keyed by output tensor
        if first_execution or not use_camera:
            print(f"    Input resized to {input_size[0]}x{input_size[1]} before entering the encoder")
            first_execution = False

        sample = [np.reshape(image, (1, 3, *input_size))]
        prediction = model(sample)[model.output(0)][0]
        prediction = cv2.resize(prediction, dsize=target_size, interpolation=cv2.INTER_CUBIC)
    else:
        sample = torch.from_numpy(image).to(device).unsqueeze(0)

        if optimize and device == torch.device("cuda"):
            if first_execution:
                print("  Optimization to half-floats activated. Use with caution, because models like Swin require\n"
                      "  float precision to work properly and may yield non-finite depth values when run with\n"
                      "  half-floats.")
            sample = sample.to(memory_format=torch.channels_last)
            sample = sample.half()

        if first_execution or not use_camera:
            height, width = sample.shape[2:]
            print(f"    Input resized to {width}x{height} before entering the encoder")
            first_execution = False

        prediction = model.forward(sample)
        # Upsample the low-resolution prediction to the target (width, height);
        # interpolate expects (height, width), hence the reversed tuple
        prediction = (
            torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=target_size[::-1],
                mode="bicubic",
                align_corners=False,
            )
            .squeeze()
            .cpu()
            .numpy()
        )

    return prediction
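
# Illustrative standalone use of process(); a sketch, where the weights path and
# the image file name are assumptions:
#
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model, transform, net_w, net_h = load_model(device, "weights/dpt_beit_large_512.pt",
#                                               "dpt_beit_large_512", False, None, False)
#   rgb = utils.read_image("input/example.png")      # RGB with values in [0, 1]
#   sample = transform({"image": rgb})["image"]
#   with torch.no_grad():
#       depth = process(device, model, "dpt_beit_large_512", sample, (net_w, net_h),
#                       rgb.shape[1::-1], optimize=False, use_camera=False)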

def create_side_by_side(image, depth, grayscale):
    """
    Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth
    map for better visibility.

    Args:
        image: the RGB image
        depth: the depth map
        grayscale: use a grayscale colormap?

    Returns:
        the image and depth map placed side by side
    """
    depth_min = depth.min()
    depth_max = depth.max()
    # Normalize the depth map to [0, 255]; guard against division by zero for constant depth maps
    if depth_max - depth_min > np.finfo("float").eps:
        normalized_depth = 255 * (depth - depth_min) / (depth_max - depth_min)
    else:
        normalized_depth = np.zeros_like(depth)
    normalized_depth *= 3

    right_side = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
    if not grayscale:
        right_side = cv2.applyColorMap(np.uint8(right_side), cv2.COLORMAP_INFERNO)

    if image is None:
        return right_side
    else:
        return np.concatenate((image, right_side), axis=1)
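
# Illustrative call (a sketch; file names are assumptions). The left side is expected
# in BGR with values in [0, 255] so that it matches the scale of the visualized depth:
#
#   rgb = utils.read_image("input/example.png")          # RGB, values in [0, 1]
#   vis = create_side_by_side(np.flip(rgb, 2) * 255, prediction, grayscale=False)  # prediction from process()
#   cv2.imwrite("side_by_side.png", vis)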


def run(input_path, output_path, model_path, model_type="dpt_beit_large_512", optimize=False, side=False, height=None,
        square=False, grayscale=False):
    """Run MonoDepthNN to compute depth maps.

    Args:
        input_path (str): path to input folder
        output_path (str): path to output folder
        model_path (str): path to saved model
        model_type (str): the model type
        optimize (bool): optimize the model to half-floats on CUDA?
        side (bool): RGB and depth side by side in output images?
        height (int): inference encoder image height
        square (bool): resize to a square resolution?
        grayscale (bool): use a grayscale colormap?
    """
    print("Initialize")

    # select device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: %s" % device)

    model, transform, net_w, net_h = load_model(device, model_path, model_type, optimize, height, square)

    # get input
    if input_path is not None:
        image_names = glob.glob(os.path.join(input_path, "*"))
        num_images = len(image_names)
    else:
        print("No input path specified. Grabbing images from camera.")

    # create output folder
    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)

    print("Start processing")

    if input_path is not None:
        if output_path is None:
            print("Warning: No output path specified. Images will be processed but not shown or stored anywhere.")
        for index, image_name in enumerate(image_names):

            print("  Processing {} ({}/{})".format(image_name, index + 1, num_images))

            # input
            original_image_rgb = utils.read_image(image_name)  # in [0, 1]
            image = transform({"image": original_image_rgb})["image"]

            # compute
            with torch.no_grad():
                prediction = process(device, model, model_type, image, (net_w, net_h), original_image_rgb.shape[1::-1],
                                     optimize, False)

            # output
            if output_path is not None:
                filename = os.path.join(
                    output_path, os.path.splitext(os.path.basename(image_name))[0] + '-' + model_type
                )
                if not side:
                    utils.write_depth(filename, prediction, grayscale, bits=2)
                else:
                    original_image_bgr = np.flip(original_image_rgb, 2)
                    content = create_side_by_side(original_image_bgr * 255, prediction, grayscale)
                    cv2.imwrite(filename + ".png", content)
                utils.write_pfm(filename + ".pfm", prediction.astype(np.float32))

    else:
        with torch.no_grad():
            fps = 1
            video = VideoStream(0).start()
            time_start = time.time()
            frame_index = 0
            while True:
                frame = video.read()
                if frame is not None:
                    original_image_rgb = np.flip(frame, 2)  # the camera delivers BGR frames
                    image = transform({"image": original_image_rgb / 255})["image"]

                    prediction = process(device, model, model_type, image, (net_w, net_h),
                                         original_image_rgb.shape[1::-1], optimize, True)

                    original_image_bgr = np.flip(original_image_rgb, 2) if side else None
                    content = create_side_by_side(original_image_bgr, prediction, grayscale)
                    cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', content / 255)

                    if output_path is not None:
                        filename = os.path.join(output_path, 'Camera' + '-' + model_type + '_' + str(frame_index))
                        cv2.imwrite(filename + ".png", content)

                    # exponentially weighted moving average of the frame rate
                    alpha = 0.1
                    if time.time() - time_start > 0:
                        fps = (1 - alpha) * fps + alpha * 1 / (time.time() - time_start)
                        time_start = time.time()
                    print(f"\rFPS: {round(fps, 2)}", end="")

                    if cv2.waitKey(1) == 27:  # Escape key
                        break

                    frame_index += 1
        print()

    print("Finished")
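
# Illustrative programmatic call, equivalent to the CLI below (the weights path is
# an assumption; when omitted via the CLI it is resolved through default_models):
#
#   run("input", "output", "weights/dpt_beit_large_512.pt",
#       model_type="dpt_beit_large_512", side=True)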
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
|
|
parser.add_argument('-i', '--input_path', |
|
default=None, |
|
help='Folder with input images (if no input path is specified, images are tried to be grabbed ' |
|
'from camera)' |
|
) |
|
|
|
parser.add_argument('-o', '--output_path', |
|
default=None, |
|
help='Folder for output images' |
|
) |
|
|
|
parser.add_argument('-m', '--model_weights', |
|
default=None, |
|
help='Path to the trained weights of model' |
|
) |
|
|
|
parser.add_argument('-t', '--model_type', |
|
default='dpt_beit_large_512', |
|
help='Model type: ' |
|
'dpt_beit_large_512, dpt_beit_large_384, dpt_beit_base_384, dpt_swin2_large_384, ' |
|
'dpt_swin2_base_384, dpt_swin2_tiny_256, dpt_swin_large_384, dpt_next_vit_large_384, ' |
|
'dpt_levit_224, dpt_large_384, dpt_hybrid_384, midas_v21_384, midas_v21_small_256 or ' |
|
'openvino_midas_v21_small_256' |
|
) |
|
|
|
parser.add_argument('-s', '--side', |
|
action='store_true', |
|
help='Output images contain RGB and depth images side by side' |
|
) |
|
|
|
parser.add_argument('--optimize', dest='optimize', action='store_true', help='Use half-float optimization') |
|
parser.set_defaults(optimize=False) |
|
|
|
parser.add_argument('--height', |
|
type=int, default=None, |
|
help='Preferred height of images feed into the encoder during inference. Note that the ' |
|
'preferred height may differ from the actual height, because an alignment to multiples of ' |
|
'32 takes place. Many models support only the height chosen during training, which is ' |
|
'used automatically if this parameter is not set.' |
|
) |
|
parser.add_argument('--square', |
|
action='store_true', |
|
help='Option to resize images to a square resolution by changing their widths when images are ' |
|
'fed into the encoder during inference. If this parameter is not set, the aspect ratio of ' |
|
'images is tried to be preserved if supported by the model.' |
|
) |
|
parser.add_argument('--grayscale', |
|
action='store_true', |
|
help='Use a grayscale colormap instead of the inferno one. Although the inferno colormap, ' |
|
'which is used by default, is better for visibility, it does not allow storing 16-bit ' |
|
'depth values in PNGs but only 8-bit ones due to the precision limitation of this ' |
|
'colormap.' |
|
) |
|
|
|
args = parser.parse_args() |
|
|
|
|
|
if args.model_weights is None: |
|
args.model_weights = default_models[args.model_type] |
|
|
|
|
|
torch.backends.cudnn.enabled = True |
|
torch.backends.cudnn.benchmark = True |
|
|
|
|
|
run(args.input_path, args.output_path, args.model_weights, args.model_type, args.optimize, args.side, args.height, |
|
args.square, args.grayscale) |
|
|