Update pipeline.py
Browse files- pipeline.py +6 -9
pipeline.py
CHANGED
@@ -27,7 +27,7 @@
|
|
27 |
# Modifications from the original code are marked with '# add' comments.
|
28 |
|
29 |
from dataclasses import dataclass
|
30 |
-
from typing import
|
31 |
|
32 |
import numpy as np
|
33 |
import torch
|
@@ -82,7 +82,7 @@ class E2EMarigoldDepthOutput(BaseOutput):
|
|
82 |
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
83 |
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
|
84 |
"""
|
85 |
-
|
86 |
prediction: Union[np.ndarray, torch.Tensor]
|
87 |
latent: Union[None, torch.Tensor]
|
88 |
|
@@ -124,7 +124,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
124 |
scheduler: Union[DDIMScheduler],
|
125 |
text_encoder: CLIPTextModel,
|
126 |
tokenizer: CLIPTokenizer,
|
127 |
-
default_processing_resolution: Optional[int] = 768,
|
128 |
):
|
129 |
super().__init__()
|
130 |
|
@@ -265,8 +265,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
265 |
batch_size (`int`, *optional*, defaults to `1`):
|
266 |
Batch size; only matters when passing a tensor of images.
|
267 |
output_type (`str`, *optional*, defaults to `"np"`):
|
268 |
-
Preferred format of the output's `prediction`
|
269 |
-
values are: `"np"` (numpy array) or `"pt"` (torch tensor).
|
270 |
output_latent (`bool`, *optional*, defaults to `False`):
|
271 |
When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
|
272 |
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
|
@@ -339,9 +338,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
339 |
|
340 |
# 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
|
341 |
# The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
|
342 |
-
# outputs noise for the predicted modality's latent space.
|
343 |
-
# `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
|
344 |
-
# model.
|
345 |
# Model invocation: self.unet.
|
346 |
pred_latents = []
|
347 |
|
@@ -396,7 +393,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
396 |
# 7. Remove padding. The output shape is (PH, PW).
|
397 |
prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
|
398 |
|
399 |
-
# 9. If `match_input_resolution` is set, the output prediction
|
400 |
# input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
|
401 |
# Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
|
402 |
# setting the `resample_method_output` parameter (e.g., to `"nearest"`).
|
|
|
27 |
# Modifications from the original code are marked with '# add' comments.
|
28 |
|
29 |
from dataclasses import dataclass
|
30 |
+
from typing import List, Optional, Tuple, Union
|
31 |
|
32 |
import numpy as np
|
33 |
import torch
|
|
|
82 |
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
83 |
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
|
84 |
"""
|
85 |
+
|
86 |
prediction: Union[np.ndarray, torch.Tensor]
|
87 |
latent: Union[None, torch.Tensor]
|
88 |
|
|
|
124 |
scheduler: Union[DDIMScheduler],
|
125 |
text_encoder: CLIPTextModel,
|
126 |
tokenizer: CLIPTokenizer,
|
127 |
+
default_processing_resolution: Optional[int] = 768, # add
|
128 |
):
|
129 |
super().__init__()
|
130 |
|
|
|
265 |
batch_size (`int`, *optional*, defaults to `1`):
|
266 |
Batch size; only matters when passing a tensor of images.
|
267 |
output_type (`str`, *optional*, defaults to `"np"`):
|
268 |
+
Preferred format of the output's `prediction`. The accepted values are: `"np"` (numpy array) or `"pt"` (torch tensor).
|
|
|
269 |
output_latent (`bool`, *optional*, defaults to `False`):
|
270 |
When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
|
271 |
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
|
|
|
338 |
|
339 |
# 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
|
340 |
# The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
|
341 |
+
# outputs noise for the predicted modality's latent space.
|
|
|
|
|
342 |
# Model invocation: self.unet.
|
343 |
pred_latents = []
|
344 |
|
|
|
393 |
# 7. Remove padding. The output shape is (PH, PW).
|
394 |
prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
|
395 |
|
396 |
+
# 9. If `match_input_resolution` is set, the output predictions are upsampled to match the
|
397 |
# input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
|
398 |
# Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
|
399 |
# setting the `resample_method_output` parameter (e.g., to `"nearest"`).
|