GonzaloMG committed on
Commit
ca34774
·
verified ·
1 Parent(s): aeb6bb9

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +6 -9
pipeline.py CHANGED
@@ -27,7 +27,7 @@
27
  # Modifications from the original code are marked with '# add' comments.
28
 
29
  from dataclasses import dataclass
30
- from typing import Any, Dict, List, Optional, Tuple, Union
31
 
32
  import numpy as np
33
  import torch
@@ -82,7 +82,7 @@ class E2EMarigoldDepthOutput(BaseOutput):
82
  Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
83
  The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
84
  """
85
-
86
  prediction: Union[np.ndarray, torch.Tensor]
87
  latent: Union[None, torch.Tensor]
88
 
@@ -124,7 +124,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
124
  scheduler: Union[DDIMScheduler],
125
  text_encoder: CLIPTextModel,
126
  tokenizer: CLIPTokenizer,
127
- default_processing_resolution: Optional[int] = 768,
128
  ):
129
  super().__init__()
130
 
@@ -265,8 +265,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
265
  batch_size (`int`, *optional*, defaults to `1`):
266
  Batch size; only matters when passing a tensor of images.
267
  output_type (`str`, *optional*, defaults to `"np"`):
268
- Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
269
- values are: `"np"` (numpy array) or `"pt"` (torch tensor).
270
  output_latent (`bool`, *optional*, defaults to `False`):
271
  When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
272
  within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
@@ -339,9 +338,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
339
 
340
  # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
341
  # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
342
- # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
343
- # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
344
- # model.
345
  # Model invocation: self.unet.
346
  pred_latents = []
347
 
@@ -396,7 +393,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
396
  # 7. Remove padding. The output shape is (PH, PW).
397
  prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
398
 
399
- # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
400
  # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
401
  # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
402
  # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
 
27
  # Modifications from the original code are marked with '# add' comments.
28
 
29
  from dataclasses import dataclass
30
+ from typing import List, Optional, Tuple, Union
31
 
32
  import numpy as np
33
  import torch
 
82
  Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
83
  The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
84
  """
85
+
86
  prediction: Union[np.ndarray, torch.Tensor]
87
  latent: Union[None, torch.Tensor]
88
 
 
124
  scheduler: Union[DDIMScheduler],
125
  text_encoder: CLIPTextModel,
126
  tokenizer: CLIPTokenizer,
127
+ default_processing_resolution: Optional[int] = 768, # add
128
  ):
129
  super().__init__()
130
 
 
265
  batch_size (`int`, *optional*, defaults to `1`):
266
  Batch size; only matters when passing a tensor of images.
267
  output_type (`str`, *optional*, defaults to `"np"`):
268
+ Preferred format of the output's `prediction`. The accepted values are: `"np"` (numpy array) or `"pt"` (torch tensor).
 
269
  output_latent (`bool`, *optional*, defaults to `False`):
270
  When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
271
  within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
 
338
 
339
  # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
340
  # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
341
+ # outputs noise for the predicted modality's latent space.
 
 
342
  # Model invocation: self.unet.
343
  pred_latents = []
344
 
 
393
  # 7. Remove padding. The output shape is (PH, PW).
394
  prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
395
 
396
+ # 9. If `match_input_resolution` is set, the output prediction is upsampled to match the
397
  # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
398
  # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
399
  # setting the `resample_method_output` parameter (e.g., to `"nearest"`).