HieuPM committed on
Commit 95ddcb0 · 1 Parent(s): 534910b

EDIT: add super-resolution model and inpainting sd

handler.py CHANGED
@@ -1,12 +1,11 @@
 from typing import Dict, List, Any
 import torch
-from diffusers import DPMSolverMultistepScheduler, StableDiffusionInpaintPipeline
+from diffusers import DPMSolverMultistepScheduler, StableDiffusionInpaintPipeline, EulerAncestralDiscreteScheduler
 from PIL import Image
 import base64
 from io import BytesIO
-from datetime import datetime
-import time
-
+import numpy as np
+from RealESRGAN import RealESRGAN
 
 # set device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -14,20 +13,19 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 if device.type != 'cuda':
     raise ValueError("need to run on GPU")
 
-begin_runtime = datetime.now()
-
-def print_current_time(tag: str):
-    global begin_runtime
-    print(str(tag).upper() + ": " + str(datetime.now() - begin_runtime))
-
 class EndpointHandler():
     def __init__(self, path=""):
         # load StableDiffusionInpaintPipeline pipeline
         self.pipe = StableDiffusionInpaintPipeline.from_pretrained(path, torch_dtype=torch.float16)
-        # use DPMSolverMultistepScheduler
-        self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config)
+        # use EulerAncestralDiscreteScheduler
+        self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
+        # self.pipe.enable_sequential_cpu_offload()
         # move to device
-        self.pipe = self.pipe.to(device)
+        self.pipe.to(device)
+        self.pipe.enable_xformers_memory_efficient_attention()
+
+        self.upscaler = RealESRGAN(device, scale=4)
+        self.upscaler.load_weights('weights/RealESRGAN_x4.pth', download=True)
 
 
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
@@ -45,14 +43,12 @@ class EndpointHandler():
 
         # hyperparameters
         num_inference_steps = data.pop("num_inference_steps", 50)
-        # guidance_scale = data.pop("guidance_scale", 7.5)
+        guidance_scale = data.pop("guidance_scale", 7.5)
        negative_prompt = data.pop("negative_prompt", None)
         height = data.pop("height", None)
         width = data.pop("width", None)
 
         # process image
-        print_current_time("Start decoding")
-
         if encoded_image is not None and encoded_mask_image is not None:
             image = self.decode_base64_image(encoded_image)
             mask_image = self.decode_base64_image(encoded_mask_image)
@@ -60,29 +56,31 @@ class EndpointHandler():
             image = None
             mask_image = None
 
-        print_current_time("Finish decoding")
-
         # run inference pipeline
         out = self.pipe(inputs,
                         image=image,
                         mask_image=mask_image,
                         num_inference_steps=num_inference_steps,
-                        # guidance_scale=guidance_scale,
+                        guidance_scale=guidance_scale,
                         num_images_per_prompt=num_images,
                         negative_prompt=negative_prompt,
                         height=height,
                         width=width
-                        )
+                        ).images
+
+        for i in range(len(out)):
+            gen_img = Image.composite(out[i], image.resize(out[i].size), mask_image.resize(out[i].size))
+            gen_img = self.upscaler.predict(gen_img)
+            gen_img = Image.composite(gen_img, image.resize(gen_img.size), mask_image.resize(gen_img.size))
+            out[i] = gen_img
 
         # return generated PIL images
         json_imgs = {}
-        for i in range(len(out.images)):
+        for i in range(len(out)):
             buffered = BytesIO()
-            out.images[i].save(buffered, format="PNG")
+            out[i].save(buffered, format="PNG")
             img_str = base64.b64encode(buffered.getvalue())
             json_imgs[f"{i}"] = img_str.decode()
-
-        print_current_time("Complete Stable diffusion")
         return json_imgs
 
     # helper to decode input image
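After this change, the handler decodes a base64-encoded image and mask, runs Stable Diffusion inpainting, composites the result over the original, upscales it 4x with Real-ESRGAN, composites once more at the higher resolution, and returns the images base64-encoded. Below is a minimal client-side sketch of exercising the updated handler; the payload keys for the prompt and the encoded images ("inputs", "image", "mask_image") are assumptions, since the part of __call__ that pops them lies outside this diff.

# Hypothetical local smoke test for the updated handler.
# Only the hyperparameter keys visible in the diff are confirmed; the
# "inputs", "image", and "mask_image" keys are assumptions.
import base64
from handler import EndpointHandler

def encode_image(path: str) -> str:
    # base64-encode an image file for the JSON payload
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()

handler = EndpointHandler(path=".")
result = handler({
    "inputs": "a red sofa in a bright living room",  # assumed prompt key
    "image": encode_image("room.png"),               # assumed image key
    "mask_image": encode_image("mask.png"),          # assumed mask key
    "num_inference_steps": 30,
    "guidance_scale": 7.5,
})
# the handler returns a dict mapping image index to a base64-encoded PNG
for idx, b64_png in result.items():
    with open(f"out_{idx}.png", "wb") as f:
        f.write(base64.b64decode(b64_png))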
model_index.json DELETED
@@ -1,38 +0,0 @@
-{
-  "_class_name": "StableDiffusionInpaintPipeline",
-  "_diffusers_version": "0.27.2",
-  "_name_or_path": "stabilityai/stable-diffusion-2-1",
-  "feature_extractor": [
-    null,
-    null
-  ],
-  "image_encoder": [
-    null,
-    null
-  ],
-  "requires_safety_checker": false,
-  "safety_checker": [
-    null,
-    null
-  ],
-  "scheduler": [
-    "diffusers",
-    "DDIMScheduler"
-  ],
-  "text_encoder": [
-    "transformers",
-    "CLIPTextModel"
-  ],
-  "tokenizer": [
-    "transformers",
-    "CLIPTokenizer"
-  ],
-  "unet": [
-    "diffusers",
-    "UNet2DConditionModel"
-  ],
-  "vae": [
-    "diffusers",
-    "AutoencoderKL"
-  ]
-}
requirements.txt ADDED
@@ -0,0 +1 @@
+git+https://github.com/sberbank-ai/Real-ESRGAN.git
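The only new dependency is Real-ESRGAN, installed straight from GitHub. A standalone sketch of the three calls the handler uses from it (constructor, load_weights, predict), assuming a CUDA device and a local input.png:

# Standalone sketch of the Real-ESRGAN API used in handler.py.
import torch
from PIL import Image
from RealESRGAN import RealESRGAN

device = torch.device('cuda')
upscaler = RealESRGAN(device, scale=4)
# download=True fetches the checkpoint if weights/RealESRGAN_x4.pth is missing
upscaler.load_weights('weights/RealESRGAN_x4.pth', download=True)

lr_image = Image.open('input.png').convert('RGB')
sr_image = upscaler.predict(lr_image)  # returns a 4x-upscaled PIL image
sr_image.save('input_x4.png')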
scheduler/scheduler_config.json DELETED
@@ -1,20 +0,0 @@
-{
-  "_class_name": "DDIMScheduler",
-  "_diffusers_version": "0.27.2",
-  "beta_end": 0.012,
-  "beta_schedule": "scaled_linear",
-  "beta_start": 0.00085,
-  "clip_sample": false,
-  "clip_sample_range": 1.0,
-  "dynamic_thresholding_ratio": 0.995,
-  "num_train_timesteps": 1000,
-  "prediction_type": "v_prediction",
-  "rescale_betas_zero_snr": false,
-  "sample_max_value": 1.0,
-  "set_alpha_to_one": false,
-  "skip_prk_steps": true,
-  "steps_offset": 1,
-  "thresholding": false,
-  "timestep_spacing": "leading",
-  "trained_betas": null
-}
text_encoder/config.json DELETED
@@ -1,25 +0,0 @@
-{
-  "_name_or_path": "/root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/text_encoder",
-  "architectures": [
-    "CLIPTextModel"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 0,
-  "dropout": 0.0,
-  "eos_token_id": 2,
-  "hidden_act": "gelu",
-  "hidden_size": 1024,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 4096,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 77,
-  "model_type": "clip_text_model",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 23,
-  "pad_token_id": 1,
-  "projection_dim": 512,
-  "torch_dtype": "float16",
-  "transformers_version": "4.38.2",
-  "vocab_size": 49408
-}
text_encoder/model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
-size 680820392
tokenizer/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "!",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
tokenizer/tokenizer_config.json DELETED
@@ -1,38 +0,0 @@
-{
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "!",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "49406": {
-      "content": "<|startoftext|>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "49407": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|startoftext|>",
-  "clean_up_tokenization_spaces": true,
-  "do_lower_case": true,
-  "eos_token": "<|endoftext|>",
-  "errors": "replace",
-  "model_max_length": 77,
-  "pad_token": "!",
-  "tokenizer_class": "CLIPTokenizer",
-  "unk_token": "<|endoftext|>"
-}
tokenizer/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
unet/config.json DELETED
@@ -1,73 +0,0 @@
-{
-  "_class_name": "UNet2DConditionModel",
-  "_diffusers_version": "0.27.2",
-  "_name_or_path": "/root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/unet",
-  "act_fn": "silu",
-  "addition_embed_type": null,
-  "addition_embed_type_num_heads": 64,
-  "addition_time_embed_dim": null,
-  "attention_head_dim": [
-    5,
-    10,
-    20,
-    20
-  ],
-  "attention_type": "default",
-  "block_out_channels": [
-    320,
-    640,
-    1280,
-    1280
-  ],
-  "center_input_sample": false,
-  "class_embed_type": null,
-  "class_embeddings_concat": false,
-  "conv_in_kernel": 3,
-  "conv_out_kernel": 3,
-  "cross_attention_dim": 1024,
-  "cross_attention_norm": null,
-  "down_block_types": [
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D",
-    "DownBlock2D"
-  ],
-  "downsample_padding": 1,
-  "dropout": 0.0,
-  "dual_cross_attention": false,
-  "encoder_hid_dim": null,
-  "encoder_hid_dim_type": null,
-  "flip_sin_to_cos": true,
-  "freq_shift": 0,
-  "in_channels": 4,
-  "layers_per_block": 2,
-  "mid_block_only_cross_attention": null,
-  "mid_block_scale_factor": 1,
-  "mid_block_type": "UNetMidBlock2DCrossAttn",
-  "norm_eps": 1e-05,
-  "norm_num_groups": 32,
-  "num_attention_heads": null,
-  "num_class_embeds": null,
-  "only_cross_attention": false,
-  "out_channels": 4,
-  "projection_class_embeddings_input_dim": null,
-  "resnet_out_scale_factor": 1.0,
-  "resnet_skip_time_act": false,
-  "resnet_time_scale_shift": "default",
-  "reverse_transformer_layers_per_block": null,
-  "sample_size": 96,
-  "time_cond_proj_dim": null,
-  "time_embedding_act_fn": null,
-  "time_embedding_dim": null,
-  "time_embedding_type": "positional",
-  "timestep_post_act": null,
-  "transformer_layers_per_block": 1,
-  "up_block_types": [
-    "UpBlock2D",
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D"
-  ],
-  "upcast_attention": true,
-  "use_linear_projection": true
-}
unet/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8a3a4d7978884c5e4ef00b62641b1b544b257be2f6715d984188610ad6475ad2
-size 1731904736
vae/config.json DELETED
@@ -1,34 +0,0 @@
-{
-  "_class_name": "AutoencoderKL",
-  "_diffusers_version": "0.27.2",
-  "_name_or_path": "/root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/vae",
-  "act_fn": "silu",
-  "block_out_channels": [
-    128,
-    256,
-    512,
-    512
-  ],
-  "down_block_types": [
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D"
-  ],
-  "force_upcast": true,
-  "in_channels": 3,
-  "latent_channels": 4,
-  "latents_mean": null,
-  "latents_std": null,
-  "layers_per_block": 2,
-  "norm_num_groups": 32,
-  "out_channels": 3,
-  "sample_size": 768,
-  "scaling_factor": 0.18215,
-  "up_block_types": [
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D"
-  ]
-}
vae/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
-size 167335342