Commit 0c8e8a3 · amankishore
Parent(s): 57595b7
Fix xformers, support v2-1

Files changed:
- app.py: +8 -2
- requirements.txt: +1 -2
- train_dreambooth.py: +40 -31
app.py
CHANGED
@@ -34,6 +34,8 @@ if(is_gpu_associated):
     model_v1 = snapshot_download(repo_id="multimodalart/sd-fine-tunable")
     model_v2 = snapshot_download(repo_id="stabilityai/stable-diffusion-2")
     model_v2_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-base")
+    model_v2_1 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1")
+    model_v2_1_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1-base")
     safety_checker = snapshot_download(repo_id="multimodalart/sd-sc")
     model_to_load = model_v1
 
@@ -47,8 +49,12 @@ def swap_base_model(selected_model):
         model_to_load = model_v1
     elif(selected_model == "v2-768"):
         model_to_load = model_v2
-    else:
+    elif(selected_model == "v2-512"):
         model_to_load = model_v2_512
+    elif(selected_model == "v2-1-768"):
+        model_to_load = model_v2_1
+    else:
+        model_to_load = model_v2_1_512
 
 def count_files(*inputs):
     file_counter = 0
@@ -532,7 +538,7 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Accordion("Custom Settings", open=False):
             with gr.Row() as what_are_you_training:
-                base_model_to_use = gr.Dropdown(label="Which base model would you like to use?", choices=["v1-5", "v2-512", "v2-768"], value="v1-5", interactive=True)
+                base_model_to_use = gr.Dropdown(label="Which base model would you like to use?", choices=["v1-5", "v2-512", "v2-768", "v2-1-512", "v2-1-768"], value="v1-5", interactive=True)
 
             swap_auto_calculated = gr.Checkbox(label="Use custom settings")
             gr.Markdown("If not checked, the % of frozen encoder will be tuned automatically to whether you are training an `object`, `person` or `style`. The text-encoder is frozen after 10% of the steps for a style, 30% of the steps for an object and 75% trained for persons. The number of steps varies between 1400 and 2400 depending on how many images uploaded. If you see too many artifacts in your output, it means it may have overfit and you need less steps. If your results aren't really what you wanted, it may be underfitting and you need more steps.")
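For reference, the automatic schedule described in the Markdown text above can be written as a small helper. This is an illustrative sketch, not code from app.py: the function name and the example step counts are hypothetical, and only the 10% / 30% / 75% fractions come from the UI text.

# Hypothetical helper mirroring the auto-tuning described above; not part of app.py.
def stop_text_encoder_step(concept_type: str, max_train_steps: int) -> int:
    # Fractions quoted in the UI text: style 10%, object 30%, person 75%.
    freeze_after = {"style": 0.10, "object": 0.30, "person": 0.75}
    return int(max_train_steps * freeze_after[concept_type])

# Example: a person session with 2400 total steps keeps the text encoder training
# for the first 1800 steps; a style session with 1400 steps freezes it after 140.
print(stop_text_encoder_step("person", 2400))  # 1800
print(stop_text_encoder_step("style", 1400))   # 140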
requirements.txt
CHANGED
@@ -14,5 +14,4 @@ triton==2.0.0.dev20220701
 bitsandbytes
 python-slugify
 requests
-tensorboard
-https://github.com/apolinario/xformers/releases/download/0.0.2/xformers-0.0.14.dev0-cp38-cp38-linux_x86_64.whl
+tensorboard
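Dropping the pinned xformers wheel presumably pairs with the runtime guard added in train_dreambooth.py below: the script now imports is_xformers_available and only turns on memory-efficient attention when a working xformers build is actually present.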
train_dreambooth.py
CHANGED
@@ -19,6 +19,7 @@ from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
+from diffusers.utils.import_utils import is_xformers_available
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
@@ -197,15 +198,15 @@ def parse_args():
         default=1,
         help=("Save the model every n global_steps"),
     )
-
-
+
+
     parser.add_argument(
         "--save_starting_step",
         type=int,
         default=1,
         help=("The step from which it starts saving intermediary checkpoints"),
     )
-
+
     parser.add_argument(
         "--stop_text_encoder_training",
         type=int,
@@ -218,39 +219,39 @@ def parse_args():
         "--image_captions_filename",
         action="store_true",
         help="Get captions from filename",
-    )
-
-
+    )
+
+
     parser.add_argument(
         "--dump_only_text_encoder",
         action="store_true",
-        default=False,
+        default=False,
         help="Dump only text encoder",
     )
 
     parser.add_argument(
         "--train_only_unet",
         action="store_true",
-        default=False,
+        default=False,
         help="Train only the unet",
     )
-
+
     parser.add_argument(
         "--cache_latents",
         action="store_true",
-        default=False,
+        default=False,
         help="Train only the unet",
     )
-
+
     parser.add_argument(
         "--Session_dir",
         type=str,
-        default="",
+        default="",
         help="Current session directory",
-    )
+    )
 
-
-
+
+
 
     parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
 
@@ -304,7 +305,7 @@ class DreamBoothDataset(Dataset):
 
         if args.image_captions_filename:
            self.image_captions_filename = True
-
+
        if class_data_root is not None:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
@@ -334,9 +335,9 @@ class DreamBoothDataset(Dataset):
        instance_image = Image.open(path)
        if not instance_image.mode == "RGB":
            instance_image = instance_image.convert("RGB")
-
+
        instance_prompt = self.instance_prompt
-
+
        if self.image_captions_filename:
            filename = Path(path).stem
            pt=''.join([i for i in filename if not i.isdigit()])
@@ -488,7 +489,7 @@ def run_training(args_imported):
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
-                with torch.autocast("cuda"):
+                with torch.autocast("cuda"):
                    images = pipeline(example["prompt"]).images
 
                for i, image in enumerate(images):
@@ -533,6 +534,14 @@ def run_training(args_imported):
    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+    if is_xformers_available():
+        try:
+            print("Enabling memory efficient attention with xformers...")
+            unet.enable_xformers_memory_efficient_attention()
+        except Exception as e:
+            logger.warning(
+                f"Could not enable memory efficient attention. Make sure xformers is installed correctly and a GPU is available: {e}"
+            )
 
    vae.requires_grad_(False)
    if not args.train_text_encoder:
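The new block above is the "Fix xformers" part of this commit: memory-efficient attention is only enabled when the library is detected, and a failure to enable it degrades gracefully instead of aborting the run. A rough standalone sketch of the same guard, outside the training script (the checkpoint id is illustrative, taken from the models referenced in app.py; the logger setup is ours, not the script's):

import logging

from diffusers import UNet2DConditionModel
from diffusers.utils.import_utils import is_xformers_available

logger = logging.getLogger(__name__)

# Illustrative checkpoint; any of the snapshots downloaded in app.py works the same way.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="unet"
)

if is_xformers_available():
    try:
        # Memory-efficient attention lowers VRAM use during training and inference.
        unet.enable_xformers_memory_efficient_attention()
    except Exception as e:
        # Mirrors the patch: a missing or incompatible xformers build is a warning, not a crash.
        logger.warning("Could not enable memory efficient attention: %s", e)
else:
    logger.warning("xformers not installed; using the default attention implementation.")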
@@ -735,7 +744,7 @@ def run_training(args_imported):
 
                # Predict the noise residual
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
+
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
                    target = noise
@@ -743,7 +752,7 @@ def run_training(args_imported):
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
+
                if args.with_prior_preservation:
                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
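The context lines above select the training target from the scheduler's prediction_type, which is what lets the 768-px v2/v2-1 checkpoints (shipped with prediction_type="v_prediction") train alongside the epsilon-prediction ones. A minimal sketch of the same branch in isolation, with dummy tensors and an illustrative checkpoint id:

import torch
from diffusers import DDPMScheduler

# Illustrative checkpoint; the 768-px v2 models ship with prediction_type="v_prediction",
# the 512-px "-base" models with "epsilon".
noise_scheduler = DDPMScheduler.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="scheduler"
)

latents = torch.randn(1, 4, 96, 96)   # dummy latent batch
noise = torch.randn_like(latents)
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,))

# Same branch as the training loop above.
if noise_scheduler.config.prediction_type == "epsilon":
    target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
    target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")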
@@ -780,7 +789,7 @@ def run_training(args_imported):
            fll=round((global_step*100)/args.max_train_steps)
            fll=round(fll/4)
            pr=bar(fll)
-
+
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            progress_bar.set_description_str("Progress:"+pr)
@@ -791,7 +800,7 @@ def run_training(args_imported):
 
            if args.train_text_encoder and global_step == args.stop_text_encoder_training and global_step >= 30:
                if accelerator.is_main_process:
-                    print(" [0;32m" +" Freezing the text_encoder ..."+" [0m")
+                    print(" [0;32m" +" Freezing the text_encoder ..."+" [0m")
                    frz_dir=args.output_dir + "/text_encoder_frozen"
                    if os.path.exists(frz_dir):
                        subprocess.call('rm -r '+ frz_dir, shell=True)
@@ -802,13 +811,13 @@ def run_training(args_imported):
                        text_encoder=accelerator.unwrap_model(text_encoder),
                    )
                    pipeline.text_encoder.save_pretrained(frz_dir)
-
+
            if args.save_n_steps >= 200:
                if global_step < args.max_train_steps and global_step+1==i:
                    ckpt_name = "_step_" + str(global_step+1)
                    save_dir = Path(args.output_dir+ckpt_name)
                    save_dir=str(save_dir)
-                    save_dir=save_dir.replace(" ", "_")
+                    save_dir=save_dir.replace(" ", "_")
                    if not os.path.exists(save_dir):
                        os.mkdir(save_dir)
                    inst=save_dir[16:]
@@ -822,15 +831,15 @@ def run_training(args_imported):
                        text_encoder=accelerator.unwrap_model(text_encoder),
                    )
                    pipeline.save_pretrained(save_dir)
-                    frz_dir=args.output_dir + "/text_encoder_frozen"
+                    frz_dir=args.output_dir + "/text_encoder_frozen"
                    if args.train_text_encoder and os.path.exists(frz_dir):
                        subprocess.call('rm -r '+save_dir+'/text_encoder/*.*', shell=True)
-                        subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True)
+                        subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True)
                    chkpth=args.Session_dir+"/"+inst+".ckpt"
                    subprocess.call('python /content/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py --model_path ' + save_dir + ' --checkpoint_path ' + chkpth + ' --half', shell=True)
                    subprocess.call('rm -r '+ save_dir, shell=True)
                    i=i+args.save_n_steps
-
+
    accelerator.wait_for_everyone()
 
    # Create the pipeline using using the trained modules and save it.
@@ -844,7 +853,7 @@ def run_training(args_imported):
                unet=accelerator.unwrap_model(unet),
                text_encoder=accelerator.unwrap_model(text_encoder),
            )
-            pipeline.text_encoder.save_pretrained(txt_dir)
+            pipeline.text_encoder.save_pretrained(txt_dir)
 
        elif args.train_only_unet:
            pipeline = StableDiffusionPipeline.from_pretrained(
@@ -855,7 +864,7 @@ def run_training(args_imported):
            pipeline.save_pretrained(args.output_dir)
            txt_dir=args.output_dir + "/text_encoder_trained"
            subprocess.call('rm -r '+txt_dir, shell=True)
-
+
        else:
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
@@ -866,7 +875,7 @@ def run_training(args_imported):
            pipeline.save_pretrained(args.output_dir)
            if args.train_text_encoder and os.path.exists(frz_dir):
                subprocess.call('mv -f '+frz_dir +'/*.* '+ args.output_dir+'/text_encoder', shell=True)
-                subprocess.call('rm -r '+ frz_dir, shell=True)
+                subprocess.call('rm -r '+ frz_dir, shell=True)
 
    if args.push_to_hub:
        repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)