From a791e54d3086e953a96c5370127b92873b4b4db2 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Mon, 22 Aug 2022 22:50:41 -0700
Subject: [PATCH 01/17] Remove safety checker

---
 .../stable_diffusion/pipeline_stable_diffusion.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 550513b5c943..5d2647cbefa3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -155,9 +155,10 @@ def __call__(
         image = image.cpu().permute(0, 2, 3, 1).numpy()
 
         # run safety checker
-        safety_cheker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
-        image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values)
-
+        #safety_cheker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
+        #image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values)
+        has_nsft_concept = False
+
         if output_type == "pil":
             image = self.numpy_to_pil(image)

From 4ec28732d9284dafba3350760547dc03d0e76449 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Mon, 22 Aug 2022 23:10:29 -0700
Subject: [PATCH 02/17] Fix typo in the has_nsfw_concept variable name

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 5d2647cbefa3..0e9bb6c23b33 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -157,7 +157,7 @@ def __call__(
         # run safety checker
         #safety_cheker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
         #image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values)
-        has_nsft_concept = False
+        has_nsfw_concept = False
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)
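Usage sketch (not part of the series): with the two patches above applied, `__call__` skips the safety model entirely and always reports `nsfw_content_detected` as `False`. A minimal, hedged illustration of the changed return contract — the checkpoint id and prompt below are placeholders, not something these patches prescribe:

```python
from diffusers import StableDiffusionPipeline

# Placeholder checkpoint; any Stable Diffusion weights of this era should behave the same.
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe = pipe.to("cuda")

result = pipe("a photo of an astronaut riding a horse")
image = result["sample"][0]

# The safety checker calls are commented out above, so this flag is now hard-coded.
assert result["nsfw_content_detected"] is False
```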
From 556fa26ec2abc8004339044adc668e71ef75a75e Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Tue, 23 Aug 2022 12:45:06 -0700
Subject: [PATCH 03/17] Add linear interpolation between two prompts

---
 .../pipeline_stable_diffusion.py | 124 ++++++++++++++++++
 1 file changed, 124 insertions(+)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 0e9bb6c23b33..60e3c0e1c1a1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -1,5 +1,6 @@
 import inspect
 import warnings
+import random
 from typing import List, Optional, Union
 
 import torch
@@ -163,3 +164,126 @@ def __call__(
             image = self.numpy_to_pil(image)
 
         return {"sample": image, "nsfw_content_detected": has_nsfw_concept}
+
+    def get_text_latent_space(self, prompt):
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        return text_embeddings
+
+    def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length = 10, save=False, **kwargs):
+        first_embedding = self.get_text_latent_space(first_prompt)
+        second_embedding = self.get_text_latent_space(second_prompt)
+        if not seed:
+            seed = random.randint()
+        generator = torch.Generator("cuda")
+        lerp_embed_points = []
+        for i in range(length):
+            weight = i / length
+            tensor_lerp = torch.lerp(first_embedding, second_embedding, weight)
+            lerp_embed_points.extend(tensor_lerp)
+        images = []
+        for idx, latent_point in enumerate(lerp_embed_points):
+            generator.manual_seed(seed)
+            image = self.image_from_latent_space(latent_point, **kwargs)
+            images.extend(image)
+            if save:
+                image.save(f"{first_prompt}-{second_prompt}-{idx:02d}")
+        return images
+
+
+    def image_from_latent_space(self, text_embeddings,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        **kwargs,):
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise
+        latents = torch.randn(
+            (batch_size, self.unet.in_channels, height // 8, width // 8),
+            generator=generator,
+            device=self.device,
+        )
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents * self.scheduler.sigmas[0]
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in tqdm(enumerate(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                sigma = self.scheduler.sigmas[i]
+                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs)["prev_sample"]
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)["prev_sample"]
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents)
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        return image
+
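Usage sketch (not part of the patch): the methods above split generation into embedding a prompt with the CLIP text encoder and denoising from a given point in that text latent space, so `torch.lerp` can walk between two prompt embeddings while the initial noise stays fixed. A hedged example of the intended call shape, assuming a patched pipeline instance `pipe`; it relies on the seed and return-value fixes that land later in this series (patches 05–08), and the prompts are placeholders:

```python
# Interpolate between two prompt embeddings with identical starting noise.
result = pipe.lerp_between_prompts(
    "a painting of a cat",
    "a painting of a dog",
    seed=42,                  # fixed seed: every frame restarts from the same generator state
    length=10,                # number of interpolation weights i/length for i in [0, length)
    num_inference_steps=50,   # forwarded to the diffusion loop via **kwargs
)
frames = result["images"]          # one image per interpolated embedding
points = result["latent_points"]   # the lerped text embeddings themselves
```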
From 97cbc2b1d3dd2716e71719fee94e7fcb31e0565f Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Tue, 23 Aug 2022 13:43:00 -0700
Subject: [PATCH 04/17] Fix a couple of issues with variables that don't exist

---
 .../pipeline_stable_diffusion.py | 37 +++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 60e3c0e1c1a1..a88736dbfb5e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -165,7 +165,7 @@ def __call__(
 
         return {"sample": image, "nsfw_content_detected": has_nsfw_concept}
 
-    def get_text_latent_space(self, prompt):
+    def get_text_latent_space(self, prompt, guidance_scale):
 
         # get prompt text embeddings
         text_input = self.tokenizer(
@@ -176,9 +176,27 @@ def get_text_latent_space(self, prompt):
             return_tensors="pt",
         )
         text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                [""], padding="max_length", max_length=max_length, return_tensors="pt"
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
         return text_embeddings
 
-    def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length = 10, save=False, **kwargs):
+    def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length = 10, save=False, guidance_scale: Optional[float] = 7.5, **kwargs):
         first_embedding = self.get_text_latent_space(first_prompt)
         second_embedding = self.get_text_latent_space(second_prompt)
         if not seed:
@@ -209,23 +227,12 @@ def image_from_latent_space(self, text_embeddings,
         output_type: Optional[str] = "pil",
         **kwargs,):
 
+        batch_size = 1
+
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            max_length = text_input.input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
         # get the initial random noise
         latents = torch.randn(
             (batch_size, self.unet.in_channels, height // 8, width // 8),

From ac2f99befe3cfe216eef9d58bc0f172b781bda48 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Tue, 23 Aug 2022 15:15:30 -0700
Subject: [PATCH 05/17] Disable gradients in image_from_latent_space, which should drastically decrease memory consumption

---
 .../stable_diffusion/pipeline_stable_diffusion.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index a88736dbfb5e..0f6a304b5656 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -1,6 +1,7 @@
 import inspect
 import warnings
 import random
+import sys
 from typing import List, Optional, Union
 
 import torch
@@ -165,7 +166,7 @@ def __call__(
 
         return {"sample": image, "nsfw_content_detected": has_nsfw_concept}
 
-    def get_text_latent_space(self, prompt, guidance_scale):
+    def get_text_latent_space(self, prompt, guidance_scale = 7.5):
 
         # get prompt text embeddings
         text_input = self.tokenizer(
@@ -200,7 +201,7 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
         first_embedding = self.get_text_latent_space(first_prompt)
         second_embedding = self.get_text_latent_space(second_prompt)
         if not seed:
-            seed = random.randint()
+            seed = random.randint(0, sys.maxsize)
         generator = torch.Generator("cuda")
         lerp_embed_points = []
         for i in range(length):
@@ -216,7 +217,7 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
                 image.save(f"{first_prompt}-{second_prompt}-{idx:02d}")
         return images
 
-
+    @torch.no_grad()
     def image_from_latent_space(self, text_embeddings,
         height: Optional[int] = 512,
         width: Optional[int] = 512,
From ae352e115dfa20d1b35394330f8c2389dae4eb04 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Tue, 23 Aug 2022 22:29:37 -0700
Subject: [PATCH 06/17] Add variation function and make all functions return the generator state to facilitate it

---
 .../pipeline_stable_diffusion.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 0f6a304b5656..5e6fc5eefd2f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -203,6 +203,8 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
         if not seed:
             seed = random.randint(0, sys.maxsize)
         generator = torch.Generator("cuda")
+        generator.manual_seed(seed)
+        generator_state = generator.get_state()
         lerp_embed_points = []
         for i in range(length):
@@ -210,12 +212,12 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
             weight = i / length
             tensor_lerp = torch.lerp(first_embedding, second_embedding, weight)
             lerp_embed_points.extend(tensor_lerp)
         images = []
         for idx, latent_point in enumerate(lerp_embed_points):
-            generator.manual_seed(seed)
-            image = self.image_from_latent_space(latent_point, **kwargs)
+            generator.set_state(generator_state)
+            image = self.image_from_latent_space(latent_point, **kwargs)["image"][0]
             images.extend(image)
             if save:
                 image.save(f"{first_prompt}-{second_prompt}-{idx:02d}")
-        return images
+        return {"images": images, "generator_state": generator_state}
 
     @torch.no_grad()
     def image_from_latent_space(self, text_embeddings,
@@ -230,6 +232,9 @@ def image_from_latent_space(self, text_embeddings,
 
         batch_size = 1
 
+        if generator == None:
+            generator = torch.Generator("cuda")
+        generator_state = generator.get_state()
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
@@ -293,5 +298,15 @@ def image_from_latent_space(self, text_embeddings,
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
-        return image
+        return {"image": image, "generator_state": generator_state}
+
+    def variation(self, text_embeddings, generator_state, variation_magnitude = 100, **kwargs):
+        # random vector to move in latent space
+        rand_t = (torch.rand(text_embeddings.shape) * 2) - 1
+        rand_mag = torch.sum(torch.abs(rand_t)) / variation_magnitude
+        scaled_rand_t = rand_t / rand_mag
+        variation_embedding = text_embeddings + scaled_rand_t
+
+        generator = torch.Generator("cuda")
+        generator.set_state(generator_state)
+        return self.image_from_latent_space(variation_embedding, generator=generator, **kwargs)
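Usage sketch (not part of the patch): `variation` nudges a text embedding by a random offset — rescaled so its L1 norm equals `variation_magnitude` — then re-runs diffusion from a saved generator state, so the initial noise is identical and only the embedding differs. A hedged example assuming a patched pipeline instance `pipe` and the device fix in the next patch; the prompt and magnitude are placeholders, and note that patch 09 later renames `image_from_latent_space` to `diffuse_from_inits`:

```python
embedding = pipe.get_text_latent_space("a watercolor landscape")

base = pipe.image_from_latent_space(embedding)
base_image = base["image"][0]

# Same starting noise (restored from generator_state), slightly shifted embedding.
variant = pipe.variation(embedding, base["generator_state"], variation_magnitude=100)
variant_image = variant["image"][0]
```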
From 7e3fe6818498284952c6c208ea2b6883ca7cf031 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Tue, 23 Aug 2022 22:35:32 -0700
Subject: [PATCH 07/17] Account for device in variation generator

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 5e6fc5eefd2f..7c632c3a246f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -302,7 +302,7 @@ def image_from_latent_space(self, text_embeddings,
 
     def variation(self, text_embeddings, generator_state, variation_magnitude = 100, **kwargs):
         # random vector to move in latent space
-        rand_t = (torch.rand(text_embeddings.shape) * 2) - 1
+        rand_t = (torch.rand(text_embeddings.shape, device = self.device) * 2) - 1
         rand_mag = torch.sum(torch.abs(rand_t)) / variation_magnitude
         scaled_rand_t = rand_t / rand_mag
         variation_embedding = text_embeddings + scaled_rand_t

From cf80ce75ee50f0eb7e979c9f6e1954be63e0ed74 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Wed, 24 Aug 2022 13:29:28 -0700
Subject: [PATCH 08/17] Fix lerp_between_prompts and add more items to the return dictionaries

---
 .../stable_diffusion/pipeline_stable_diffusion.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 7c632c3a246f..b17f088648da 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -209,15 +209,15 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
         for i in range(length):
             weight = i / length
             tensor_lerp = torch.lerp(first_embedding, second_embedding, weight)
-            lerp_embed_points.extend(tensor_lerp)
+            lerp_embed_points.append(tensor_lerp)
         images = []
         for idx, latent_point in enumerate(lerp_embed_points):
             generator.set_state(generator_state)
             image = self.image_from_latent_space(latent_point, **kwargs)["image"][0]
-            images.extend(image)
+            images.append(image)
             if save:
                 image.save(f"{first_prompt}-{second_prompt}-{idx:02d}")
-        return {"images": images, "generator_state": generator_state}
+        return {"images": images, "latent_points": lerp_embed_points,"generator_state": generator_state}
 
     @torch.no_grad()
     def image_from_latent_space(self, text_embeddings,
@@ -309,4 +309,6 @@ def variation(self, text_embeddings, generator_state, variation_magnitude = 100,
 
         generator = torch.Generator("cuda")
         generator.set_state(generator_state)
-        return self.image_from_latent_space(variation_embedding, generator=generator, **kwargs)
+        result = self.image_from_latent_space(variation_embedding, generator=generator, **kwargs)
+        result.update({"latent_point": variation_embedding})
+        return result

From 52138b29a6cbc9ba444bdc0829aadc050345ad81 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Thu, 25 Aug 2022 15:50:16 -0700
Subject: [PATCH 09/17] Add slerp_through_seeds, which navigates between two points in the noise space while holding the prompt constant

---
 .../pipeline_stable_diffusion.py | 79 +++++++++++++++++--
 1 file changed, 72 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index b17f088648da..33a3ed6ad62c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -5,6 +5,7 @@
 from typing import List, Optional, Union
 
 import torch
+import numpy as np
 from tqdm.auto import tqdm
 
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
@@ -197,12 +198,40 @@ def get_text_latent_space(self, prompt, guidance_scale = 7.5):
 
         return text_embeddings
 
+    def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
+        """ helper function to spherically interpolate two arrays v1 v2
+        from https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355
+        this should be better than lerping for moving between noise spaces """
+
+        if not isinstance(v0, np.ndarray):
+            inputs_are_torch = True
+            input_device = v0.device
+            v0 = v0.cpu().numpy()
+            v1 = v1.cpu().numpy()
+
+        dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+        if np.abs(dot) > DOT_THRESHOLD:
+            v2 = (1 - t) * v0 + t * v1
+        else:
+            theta_0 = np.arccos(dot)
+            sin_theta_0 = np.sin(theta_0)
+            theta_t = theta_0 * t
+            sin_theta_t = np.sin(theta_t)
+            s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+            s1 = sin_theta_t / sin_theta_0
+            v2 = s0 * v0 + s1 * v1
+
+        if inputs_are_torch:
+            v2 = torch.from_numpy(v2).to(input_device)
+
+        return v2
+
     def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length = 10, save=False, guidance_scale: Optional[float] = 7.5, **kwargs):
         first_embedding = self.get_text_latent_space(first_prompt)
         second_embedding = self.get_text_latent_space(second_prompt)
         if not seed:
             seed = random.randint(0, sys.maxsize)
-        generator = torch.Generator("cuda")
+        generator = torch.Generator(self.device)
         generator.manual_seed(seed)
         generator_state = generator.get_state()
         lerp_embed_points = []
@@ -213,14 +242,51 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
         images = []
         for idx, latent_point in enumerate(lerp_embed_points):
             generator.set_state(generator_state)
-            image = self.image_from_latent_space(latent_point, **kwargs)["image"][0]
+            image = self.diffuse_from_inits(latent_point, **kwargs)["image"][0]
             images.append(image)
             if save:
                 image.save(f"{first_prompt}-{second_prompt}-{idx:02d}")
         return {"images": images, "latent_points": lerp_embed_points,"generator_state": generator_state}
 
+    def slerp_through_seeds(self,
+        prompt,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        save = False,
+        seed = None, steps = 10, **kwargs):
+
+        if not seed:
+            seed = random.randint(0, sys.maxsize)
+        generator = torch.Generator(self.device)
+        generator.manual_seed(seed)
+        init_start = torch.randn(
+            (1, self.unet.in_channels, height // 8, width // 8),
+            generator = generator, device = self.device)
+        init_end = torch.randn(
+            (1, self.unet.in_channels, height // 8, width // 8),
+            generator = generator, device = self.device)
+        generator_state = generator.get_state()
+        slerp_embed_points = []
+        # weight from 0 to 1/(steps - 1), add init_end specifically so that we
+        # have len(images) = steps
+        for i in range(steps - 1):
+            weight = i / steps
+            tensor_slerp = self.slerp(weight, init_start, init_end)
+            slerp_embed_points.append(tensor_slerp)
+        slerp_embed_points.append(init_end)
+        images = []
+        embed_point = self.get_text_latent_space(prompt, **kwargs)
+        for idx, noise_point in enumerate(slerp_embed_points):
+            generator.set_state(generator_state)
+            image = self.diffuse_from_inits(embed_point, init = noise_point, **kwargs)["image"][0]
+            images.append(image)
+            if save:
+                image.save(f"{seed}-{idx:02d}")
+        return {"images": images, "noise_samples": slerp_embed_points,"generator_state": generator_state}
+
     @torch.no_grad()
-    def image_from_latent_space(self, text_embeddings,
+    def diffuse_from_inits(self, text_embeddings,
+        init = None,
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
         **kwargs,):
@@ -240,11 +306,10 @@ def diffuse_from_inits(self, text_embeddings,
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
         # get the initial random noise
-        latents = torch.randn(
+        latents = init if init else torch.randn(
             (batch_size, self.unet.in_channels, height // 8, width // 8),
             generator=generator,
-            device=self.device,
-        )
+            device=self.device,)
 
         # set timesteps
         accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
@@ -309,6 +374,6 @@ def variation(self, text_embeddings, generator_state, variation_magnitude = 100,
 
         generator = torch.Generator("cuda")
         generator.set_state(generator_state)
-        result = self.image_from_latent_space(variation_embedding, generator=generator, **kwargs)
+        result = self.diffuse_from_inits(variation_embedding, generator=generator, **kwargs)
         result.update({"latent_point": variation_embedding})
         return result
From 2cd40fb751e72c9d28c557dd1c0fb7eea487f854 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Thu, 25 Aug 2022 16:17:21 -0700
Subject: [PATCH 10/17] Add the missing self parameter to slerp

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 33a3ed6ad62c..3ec1965f168c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -198,7 +198,7 @@ def get_text_latent_space(self, prompt, guidance_scale = 7.5):
 
         return text_embeddings
 
-    def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
+    def slerp(self, t, v0, v1, DOT_THRESHOLD=0.9995):
         """ helper function to spherically interpolate two arrays v1 v2
         from https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355
         this should be better than lerping for moving between noise spaces """

From 59fd5f2e6cd98c63920436c988aa48257f2e7a00 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Thu, 25 Aug 2022 16:25:09 -0700
Subject: [PATCH 11/17] Can't check whether a tensor is truthy; check that it is not None instead

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 3ec1965f168c..8c1e69908b68 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -306,7 +306,7 @@ def diffuse_from_inits(self, text_embeddings,
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
         # get the initial random noise
-        latents = init if init else torch.randn(
+        latents = init if init is not None else torch.randn(
             (batch_size, self.unet.in_channels, height // 8, width // 8),
             generator=generator,
             device=self.device,)

From 587ea2bf5b31d17d5d81b75d8220eb80520541a3 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Fri, 26 Aug 2022 21:18:27 -0700
Subject: [PATCH 12/17] Fix saving by adding a file extension and format

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index fd1ae3101175..67a0c8793b26 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -252,7 +252,7 @@ def lerp_between_prompts(self, first_prompt, second_prompt, seed = None, length
             image = self.diffuse_from_inits(latent_point, **kwargs)["image"][0]
             images.append(image)
             if save:
-                image.save(f"{first_prompt}-{second_prompt}-{idx:02d}")
+                image.save(f"{first_prompt}-{second_prompt}-{idx:02d}.png", "PNG")
         return {"images": images, "latent_points": lerp_embed_points,"generator_state": generator_state}
 
     def slerp_through_seeds(self,
@@ -288,7 +288,7 @@ def slerp_through_seeds(self,
             image = self.diffuse_from_inits(embed_point, init = noise_point, **kwargs)["image"][0]
             images.append(image)
             if save:
-                image.save(f"{seed}-{idx:02d}")
+                image.save(f"{seed}-{idx:02d}.png", "PNG")
         return {"images": images, "noise_samples": slerp_embed_points,"generator_state": generator_state}

From f235c32916b25d4ab2966cf9a340086ab44159aa Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Fri, 26 Aug 2022 22:51:05 -0700
Subject: [PATCH 13/17] Don't pass kwargs to functions that don't accept kwargs

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 67a0c8793b26..43650caeaad2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -282,7 +282,7 @@ def slerp_through_seeds(self,
             slerp_embed_points.append(tensor_slerp)
         slerp_embed_points.append(init_end)
         images = []
-        embed_point = self.get_text_latent_space(prompt, **kwargs)
+        embed_point = self.get_text_latent_space(prompt)
         for idx, noise_point in enumerate(slerp_embed_points):
             generator.set_state(generator_state)
             image = self.diffuse_from_inits(embed_point, init = noise_point, **kwargs)["image"][0]
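Usage sketch (not part of the patch): with `slerp` now a proper method and the stray kwargs removed, the seed-space walk works end to end — two noise tensors are drawn from one seeded generator and spherically interpolated, which keeps each intermediate tensor at a plausible Gaussian-noise magnitude (the reason the docstring prefers slerp over lerp here). A hedged example assuming a patched pipeline instance `pipe`; the prompt and seed are placeholders:

```python
walk = pipe.slerp_through_seeds(
    "a photo of a lighthouse at dawn",
    seed=1234,               # seeds the generator that draws both noise endpoints
    steps=10,                # total frames; init_end is appended as the final frame
    save=True,               # writes "{seed}-{idx:02d}.png" per frame (after patch 12)
    num_inference_steps=50,  # forwarded to diffuse_from_inits via **kwargs
)
frames = walk["images"]
noise_points = walk["noise_samples"]   # the slerped initial-noise tensors
```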
From 8a111dd06a22985ab5523962f44b1d90bc7a64f9 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Sat, 24 Sep 2022 22:16:28 -0700
Subject: [PATCH 14/17] Restore the tqdm import that was removed during a merge

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 5536396854fb..b4ccc4f33631 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -7,6 +7,8 @@
 
 import torch
 import numpy as np
+from tqdm.auto import tqdm
+
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...models import AutoencoderKL, UNet2DConditionModel

From 7ae0aaa8ae44a5301d87cfa63867444914e0f135 Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Sat, 24 Sep 2022 22:23:54 -0700
Subject: [PATCH 15/17] The output of decoding latents was changed to a data-wrapping object; unwrap it

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index b4ccc4f33631..a0cd93ef06f9 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -366,7 +366,7 @@ def diffuse_from_inits(self, text_embeddings,
 
         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents)
+        image = self.vae.decode(latents).sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()

From 7ac1af647d6f3a9b7cda7c1156480f0afa4b949e Mon Sep 17 00:00:00 2001
From: Charles Saluski
Date: Sat, 24 Sep 2022 23:34:32 -0700
Subject: [PATCH 16/17] Add option to save the intermediate latent spaces from the diffusion

---
 .../stable_diffusion/pipeline_stable_diffusion.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index a0cd93ef06f9..07d047cbf14c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -305,6 +305,7 @@ def diffuse_from_inits(self, text_embeddings,
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
+        save_n_steps: Optional[int] = None,
         **kwargs,):
 
         batch_size = 1
@@ -342,8 +343,14 @@ def diffuse_from_inits(self, text_embeddings,
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
-
+        if save_n_steps:
+            diffuse_latents = []
+        else:
+            diffuse_latents = None
         for i, t in tqdm(enumerate(self.scheduler.timesteps)):
+            if save_n_steps:
+                if i % save_n_steps == 0:
+                    diffuse_latents.append(latents)
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             if isinstance(self.scheduler, LMSDiscreteScheduler):
@@ -374,7 +381,7 @@ def diffuse_from_inits(self, text_embeddings,
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
-        return {"image": image, "generator_state": generator_state}
+        return {"image": image, "generator_state": generator_state, "latents": diffuse_latents}
 
     def variation(self, text_embeddings, generator_state, variation_magnitude = 100, **kwargs):
         # random vector to move in latent space
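Usage sketch (not part of the patch): `save_n_steps` snapshots the raw latent tensor every n scheduler steps during denoising; the next patch extends this to also decode each snapshot through the VAE so intermediate images come back directly. A hedged example against the interface as of this patch, assuming a patched pipeline instance `pipe`; the prompt and intervals are placeholders:

```python
embedding = pipe.get_text_latent_space("a castle in the clouds")

out = pipe.diffuse_from_inits(
    embedding,
    num_inference_steps=50,
    save_n_steps=10,        # snapshot latents at steps 0, 10, 20, 30, 40
)
final_image = out["image"][0]
snapshots = out["latents"]  # list of latent tensors, noisiest first
```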
From: Charles Saluski
Date: Sat, 24 Sep 2022 23:47:24 -0700
Subject: [PATCH 17/17] Add option to save the intermediate latent spaces from the diffusion as images instead of raw latent tensors

---
 .../pipeline_stable_diffusion.py | 20 ++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 07d047cbf14c..5b3e5ec2611f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -344,13 +344,25 @@ def diffuse_from_inits(self, text_embeddings,
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         if save_n_steps:
-            diffuse_latents = []
+            mid_latents = []
+            mid_images = []
         else:
-            diffuse_latents = None
+            mid_latents = None
+            mid_images = None
         for i, t in tqdm(enumerate(self.scheduler.timesteps)):
             if save_n_steps:
                 if i % save_n_steps == 0:
-                    diffuse_latents.append(latents)
+                    # scale and decode the image latents with vae
+                    dec_mid_latents = 1 / 0.18215 * latents
+                    mid_latents.append(dec_mid_latents)
+                    image = self.vae.decode(dec_mid_latents).sample
+
+                    image = (image / 2 + 0.5).clamp(0, 1)
+                    image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+                    if output_type == "pil":
+                        image = self.numpy_to_pil(image)
+                    mid_images.append(image)
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             if isinstance(self.scheduler, LMSDiscreteScheduler):
@@ -381,7 +393,7 @@ def diffuse_from_inits(self, text_embeddings,
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
-        return {"image": image, "generator_state": generator_state, "latents": diffuse_latents}
+        return {"image": image, "generator_state": generator_state, "mid_latents": mid_latents, "mid_images": mid_images}
 
     def variation(self, text_embeddings, generator_state, variation_magnitude = 100, **kwargs):
         # random vector to move in latent space