From bea5a3c935ab822d0d46c5efa55617dee1a70407 Mon Sep 17 00:00:00 2001 From: BZLibby Date: Tue, 5 Sep 2023 18:56:00 -0500 Subject: [PATCH 001/240] rebase diffuser change --- api/onnx_web/models/cnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/onnx_web/models/cnet.py b/api/onnx_web/models/cnet.py index 1c9a9a02..71c4b4d0 100644 --- a/api/onnx_web/models/cnet.py +++ b/api/onnx_web/models/cnet.py @@ -24,7 +24,7 @@ import torch.nn as nn import torch.utils.checkpoint from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.loaders import UNet2DConditionLoadersMixin -from diffusers.models.cross_attention import AttnProcessor +from diffusers.models.attention_processor import AttnProcessor from diffusers.models.embeddings import ( GaussianFourierProjection, TimestepEmbedding, From 0fa03e77ad175b0f8756b138ddc2ed2b793655b9 Mon Sep 17 00:00:00 2001 From: Sean Sube Date: Sun, 10 Sep 2023 11:26:18 -0500 Subject: [PATCH 002/240] feat: add panorama pipeline for SDXL --- api/onnx_web/diffusers/load.py | 33 +- api/onnx_web/diffusers/patches/vae.py | 4 +- .../diffusers/pipelines/panorama_xl.py | 664 ++++++++++++++++++ api/onnx_web/params.py | 6 +- gui/src/strings/de.ts | 1 + gui/src/strings/en.ts | 1 + gui/src/strings/es.ts | 1 + gui/src/strings/fr.ts | 1 + 8 files changed, 693 insertions(+), 18 deletions(-) create mode 100644 api/onnx_web/diffusers/pipelines/panorama_xl.py diff --git a/api/onnx_web/diffusers/load.py b/api/onnx_web/diffusers/load.py index 78123449..36a35b7a 100644 --- a/api/onnx_web/diffusers/load.py +++ b/api/onnx_web/diffusers/load.py @@ -24,6 +24,7 @@ from .patches.vae import VAEWrapper from .pipelines.controlnet import OnnxStableDiffusionControlNetPipeline from .pipelines.lpw import OnnxStableDiffusionLongPromptWeightingPipeline from .pipelines.panorama import OnnxStableDiffusionPanoramaPipeline +from .pipelines.panorama_xl import ORTStableDiffusionXLPanoramaPipeline from .pipelines.pix2pix import OnnxStableDiffusionInstructPix2PixPipeline from .version_safe_diffusers import ( DDIMScheduler, @@ -58,6 +59,7 @@ available_pipelines = { # "inpaint-sdxl": ORTStableDiffusionXLInpaintPipeline, "lpw": OnnxStableDiffusionLongPromptWeightingPipeline, "panorama": OnnxStableDiffusionPanoramaPipeline, + "panorama-sdxl": ORTStableDiffusionXLPanoramaPipeline, "pix2pix": OnnxStableDiffusionInstructPix2PixPipeline, "txt2img-sdxl": ORTStableDiffusionXLPipeline, "txt2img": OnnxStableDiffusionPipeline, @@ -399,7 +401,6 @@ def load_pipeline( ) # make sure XL models are actually being used - # TODO: why is this needed? 
if "text_encoder_session" in components: logger.info( "text encoder matches: %s, %s", @@ -424,23 +425,23 @@ def load_pipeline( pipe.unet.session == components["unet_session"], type(pipe.unet), ) + pipe.unet = None + run_gc([device]) pipe.unet = ORTModelUnet(unet_session, unet_model) if not server.show_progress: pipe.set_progress_bar_config(disable=True) optimize_pipeline(server, pipe) - - if not params.is_xl(): - patch_pipeline(server, pipe, pipeline, pipeline_class, params) + patch_pipeline(server, pipe, pipeline_class, params) server.cache.set(ModelTypes.diffusion, pipe_key, pipe) server.cache.set(ModelTypes.scheduler, scheduler_key, components["scheduler"]) - if not params.is_xl() and hasattr(pipe, "vae_decoder"): + if hasattr(pipe, "vae_decoder"): pipe.vae_decoder.set_tiled(tiled=params.tiled_vae) - if not params.is_xl() and hasattr(pipe, "vae_encoder"): + if hasattr(pipe, "vae_encoder"): pipe.vae_encoder.set_tiled(tiled=params.tiled_vae) # update panorama params @@ -514,17 +515,18 @@ def optimize_pipeline( def patch_pipeline( server: ServerContext, pipe: StableDiffusionPipeline, - pipe_type: str, pipeline: Any, params: ImageParams, ) -> None: logger.debug("patching SD pipeline") - if pipe_type != "lpw": + if params.is_lpw(): pipe._encode_prompt = expand_prompt.__get__(pipe, pipeline) - original_unet = pipe.unet - pipe.unet = UNetWrapper(server, original_unet) + if not params.is_xl(): + original_unet = pipe.unet + pipe.unet = UNetWrapper(server, original_unet) + logger.debug("patched UNet with wrapper") if hasattr(pipe, "vae_decoder"): original_decoder = pipe.vae_decoder @@ -535,6 +537,9 @@ def patch_pipeline( window=params.tiles, overlap=params.overlap, ) + logger.debug("patched VAE decoder with wrapper") + + if hasattr(pipe, "vae_encoder"): original_encoder = pipe.vae_encoder pipe.vae_encoder = VAEWrapper( server, @@ -543,7 +548,7 @@ def patch_pipeline( window=params.tiles, overlap=params.overlap, ) - elif hasattr(pipe, "vae"): - pass # TODO: current wrapper does not work with upscaling VAE - else: - logger.debug("no VAE found to patch") + logger.debug("patched VAE encoder with wrapper") + + if hasattr(pipe, "vae"): + logger.warning("not patching single VAE, tiled VAE may not work") diff --git a/api/onnx_web/diffusers/patches/vae.py b/api/onnx_web/diffusers/patches/vae.py index c5fd6936..48e9358e 100644 --- a/api/onnx_web/diffusers/patches/vae.py +++ b/api/onnx_web/diffusers/patches/vae.py @@ -39,11 +39,13 @@ class VAEWrapper(object): self.tile_overlap_factor = overlap def __call__(self, latent_sample=None, sample=None, **kwargs): + model = self.wrapped.model if hasattr(self.wrapped, "model") else self.wrapped.session + # set timestep dtype to input type sample_dtype = next( ( input.type - for input in self.wrapped.model.get_inputs() + for input in model.get_inputs() if input.name == "sample" or input.name == "latent_sample" ), "tensor(float)", diff --git a/api/onnx_web/diffusers/pipelines/panorama_xl.py b/api/onnx_web/diffusers/pipelines/panorama_xl.py new file mode 100644 index 00000000..5092cae0 --- /dev/null +++ b/api/onnx_web/diffusers/pipelines/panorama_xl.py @@ -0,0 +1,664 @@ +from optimum.onnxruntime.modeling_diffusion import ORTStableDiffusionXLPipelineBase +from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin +from optimum.pipelines.diffusers.pipeline_utils import preprocess, rescale_noise_cfg +from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput +import logging +from typing import 
Any, Optional, List, Union, Tuple, Callable, Dict
+import torch
+import numpy as np
+import PIL
+import inspect
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_WINDOW = 64
+DEFAULT_STRIDE = 16
+
+
+class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMixin):
+    def __init__(
+        self,
+        *args,
+        window: int = DEFAULT_WINDOW,
+        stride: int = DEFAULT_STRIDE,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+
+        self.window = window
+        self.stride = stride
+
+
+    def set_window_size(self, window: int, stride: int):
+        self.window = window
+        self.stride = stride
+
+
+    def get_views(self, panorama_height, panorama_width, window_size, stride):
+        # Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113)
+        panorama_height /= 8
+        panorama_width /= 8
+
+        num_blocks_height = abs((panorama_height - window_size) // stride) + 1
+        num_blocks_width = abs((panorama_width - window_size) // stride) + 1
+        total_num_blocks = int(num_blocks_height * num_blocks_width)
+        logger.debug(
+            "panorama generated %s views, %s by %s blocks",
+            total_num_blocks,
+            num_blocks_height,
+            num_blocks_width,
+        )
+
+        views = []
+        for i in range(total_num_blocks):
+            h_start = int((i // num_blocks_width) * stride)
+            h_end = h_start + window_size
+            w_start = int((i % num_blocks_width) * stride)
+            w_end = w_start + window_size
+            views.append((h_start, h_end, w_start, w_end))
+
+        return views
+
+
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents_img2img(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+        else:
+            init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215)
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = np.concatenate([init_latents], axis=0)
+
+        # add noise to latents using the timesteps
+        noise = generator.randn(*init_latents.shape).astype(dtype)
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep)
+        )
+        return init_latents.numpy()
+
+
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents_text2img(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ ) + + if latents is None: + latents = generator.randn(*shape).astype(dtype) + elif latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * np.float64(self.scheduler.init_noise_sigma) + + return latents + + + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + extra_step_kwargs = {} + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_eta: + extra_step_kwargs["eta"] = eta + + return extra_step_kwargs + + + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ + def text2img( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[np.random.RandomState] = None, + latents: Optional[np.ndarray] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + pooled_prompt_embeds: Optional[np.ndarray] = None, + negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + output_type: str = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`Optional[Union[str, List[str]]]`, defaults to None): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`Optional[int]`, defaults to None): + The height in pixels of the generated image. + width (`Optional[int]`, defaults to None): + The width in pixels of the generated image. + num_inference_steps (`int`, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, defaults to 5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`Optional[Union[str, list]]`): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` + is less than `1`). 
+ num_images_per_prompt (`int`, defaults to 1): + The number of images to generate per prompt. + eta (`float`, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`Optional[np.random.RandomState]`, defaults to `None`):: + A np.random.RandomState to make generation deterministic. + latents (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + callback (Optional[Callable], defaults to `None`): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + guidance_rescale (`float`, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + # 0. Default height and width to unet + height = height or self.unet.config["sample_size"] * self.vae_scale_factor + width = width or self.unet.config["sample_size"] * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + 1.0, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. 
Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + latents = self.prepare_latents_text2img( + batch_size * num_images_per_prompt, + self.unet.config.get("in_channels", 4), + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + + # 6. Prepare extra step kwargs + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids = (original_size + crops_coords_top_left + target_size,) + add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) + + if do_classifier_free_guidance: + prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) + add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) + add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) + add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) + + # Adapted from diffusers to extend it for other runtimes than ORT + timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + + # 8. Panorama additions + views = self.get_views(height, width, self.window, self.stride) + count = np.zeros_like(latents) + value = np.zeros_like(latents) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + for i, t in enumerate(self.progress_bar(timesteps)): + count.fill(0) + value.fill(0) + + for h_start, h_end, w_start, w_end in views: + # get the latents corresponding to the current view coordinates + latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] + + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet( + sample=latent_model_input, + timestep=timestep, + encoder_hidden_states=prompt_embeds, + text_embeds=add_text_embeds, + time_ids=add_time_ids, + ) + noise_pred = noise_pred[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                scheduler_output = self.scheduler.step(
+                    torch.from_numpy(noise_pred), t, torch.from_numpy(latents_for_view), **extra_step_kwargs
+                )
+                latents_view_denoised = scheduler_output.prev_sample.numpy()
+
+                value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
+                count[:, :, h_start:h_end, w_start:w_end] += 1
+
+            # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113
+            latents = np.where(count > 0, value / count, value)
+
+            # call the callback, if provided
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                if callback is not None and i % callback_steps == 0:
+                    callback(i, t, latents)
+
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215)
+            # it seems like there is a strange result for using half-precision vae decoder if batchsize>1
+            image = np.concatenate(
+                [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+            )
+            image = self.watermark.apply_watermark(image)
+
+        # TODO: add image_processor
+        image = np.clip(image / 2 + 0.5, 0, 1).transpose((0, 2, 3, 1))
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
+
+
+    # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__
+    def img2img(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        image: Union[np.ndarray, PIL.Image.Image] = None,
+        strength: float = 0.3,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        pooled_prompt_embeds: Optional[np.ndarray] = None,
+        negative_pooled_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            image (`Union[np.ndarray, PIL.Image.Image]`):
+                `Image`, or tensor representing an image batch, which will be used as the starting point for the process.
+            strength (`float`, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added.
When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, defaults to 5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`Optional[Union[str, list]]`): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` + is less than `1`). + num_images_per_prompt (`int`, defaults to 1): + The number of images to generate per prompt. + eta (`float`, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`Optional[np.random.RandomState]`, defaults to `None`):: + A np.random.RandomState to make generation deterministic. + latents (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + callback (Optional[Callable], defaults to `None`): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + guidance_rescale (`float`, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
+ Guidance rescale factor should fix overexposure when using zero terminal SNR. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Check inputs. Raise error if not correct + self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 1. Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 2. Encode input prompt + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + ) + + # 3. Preprocess image + image = preprocess(image) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) + timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + + latents_dtype = prompt_embeds.dtype + image = image.astype(latents_dtype) + + # 5. Prepare latent variables + latents = self.prepare_latents_img2img( + image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator + ) + + # 6. Prepare extra step kwargs + extra_step_kwargs = {} + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_eta: + extra_step_kwargs["eta"] = eta + + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 8. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype=prompt_embeds.dtype, + ) + + if do_classifier_free_guidance: + prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) + add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) + add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) + add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) + + # 8. Panorama additions + views = self.get_views(height, width, self.window, self.stride) + count = np.zeros_like(latents) + value = np.zeros_like(latents) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + for i, t in enumerate(self.progress_bar(timesteps)): + count.fill(0) + value.fill(0) + + for h_start, h_end, w_start, w_end in views: + # get the latents corresponding to the current view coordinates + latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] + + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet( + sample=latent_model_input, + timestep=timestep, + encoder_hidden_states=prompt_embeds, + text_embeds=add_text_embeds, + time_ids=add_time_ids, + ) + noise_pred = noise_pred[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler_output = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents_for_view), **extra_step_kwargs + ) + latents_view_denoised = scheduler_output.prev_sample.numpy() + + value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised + count[:, :, h_start:h_end, w_start:w_end] += 1 + + # take the MultiDiffusion step. Eq. 
5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113 + latents = np.where(count > 0, value / count, value) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + else: + latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215) + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + image = self.watermark.apply_watermark(image) + + # TODO: add image_processor + image = np.clip(image / 2 + 0.5, 0, 1).transpose((0, 2, 3, 1)) + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) + + + def __call__( + self, + *args, + **kwargs, + ): + if "image" in kwargs or ( + len(args) > 1 + and ( + isinstance(args[1], np.ndarray) or isinstance(args[1], PIL.Image.Image) + ) + ): + logger.debug("running img2img panorama XL pipeline") + return self.img2img(*args, **kwargs) + else: + logger.debug("running txt2img panorama XL pipeline") + return self.text2img(*args, **kwargs) + + +class ORTStableDiffusionXLPanoramaPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPanoramaPipelineMixin): + def __call__(self, *args, **kwargs): + return StableDiffusionXLPanoramaPipelineMixin.__call__(self, *args, **kwargs) diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py index 28825ad4..0ceb99e6 100644 --- a/api/onnx_web/params.py +++ b/api/onnx_web/params.py @@ -259,7 +259,7 @@ class ImageParams: # otherwise, check for additional allowed pipelines if group == "img2img": - if pipeline in ["controlnet", "img2img-sdxl", "lpw", "panorama", "pix2pix"]: + if pipeline in ["controlnet", "img2img-sdxl", "lpw", "panorama", "panorama-sdxl", "pix2pix"]: return pipeline elif pipeline == "txt2img-sdxl": return "img2img-sdxl" @@ -267,7 +267,7 @@ class ImageParams: if pipeline in ["controlnet", "lpw", "panorama"]: return pipeline elif group == "txt2img": - if pipeline in ["lpw", "panorama", "txt2img-sdxl"]: + if pipeline in ["lpw", "panorama", "panorama-sdxl", "txt2img-sdxl"]: return pipeline logger.debug("pipeline %s is not valid for %s", pipeline, group) @@ -280,7 +280,7 @@ class ImageParams: return self.pipeline == "lpw" def is_panorama(self): - return self.pipeline == "panorama" + return self.pipeline in ["panorama", "panorama-sdxl"] def is_pix2pix(self): return self.pipeline == "pix2pix" diff --git a/gui/src/strings/de.ts b/gui/src/strings/de.ts index 922ff27d..fb324301 100644 --- a/gui/src/strings/de.ts +++ b/gui/src/strings/de.ts @@ -188,6 +188,7 @@ export const I18N_STRINGS_DE = { 'inpaint-sdxl': '', 'lpw': '', 'panorama': '', + 'panorama-sdxl': '', 'pix2pix': '', 'txt2img': '', 'txt2img-sdxl': '', diff --git a/gui/src/strings/en.ts b/gui/src/strings/en.ts index b7fe2cc6..fc2eb448 100644 --- a/gui/src/strings/en.ts +++ b/gui/src/strings/en.ts @@ -242,6 +242,7 @@ export const I18N_STRINGS_EN = { 'inpaint-sdxl': 'SDXL Inpaint', 'lpw': 'Long Prompt Weighting', 'panorama': 'Panorama', + 'panorama-sdxl': 'SDXL Panorama', 'pix2pix': 'Instruct Pix2Pix', 'txt2img': 'Txt2Img', 'txt2img-sdxl': 'SDXL Txt2Img', diff --git a/gui/src/strings/es.ts b/gui/src/strings/es.ts index 8bd6d792..e2b572e0 100644 --- 
a/gui/src/strings/es.ts
+++ b/gui/src/strings/es.ts
@@ -188,6 +188,7 @@ export const I18N_STRINGS_ES = {
     'inpaint-sdxl': '',
     'lpw': '',
     'panorama': '',
+    'panorama-sdxl': '',
     'pix2pix': '',
     'txt2img': '',
     'txt2img-sdxl': '',
diff --git a/gui/src/strings/fr.ts b/gui/src/strings/fr.ts
index 38afb2e1..589cf85f 100644
--- a/gui/src/strings/fr.ts
+++ b/gui/src/strings/fr.ts
@@ -188,6 +188,7 @@ export const I18N_STRINGS_FR = {
     'inpaint-sdxl': '',
     'lpw': '',
     'panorama': '',
+    'panorama-sdxl': '',
     'pix2pix': '',
     'txt2img': '',
     'txt2img-sdxl': '',

From 78f834a67852a92f6b7967bcda8d5da8a94f9cb1 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sun, 10 Sep 2023 11:26:55 -0500
Subject: [PATCH 003/240] apply lint

---
 api/onnx_web/diffusers/patches/vae.py  |   6 +-
 .../diffusers/pipelines/panorama_xl.py | 211 +++++++++++++-----
 api/onnx_web/params.py                 |   9 +-
 3 files changed, 168 insertions(+), 58 deletions(-)

diff --git a/api/onnx_web/diffusers/patches/vae.py b/api/onnx_web/diffusers/patches/vae.py
index 48e9358e..1b46e505 100644
--- a/api/onnx_web/diffusers/patches/vae.py
+++ b/api/onnx_web/diffusers/patches/vae.py
@@ -39,7 +39,11 @@ class VAEWrapper(object):
         self.tile_overlap_factor = overlap

     def __call__(self, latent_sample=None, sample=None, **kwargs):
-        model = self.wrapped.model if hasattr(self.wrapped, "model") else self.wrapped.session
+        model = (
+            self.wrapped.model
+            if hasattr(self.wrapped, "model")
+            else self.wrapped.session
+        )

         # set timestep dtype to input type
         sample_dtype = next(
diff --git a/api/onnx_web/diffusers/pipelines/panorama_xl.py b/api/onnx_web/diffusers/pipelines/panorama_xl.py
index 5092cae0..fed65722 100644
--- a/api/onnx_web/diffusers/pipelines/panorama_xl.py
+++ b/api/onnx_web/diffusers/pipelines/panorama_xl.py
@@ -1,13 +1,16 @@
-from optimum.onnxruntime.modeling_diffusion import ORTStableDiffusionXLPipelineBase
-from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin
-from optimum.pipelines.diffusers.pipeline_utils import preprocess, rescale_noise_cfg
-from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+import inspect
 import logging
-from typing import Any, Optional, List, Union, Tuple, Callable, Dict
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from optimum.onnxruntime.modeling_diffusion import ORTStableDiffusionXLPipelineBase
+from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import (
+    StableDiffusionXLImg2ImgPipelineMixin,
+)
+from optimum.pipelines.diffusers.pipeline_utils import preprocess, rescale_noise_cfg

 logger = logging.getLogger(__name__)

@@ -18,23 +21,21 @@ DEFAULT_STRIDE = 16

 class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMixin):
     def __init__(
-        self,
-        *args,
-        window: int = DEFAULT_WINDOW,
-        stride: int = DEFAULT_STRIDE,
-        **kwargs,
+        self,
+        *args,
+        window: int = DEFAULT_WINDOW,
+        stride: int = DEFAULT_STRIDE,
+        **kwargs,
     ):
         super().__init__(*args, **kwargs)

         self.window = window
         self.stride = stride

-
     def set_window_size(self, window: int, stride: int):
         self.window = window
         self.stride = stride

-
     def get_views(self, panorama_height, panorama_width, window_size, stride):
         # Here, we define the mappings F_i (see Eq.
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) panorama_height /= 8 @@ -60,21 +61,32 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix return views - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents_img2img(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + def prepare_latents_img2img( + self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None + ): batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: init_latents = image else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + init_latents = self.vae_encoder(sample=image)[ + 0 + ] * self.vae_decoder.config.get("scaling_factor", 0.18215) - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + if ( + batch_size > init_latents.shape[0] + and batch_size % init_latents.shape[0] == 0 + ): # expand init_latents for batch_size additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + init_latents = np.concatenate( + [init_latents] * additional_image_per_prompt, axis=0 + ) + elif ( + batch_size > init_latents.shape[0] + and batch_size % init_latents.shape[0] != 0 + ): raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) @@ -84,14 +96,29 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix # add noise to latents using the timesteps noise = generator.randn(*init_latents.shape).astype(dtype) init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep) + torch.from_numpy(init_latents), + torch.from_numpy(noise), + torch.from_numpy(timestep), ) return init_latents.numpy() - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents_text2img(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + def prepare_latents_text2img( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -101,14 +128,15 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix if latents is None: latents = generator.randn(*shape).astype(dtype) elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + raise ValueError( + f"Unexpected latents shape, got {latents.shape}, expected {shape}" + ) # scale the initial noise by the standard deviation required by the scheduler latents = latents * np.float64(self.scheduler.init_noise_sigma) return latents - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def 
prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -118,13 +146,14 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix extra_step_kwargs = {} - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) if accepts_eta: extra_step_kwargs["eta"] = eta return extra_step_kwargs - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ def text2img( self, @@ -294,10 +323,16 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) + prompt_embeds = np.concatenate( + (negative_prompt_embeds, prompt_embeds), axis=0 + ) + add_text_embeds = np.concatenate( + (negative_pooled_prompt_embeds, add_text_embeds), axis=0 + ) add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) + add_time_ids = np.repeat( + add_time_ids, batch_size * num_images_per_prompt, axis=0 + ) # Adapted from diffusers to extend it for other runtimes than ORT timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) @@ -318,8 +353,14 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = ( + np.concatenate([latents_for_view] * 2) + if do_classifier_free_guidance + else latents_for_view + ) + latent_model_input = self.scheduler.scale_model_input( + torch.from_numpy(latent_model_input), t + ) latent_model_input = latent_model_input.cpu().numpy() # predict the noise residual @@ -336,14 +377,23 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) if guidance_rescale > 0.0: # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + noise_pred = rescale_noise_cfg( + noise_pred, + noise_pred_text, + guidance_rescale=guidance_rescale, + ) # compute the previous noisy sample x_t -> x_t-1 scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents_for_view), **extra_step_kwargs + torch.from_numpy(noise_pred), + t, + torch.from_numpy(latents_for_view), + **extra_step_kwargs, ) latents_view_denoised = scheduler_output.prev_sample.numpy() @@ -354,7 +404,9 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix latents = np.where(count > 0, value / count, value) # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -364,7 +416,10 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215) # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + [ + self.vae_decoder(latent_sample=latents[i : i + 1])[0] + for i in range(latents.shape[0]) + ] ) image = self.watermark.apply_watermark(image) @@ -379,7 +434,6 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix return StableDiffusionXLPipelineOutput(images=image) - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ def img2img( self, @@ -481,7 +535,14 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix (nsfw) content, according to the `safety_checker`. """ # 0. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) # 1. Define call parameters if isinstance(prompt, str): @@ -522,8 +583,12 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, strength + ) + latent_timestep = np.repeat( + timesteps[:1], batch_size * num_images_per_prompt, axis=0 + ) timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) latents_dtype = prompt_embeds.dtype @@ -531,12 +596,19 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix # 5. Prepare latent variables latents = self.prepare_latents_img2img( - image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator + image, + latent_timestep, + batch_size, + num_images_per_prompt, + latents_dtype, + generator, ) # 6. 
Prepare extra step kwargs extra_step_kwargs = {} - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) if accepts_eta: extra_step_kwargs["eta"] = eta @@ -558,10 +630,16 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix ) if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) + prompt_embeds = np.concatenate( + (negative_prompt_embeds, prompt_embeds), axis=0 + ) + add_text_embeds = np.concatenate( + (negative_pooled_prompt_embeds, add_text_embeds), axis=0 + ) add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) + add_time_ids = np.repeat( + add_time_ids, batch_size * num_images_per_prompt, axis=0 + ) # 8. Panorama additions views = self.get_views(height, width, self.window, self.stride) @@ -579,8 +657,14 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = ( + np.concatenate([latents_for_view] * 2) + if do_classifier_free_guidance + else latents_for_view + ) + latent_model_input = self.scheduler.scale_model_input( + torch.from_numpy(latent_model_input), t + ) latent_model_input = latent_model_input.cpu().numpy() # predict the noise residual @@ -597,14 +681,23 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) if guidance_rescale > 0.0: # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + noise_pred = rescale_noise_cfg( + noise_pred, + noise_pred_text, + guidance_rescale=guidance_rescale, + ) # compute the previous noisy sample x_t -> x_t-1 scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents_for_view), **extra_step_kwargs + torch.from_numpy(noise_pred), + t, + torch.from_numpy(latents_for_view), + **extra_step_kwargs, ) latents_view_denoised = scheduler_output.prev_sample.numpy() @@ -615,7 +708,9 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix latents = np.where(count > 0, value / count, value) # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -625,7 +720,10 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215) # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + [ + self.vae_decoder(latent_sample=latents[i : i + 1])[0] + for i in range(latents.shape[0]) + ] ) image = self.watermark.apply_watermark(image) @@ -640,7 +738,6 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix return StableDiffusionXLPipelineOutput(images=image) - def __call__( self, *args, @@ -659,6 +756,8 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix return self.text2img(*args, **kwargs) -class ORTStableDiffusionXLPanoramaPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPanoramaPipelineMixin): +class ORTStableDiffusionXLPanoramaPipeline( + ORTStableDiffusionXLPipelineBase, StableDiffusionXLPanoramaPipelineMixin +): def __call__(self, *args, **kwargs): return StableDiffusionXLPanoramaPipelineMixin.__call__(self, *args, **kwargs) diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py index 0ceb99e6..4b03f758 100644 --- a/api/onnx_web/params.py +++ b/api/onnx_web/params.py @@ -259,7 +259,14 @@ class ImageParams: # otherwise, check for additional allowed pipelines if group == "img2img": - if pipeline in ["controlnet", "img2img-sdxl", "lpw", "panorama", "panorama-sdxl", "pix2pix"]: + if pipeline in [ + "controlnet", + "img2img-sdxl", + "lpw", + "panorama", + "panorama-sdxl", + "pix2pix", + ]: return pipeline elif pipeline == "txt2img-sdxl": return "img2img-sdxl" From fe68670844d25226a634c8ead689b47fa520722b Mon Sep 17 00:00:00 2001 From: Sean Sube Date: Sun, 10 Sep 2023 11:52:46 -0500 Subject: [PATCH 004/240] feat(api): add conversion for SDXL models --- api/onnx_web/convert/__main__.py | 28 +++++--- .../diffusion/{diffusers.py => diffusion.py} | 0 .../convert/diffusion/diffusion_xl.py | 72 +++++++++++++++++++ 3 files changed, 92 insertions(+), 8 deletions(-) rename api/onnx_web/convert/diffusion/{diffusers.py => diffusion.py} (100%) create mode 100644 api/onnx_web/convert/diffusion/diffusion_xl.py diff --git a/api/onnx_web/convert/__main__.py b/api/onnx_web/convert/__main__.py index 36c97429..9a44fd8d 100644 --- a/api/onnx_web/convert/__main__.py +++ 
b/api/onnx_web/convert/__main__.py @@ -11,11 +11,13 @@ from jsonschema import ValidationError, validate from onnx import load_model, save_model from transformers import CLIPTokenizer +from .diffusion.diffusion_xl import convert_diffusion_diffusers_xl + from ..constants import ONNX_MODEL, ONNX_WEIGHTS from ..utils import load_config from .correction.gfpgan import convert_correction_gfpgan from .diffusion.control import convert_diffusion_control -from .diffusion.diffusers import convert_diffusion_diffusers +from .diffusion.diffusion import convert_diffusion_diffusers from .diffusion.lora import blend_loras from .diffusion.textual_inversion import blend_textual_inversions from .upscaling.bsrgan import convert_upscaling_bsrgan @@ -357,13 +359,23 @@ def convert_models(conversion: ConversionContext, args, models: Models): conversion, name, model["source"], format=model_format ) - converted, dest = convert_diffusion_diffusers( - conversion, - model, - source, - model_format, - hf=hf, - ) + pipeline = model.get("pipeline", "txt2img") + if pipeline.endswith("-sdxl"): + converted, dest = convert_diffusion_diffusers_xl( + conversion, + model, + source, + model_format, + hf=hf, + ) + else: + converted, dest = convert_diffusion_diffusers( + conversion, + model, + source, + model_format, + hf=hf, + ) # make sure blending only happens once, not every run if converted: diff --git a/api/onnx_web/convert/diffusion/diffusers.py b/api/onnx_web/convert/diffusion/diffusion.py similarity index 100% rename from api/onnx_web/convert/diffusion/diffusers.py rename to api/onnx_web/convert/diffusion/diffusion.py diff --git a/api/onnx_web/convert/diffusion/diffusion_xl.py b/api/onnx_web/convert/diffusion/diffusion_xl.py new file mode 100644 index 00000000..58d57143 --- /dev/null +++ b/api/onnx_web/convert/diffusion/diffusion_xl.py @@ -0,0 +1,72 @@ +from logging import getLogger +from os import path +from typing import Dict, Optional, Tuple + +import torch +from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipeline +from optimum.exporters.onnx import main_export + +from ..utils import ConversionContext + +logger = getLogger(__name__) + + +@torch.no_grad() +def convert_diffusion_diffusers_xl( + conversion: ConversionContext, + model: Dict, + source: str, + format: Optional[str], + hf: bool = False, +) -> Tuple[bool, str]: + """ + From https://github.com/huggingface/diffusers/blob/main/scripts/convert_stable_diffusion_checkpoint_to_onnx.py + """ + name = model.get("name") + # TODO: support alternate VAE + + device = conversion.training_device + dtype = conversion.torch_dtype() + logger.debug("using Torch dtype %s for pipeline", dtype) + + dest_path = path.join(conversion.model_path, name) + model_index = path.join(dest_path, "model_index.json") + model_hash = path.join(dest_path, "hash.txt") + + # diffusers go into a directory rather than .onnx file + logger.info( + "converting Stable Diffusion XL model %s: %s -> %s/", name, source, dest_path + ) + + if "hash" in model and not path.exists(model_hash): + logger.info("ONNX model does not have hash file, adding one") + with open(model_hash, "w") as f: + f.write(model["hash"]) + + if path.exists(dest_path) and path.exists(model_index): + logger.info("ONNX model already exists, skipping conversion") + return (False, dest_path) + + # safetensors -> diffusers directory with torch models + temp_path = path.join(conversion.cache_path, f"{name}-torch") + + if format == "safetensors": + pipeline = StableDiffusionXLPipeline.from_single_file(source, 
use_safetensors=True) + else: + pipeline = StableDiffusionXLPipeline.from_pretrained(source) + + pipeline.save_pretrained(temp_path) + + # directory -> onnx using optimum exporters + main_export( + temp_path, + output=dest_path, + task="stable-diffusion-xl", + device=device, + fp16=conversion.half, + framework="pt", + ) + + # TODO: optimize UNet to fp16 + + return False, dest_path \ No newline at end of file From f2d0c2f7657d6ac5af955a1cedd5de05540c1f03 Mon Sep 17 00:00:00 2001 From: Sean Sube Date: Sun, 10 Sep 2023 11:53:36 -0500 Subject: [PATCH 005/240] apply lint --- api/onnx_web/convert/__main__.py | 3 +-- api/onnx_web/convert/diffusion/diffusion_xl.py | 10 +++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/api/onnx_web/convert/__main__.py b/api/onnx_web/convert/__main__.py index 9a44fd8d..a969ca4c 100644 --- a/api/onnx_web/convert/__main__.py +++ b/api/onnx_web/convert/__main__.py @@ -11,13 +11,12 @@ from jsonschema import ValidationError, validate from onnx import load_model, save_model from transformers import CLIPTokenizer -from .diffusion.diffusion_xl import convert_diffusion_diffusers_xl - from ..constants import ONNX_MODEL, ONNX_WEIGHTS from ..utils import load_config from .correction.gfpgan import convert_correction_gfpgan from .diffusion.control import convert_diffusion_control from .diffusion.diffusion import convert_diffusion_diffusers +from .diffusion.diffusion_xl import convert_diffusion_diffusers_xl from .diffusion.lora import blend_loras from .diffusion.textual_inversion import blend_textual_inversions from .upscaling.bsrgan import convert_upscaling_bsrgan diff --git a/api/onnx_web/convert/diffusion/diffusion_xl.py b/api/onnx_web/convert/diffusion/diffusion_xl.py index 58d57143..a92c5856 100644 --- a/api/onnx_web/convert/diffusion/diffusion_xl.py +++ b/api/onnx_web/convert/diffusion/diffusion_xl.py @@ -3,8 +3,10 @@ from os import path from typing import Dict, Optional, Tuple import torch -from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipeline from optimum.exporters.onnx import main_export +from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl import ( + StableDiffusionXLPipeline, +) from ..utils import ConversionContext @@ -51,7 +53,9 @@ def convert_diffusion_diffusers_xl( temp_path = path.join(conversion.cache_path, f"{name}-torch") if format == "safetensors": - pipeline = StableDiffusionXLPipeline.from_single_file(source, use_safetensors=True) + pipeline = StableDiffusionXLPipeline.from_single_file( + source, use_safetensors=True + ) else: pipeline = StableDiffusionXLPipeline.from_pretrained(source) @@ -69,4 +73,4 @@ def convert_diffusion_diffusers_xl( # TODO: optimize UNet to fp16 - return False, dest_path \ No newline at end of file + return False, dest_path From 956a260db6b6a6c91ffe7bcdf9ba8106e3f2faf0 Mon Sep 17 00:00:00 2001 From: Sean Sube Date: Sun, 10 Sep 2023 12:15:39 -0500 Subject: [PATCH 006/240] fix import --- api/onnx_web/convert/diffusion/diffusion_xl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/api/onnx_web/convert/diffusion/diffusion_xl.py b/api/onnx_web/convert/diffusion/diffusion_xl.py index a92c5856..a7dcf104 100644 --- a/api/onnx_web/convert/diffusion/diffusion_xl.py +++ b/api/onnx_web/convert/diffusion/diffusion_xl.py @@ -3,10 +3,8 @@ from os import path from typing import Dict, Optional, Tuple import torch +from diffusers import StableDiffusionXLPipeline from optimum.exporters.onnx import main_export -from 
optimum.pipelines.diffusers.pipeline_stable_diffusion_xl import (
-    StableDiffusionXLPipeline,
-)
 
 from ..utils import ConversionContext
 

From 1a732d54b6168fa9949ec3d19faea10b197fb508 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sun, 10 Sep 2023 16:35:16 -0500
Subject: [PATCH 007/240] add endpoint for multiple image generation

---
 api/onnx_web/output.py        |   3 +-
 api/onnx_web/server/api.py    |  58 ++++++++++++-
 api/onnx_web/server/params.py | 153 +++++++++++++++++++++++++++++++++-
 3 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/api/onnx_web/output.py b/api/onnx_web/output.py
index 17f29744..d64f79a0 100644
--- a/api/onnx_web/output.py
+++ b/api/onnx_web/output.py
@@ -158,6 +158,7 @@ def make_output_name(
     size: Size,
     extras: Optional[List[Optional[Param]]] = None,
     count: Optional[int] = None,
+    offset: int = 0,
 ) -> List[str]:
     count = count or params.batch
     now = int(time())
@@ -183,7 +184,7 @@ def make_output_name(
 
     return [
         f"{mode}_{params.seed}_{sha.hexdigest()}_{now}_{i}.{server.image_format}"
-        for i in range(count)
+        for i in range(offset, count + offset)
     ]
 
 
diff --git a/api/onnx_web/server/api.py b/api/onnx_web/server/api.py
index 7035f680..8e046440 100644
--- a/api/onnx_web/server/api.py
+++ b/api/onnx_web/server/api.py
@@ -51,6 +51,7 @@ from .load import (
 from .params import (
     border_from_request,
     highres_from_request,
+    pipeline_from_json,
     pipeline_from_request,
     upscale_from_request,
 )
@@ -221,7 +222,7 @@ def txt2img(server: ServerContext, pool: DevicePoolExecutor):
 
     replace_wildcards(params, get_wildcard_data())
 
-    output = make_output_name(server, "txt2img", params, size)
+    output = make_output_name(server, "txt2img", params, size, count=params.batch)
     job_name = output[0]
 
     pool.submit(
@@ -514,6 +515,61 @@ def txt2txt(server: ServerContext, pool: DevicePoolExecutor):
     return jsonify(json_params(output, params, size))
 
 
+def generate(server: ServerContext, pool: DevicePoolExecutor):
+    if not request.is_json:
+        return error_reply("generate endpoint requires JSON parameters")
+
+    # TODO: should this accept YAML as well?
+    data = request.get_json()
+    schema = load_config("./schemas/generate.yaml")
+
+    logger.debug("validating generate request: %s against %s", data, schema)
+    validate(data, schema)
+
+    jobs = []
+
+    if "txt2img" in data:
+        for job in data.get("txt2img"):
+            device, params, size = pipeline_from_json(server, job, "txt2img")
+            jobs.append((
+                f"generate-txt2img-{len(jobs)}",
+                run_txt2img_pipeline,
+                server,
+                params,
+                size,
+                make_output_name(server, "txt2img", params, size, offset=len(jobs)),
+                None,
+                None,
+                device,
+            ))
+
+    if "img2img" in data:
+        for job in data.get("img2img"):
+            device, params, size = pipeline_from_json(server, job, "img2img")
+            jobs.append((
+                f"generate-img2img-{len(jobs)}",
+                run_img2img_pipeline,
+                server,
+                params,
+                size,
+                make_output_name(server, "img2img", params, size, offset=len(jobs)),
+                None,
+                None,
+                device,
+            ))
+
+    for job in jobs:
+        pool.submit(*job)
+
+    # TODO: collect results
+    # This is the hard part: once all of the jobs are done, the last job or some dedicated
+    # job needs to collect the previous outputs and put them on a grid. Jobs write their own
+    # output to disk and do not return it, so that job may need to read the images back using
+    # the output names assigned to each job. Knowing when the jobs are done is the first problem.
+
+    # TODO: assemble grid
+
+
 def cancel(server: ServerContext, pool: DevicePoolExecutor):
     output_file = request.args.get("output", None)
     if output_file is None:
diff --git a/api/onnx_web/server/params.py b/api/onnx_web/server/params.py
index 2598e819..43c7a511 100644
--- a/api/onnx_web/server/params.py
+++ b/api/onnx_web/server/params.py
@@ -1,5 +1,5 @@
 from logging import getLogger
-from typing import Tuple
+from typing import Any, Dict, Tuple
 
 import numpy as np
 from flask import request
@@ -34,6 +34,157 @@ from .utils import get_model_path
 logger = getLogger(__name__)
 
 
+def pipeline_from_json(
+    server: ServerContext,
+    data: Dict[str, Any],
+    default_pipeline: str = "txt2img",
+) -> Tuple[DeviceParams, ImageParams, Size]:
+    device = None
+    device_name = data.get("platform")
+
+    if device_name is not None and device_name != "any":
+        for platform in get_available_platforms():
+            if platform.device == device_name:
+                device = platform
+
+    # diffusion model
+    model = get_not_empty(data, "model", get_config_value("model"))
+    model_path = get_model_path(server, model)
+
+    # pipeline stuff
+    pipeline = get_from_list(
+        data, "pipeline", get_available_pipelines(), default_pipeline
+    )
+    scheduler = get_from_list(data, "scheduler", get_pipeline_schedulers())
+
+    if scheduler is None:
+        scheduler = get_config_value("scheduler")
+
+    # prompt does not come from config
+    prompt = data.get("prompt", "")
+    negative_prompt = data.get("negativePrompt", None)
+
+    if negative_prompt is not None and negative_prompt.strip() == "":
+        negative_prompt = None
+
+    # image params
+    batch = get_and_clamp_int(
+        data,
+        "batch",
+        get_config_value("batch"),
+        get_config_value("batch", "max"),
+        get_config_value("batch", "min"),
+    )
+    cfg = get_and_clamp_float(
+        data,
+        "cfg",
+        get_config_value("cfg"),
+        get_config_value("cfg", "max"),
+        get_config_value("cfg", "min"),
+    )
+    eta = get_and_clamp_float(
+        data,
+        "eta",
+        get_config_value("eta"),
+        get_config_value("eta", "max"),
+        get_config_value("eta", "min"),
+    )
+    loopback = get_and_clamp_int(
+        data,
+        "loopback",
+        get_config_value("loopback"),
+        get_config_value("loopback", "max"),
+        get_config_value("loopback", "min"),
+    )
+    steps = get_and_clamp_int(
+        data,
+        "steps",
+        get_config_value("steps"),
+        get_config_value("steps", "max"),
+        get_config_value("steps", "min"),
+    )
+    height = get_and_clamp_int(
+        data,
+        "height",
+        get_config_value("height"),
+        get_config_value("height", "max"),
+        get_config_value("height", "min"),
+    )
+    width = get_and_clamp_int(
+        data,
+        "width",
+        get_config_value("width"),
+        get_config_value("width", "max"),
+        get_config_value("width", "min"),
+    )
+    tiled_vae = get_boolean(data, "tiledVAE", get_config_value("tiledVAE"))
+    tiles = get_and_clamp_int(
+        data,
+        "tiles",
+        get_config_value("tiles"),
+        get_config_value("tiles", "max"),
+        get_config_value("tiles", "min"),
+    )
+    overlap = get_and_clamp_float(
+        data,
+        "overlap",
+        get_config_value("overlap"),
+        get_config_value("overlap", "max"),
+        get_config_value("overlap", "min"),
+    )
+    stride = get_and_clamp_int(
+        data,
+        "stride",
+        get_config_value("stride"),
+        get_config_value("stride", "max"),
+        get_config_value("stride", "min"),
+    )
+
+    if stride > tiles:
+        logger.info("limiting stride to tile size, %s > %s", stride, tiles)
+        stride = tiles
+
+    seed = int(data.get("seed", -1))
+    if seed == -1:
+        # this one can safely use np.random because it produces a single value
+        seed = np.random.randint(np.iinfo(np.int32).max)
+
+    logger.debug(
+        "parsed parameters for %s steps of %s using %s in %s on %s, %sx%s, %s, %s - %s",
+        steps,
+        scheduler,
+        model_path,
+        pipeline,
+        device or "any device",
+        width,
+        height,
+        cfg,
+        seed,
+        prompt,
+    )
+
+    params = ImageParams(
+        model_path,
+        pipeline,
+        scheduler,
+        prompt,
+        cfg,
+        steps,
+        seed,
+        eta=eta,
+        negative_prompt=negative_prompt,
+        batch=batch,
+        # TODO: control=control,
+        loopback=loopback,
+        tiled_vae=tiled_vae,
+        tiles=tiles,
+        overlap=overlap,
+        stride=stride,
+    )
+    size = Size(width, height)
+    return (device, params, size)
+
+
 def pipeline_from_request(
     server: ServerContext,
     default_pipeline: str = "txt2img",
From 1fb965633ed339bafb3097b21a6cb485af29f41b Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sun, 10 Sep 2023 20:59:13 -0500
Subject: [PATCH 008/240] read chain pipeline from JSON, remove new endpoint

---
 api/onnx_web/chain/__init__.py       |   2 +
 api/onnx_web/chain/blend_grid.py     |  47 +++++++++++
 api/onnx_web/chain/source_noise.py   |   8 +-
 api/onnx_web/chain/source_s3.py      |   9 ++-
 api/onnx_web/chain/source_txt2img.py |  14 ++--
 api/onnx_web/chain/source_url.py     |   6 +-
 api/onnx_web/server/api.py           |  76 +++--------------
 api/schemas/generate.yaml            | 119 +++++++++++++++++++++++++++
 8 files changed, 204 insertions(+), 77 deletions(-)
 create mode 100644 api/onnx_web/chain/blend_grid.py
 create mode 100644 api/schemas/generate.yaml

diff --git a/api/onnx_web/chain/__init__.py b/api/onnx_web/chain/__init__.py
index e0e23a30..df2ac80e 100644
--- a/api/onnx_web/chain/__init__.py
+++ b/api/onnx_web/chain/__init__.py
@@ -1,5 +1,6 @@
 from .base import ChainPipeline, PipelineStage, StageParams
 from .blend_img2img import BlendImg2ImgStage
+from .blend_grid import BlendGridStage
 from .blend_linear import BlendLinearStage
 from .blend_mask import BlendMaskStage
 from .correct_codeformer import CorrectCodeformerStage
@@ -23,6 +24,7 @@ from .upscale_swinir import UpscaleSwinIRStage
 CHAIN_STAGES = {
     "blend-img2img": BlendImg2ImgStage,
     "blend-inpaint": UpscaleOutpaintStage,
+    "blend-grid": BlendGridStage,
     "blend-linear": BlendLinearStage,
     "blend-mask": BlendMaskStage,
     "correct-codeformer": CorrectCodeformerStage,
diff --git a/api/onnx_web/chain/blend_grid.py b/api/onnx_web/chain/blend_grid.py
new file mode 100644
index 00000000..51472a42
--- /dev/null
+++ b/api/onnx_web/chain/blend_grid.py
@@ -0,0 +1,47 @@
+from logging import getLogger
+from typing import List, Optional
+
+from PIL import Image
+
+from ..params import ImageParams, StageParams
+from ..server import ServerContext
+from ..worker import ProgressCallback, WorkerContext
+from .stage import BaseStage
+
+logger = getLogger(__name__)
+
+
+class BlendGridStage(BaseStage):
+    def run(
+        self,
+        _worker: WorkerContext,
+        _server: ServerContext,
+        _stage: StageParams,
+        _params: ImageParams,
+        sources: List[Image.Image],
+        *,
+        height: int,
+        width: int,
+        rows: Optional[List[str]] = None,
+        columns: Optional[List[str]] = None,
+        title: Optional[str] = None,
+        order: Optional[List[int]] = None,
+        stage_source: Optional[Image.Image] = None,
+        _callback: Optional[ProgressCallback] = None,
+        **kwargs,
+    ) -> List[Image.Image]:
+        logger.info("combining source images using grid layout")
+
+        size = sources[0].size
+
+        output = Image.new("RGB", (size[0] * width, size[1] * height))
+
+        # TODO: labels
+        for i in order or range(len(sources)):
+            x = i % width
+            y = i // width
+
+            output.paste(sources[i], (x * size[0], y * size[1]))
+
+        return [output]
+
diff --git a/api/onnx_web/chain/source_noise.py b/api/onnx_web/chain/source_noise.py
index 1ee68f42..5e6035d8 100644
--- a/api/onnx_web/chain/source_noise.py
+++ b/api/onnx_web/chain/source_noise.py
@@ -28,11 +28,13 @@ class SourceNoiseStage(BaseStage):
         logger.info("generating image from noise source")
 
         if len(sources) > 0:
-            logger.warning(
-                "source images were passed to a noise stage and will be discarded"
+            logger.info(
+                "source images were passed to a source stage, new images will be appended"
             )
 
-        outputs = []
+        outputs = list(sources)
+
+        # TODO: looping over sources and ignoring params does not make much sense for a source stage
         for source in sources:
             output = noise_source(source, (size.width, size.height), (0, 0))
 
diff --git a/api/onnx_web/chain/source_s3.py b/api/onnx_web/chain/source_s3.py
index 900270a3..55f8f228 100644
--- a/api/onnx_web/chain/source_s3.py
+++ b/api/onnx_web/chain/source_s3.py
@@ -20,7 +20,7 @@ class SourceS3Stage(BaseStage):
         _server: ServerContext,
         _stage: StageParams,
         _params: ImageParams,
-        _sources: List[Image.Image],
+        sources: List[Image.Image],
         *,
         source_keys: List[str],
         bucket: str,
@@ -31,7 +31,12 @@ class SourceS3Stage(BaseStage):
         session = Session(profile_name=profile_name)
         s3 = session.client("s3", endpoint_url=endpoint_url)
 
-        outputs = []
+        if len(sources) > 0:
+            logger.info(
+                "source images were passed to a source stage, new images will be appended"
+            )
+
+        outputs = list(sources)
         for key in source_keys:
             try:
                 logger.info("loading image from s3://%s/%s", bucket, key)
diff --git a/api/onnx_web/chain/source_txt2img.py b/api/onnx_web/chain/source_txt2img.py
index cc642d55..82d9aebe 100644
--- a/api/onnx_web/chain/source_txt2img.py
+++ b/api/onnx_web/chain/source_txt2img.py
@@ -1,5 +1,5 @@
 from logging import getLogger
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -30,7 +30,7 @@ class SourceTxt2ImgStage(BaseStage):
         server: ServerContext,
         stage: StageParams,
         params: ImageParams,
-        _source: Image.Image,
+        sources: List[Image.Image],
         *,
         dims: Tuple[int, int, int],
         size: Size,
@@ -50,9 +50,9 @@ class SourceTxt2ImgStage(BaseStage):
             "generating image using txt2img, %s steps: %s", params.steps, params.prompt
         )
 
-        if "stage_source" in kwargs:
-            logger.warning(
-                "a source image was passed to a txt2img stage, and will be discarded"
+        if len(sources):
+            logger.info(
+                "source images were passed to a source stage, new images will be appended"
             )
 
         prompt_pairs, loras, inversions, (prompt, negative_prompt) = parse_prompt(
@@ -123,4 +123,6 @@ class SourceTxt2ImgStage(BaseStage):
             callback=callback,
         )
 
-        return result.images
+        output = list(sources)
+        output.extend(result.images)
+        return output
diff --git a/api/onnx_web/chain/source_url.py b/api/onnx_web/chain/source_url.py
index 5fa54b67..54f86c54 100644
--- a/api/onnx_web/chain/source_url.py
+++ b/api/onnx_web/chain/source_url.py
@@ -29,11 +29,11 @@ class SourceURLStage(BaseStage):
         logger.info("loading image from URL source")
 
         if len(sources) > 0:
-            logger.warning(
-                "a source image was passed to a source stage, and will be discarded"
+            logger.info(
+                "source images were passed to a source stage, new images will be appended"
            )
 
-        outputs = []
+        outputs = list(sources)
         for url in source_urls:
             response = requests.get(url)
             output = Image.open(BytesIO(response.content))
diff --git a/api/onnx_web/server/api.py b/api/onnx_web/server/api.py
index 8e046440..b97ab36a 100644
--- a/api/onnx_web/server/api.py
+++ b/api/onnx_web/server/api.py
@@ -368,16 +368,21 @@ def upscale(server: ServerContext, pool: DevicePoolExecutor):
 
 
 def chain(server: ServerContext, pool: DevicePoolExecutor):
-    logger.debug(
-        "chain pipeline request: %s, %s", request.form.keys(), request.files.keys()
-    )
-    body = request.form.get("chain") or request.files.get("chain")
-    if body is None:
-        return error_reply("chain pipeline must have a body")
+    if request.is_json:
+        logger.debug("chain pipeline request with JSON body")
+        data = request.get_json()
+    else:
+        logger.debug(
+            "chain pipeline request: %s, %s", request.form.keys(), request.files.keys()
+        )
+
+        body = request.form.get("chain") or request.files.get("chain")
+        if body is None:
+            return error_reply("chain pipeline must have a body")
+
+        data = load_config_str(body)
 
-    data = load_config_str(body)
     schema = load_config("./schemas/chain.yaml")
-
     logger.debug("validating chain request: %s against %s", data, schema)
     validate(data, schema)
@@ -515,61 +520,6 @@ def txt2txt(server: ServerContext, pool: DevicePoolExecutor):
     return jsonify(json_params(output, params, size))
 
 
-def generate(server: ServerContext, pool: DevicePoolExecutor):
-    if not request.is_json:
-        return error_reply("generate endpoint requires JSON parameters")
-
-    # TODO: should this accept YAML as well?
-    data = request.get_json()
-    schema = load_config("./schemas/generate.yaml")
-
-    logger.debug("validating generate request: %s against %s", data, schema)
-    validate(data, schema)
-
-    jobs = []
-
-    if "txt2img" in data:
-        for job in data.get("txt2img"):
-            device, params, size = pipeline_from_json(server, job, "txt2img")
-            jobs.append((
-                f"generate-txt2img-{len(jobs)}",
-                run_txt2img_pipeline,
-                server,
-                params,
-                size,
-                make_output_name(server, "txt2img", params, size, offset=len(jobs)),
-                None,
-                None,
-                device,
-            ))
-
-    if "img2img" in data:
-        for job in data.get("img2img"):
-            device, params, size = pipeline_from_json(server, job, "img2img")
-            jobs.append((
-                f"generate-img2img-{len(jobs)}",
-                run_img2img_pipeline,
-                server,
-                params,
-                size,
-                make_output_name(server, "img2img", params, size, offset=len(jobs)),
-                None,
-                None,
-                device,
-            ))
-
-    for job in jobs:
-        pool.submit(*job)
-
-    # TODO: collect results
-    # This is the hard part: once all of the jobs are done, the last job or some dedicated
-    # job needs to collect the previous outputs and put them on a grid. Jobs write their own
-    # output to disk and do not return it, so that job may need to read the images back using
-    # the output names assigned to each job. Knowing when the jobs are done is the first problem.
-
-    # TODO: assemble grid
-
-
 def cancel(server: ServerContext, pool: DevicePoolExecutor):
     output_file = request.args.get("output", None)
     if output_file is None:
diff --git a/api/schemas/generate.yaml b/api/schemas/generate.yaml
new file mode 100644
index 00000000..8666468e
--- /dev/null
+++ b/api/schemas/generate.yaml
@@ -0,0 +1,119 @@
+$id: TODO
+$schema: https://json-schema.org/draft/2020-12/schema
+
+$defs:
+  grid:
+    type: object
+    additionalProperties: False
+    required: [width, height]
+    properties:
+      width:
+        type: number
+      height:
+        type: number
+      labels:
+        type: object
+        additionalProperties: False
+        properties:
+          title:
+            type: string
+          rows:
+            type: array
+            items:
+              type: string
+          columns:
+            type: array
+            items:
+              type: string
+          order:
+            type: array
+            items:
+              type: number
+
+  job_base:
+    type: object
+    additionalProperties: true
+    required: [
+      device,
+      model,
+      pipeline,
+      scheduler,
+      prompt,
+      cfg,
+      steps,
+      seed,
+    ]
+    properties:
+      batch:
+        type: number
+      device:
+        type: string
+      model:
+        type: string
+      control:
+        type: string
+      pipeline:
+        type: string
+      scheduler:
+        type: string
+      prompt:
+        type: string
+      negative_prompt:
+        type: string
+      cfg:
+        type: number
+      eta:
+        type: number
+      steps:
+        type: number
+      tiled_vae:
+        type: boolean
+      tiles:
+        type: number
+      overlap:
+        type: number
+      seed:
+        type: number
+      stride:
+        type: number
+
+  job_txt2img:
+    allOf:
+      - $ref: "#/$defs/job_base"
+      - type: object
+        additionalProperties: true
+        required: [
+          height,
+          width,
+        ]
+        properties:
+          width:
+            type: number
+          height:
+            type: number
+
+  job_img2img:
+    allOf:
+      - $ref: "#/$defs/job_base"
+      - type: object
+        additionalProperties: true
+        required: []
+        properties:
+          loopback:
+            type: number
+
+type: object
+additionalProperties: False
+properties:
+  txt2img:
+    type: array
+    items:
+      $ref: "#/$defs/job_txt2img"
+  img2img:
+    type: array
+    items:
+      $ref: "#/$defs/job_img2img"
+  grid:
+    type: array
+    items:
+      $ref: "#/$defs/grid"
From 9d4272eb09e27df1e6d806b5bb2c619e70d8979e Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sun, 10 Sep 2023 20:59:33 -0500
Subject: [PATCH 009/240] add basic variables to txt2img tab

---
 gui/src/client/api.ts                         |  11 ++
 gui/src/client/local.ts                       |   3 +
 gui/src/client/types.ts                       |  18 +++
 gui/src/client/utils.ts                       |  42 +++++++
 gui/src/components/card/ErrorCard.tsx         |  11 +-
 .../components/control/VariableControl.tsx    | 107 ++++++++++++++++++
 gui/src/components/tab/Txt2Img.tsx            |  25 +++-
 gui/src/state.ts                              |  31 ++++-
 8 files changed, 239 insertions(+), 9 deletions(-)
 create mode 100644 gui/src/client/utils.ts
 create mode 100644 gui/src/components/control/VariableControl.tsx

diff --git a/gui/src/client/api.ts b/gui/src/client/api.ts
index 6bed1e46..1271caab 100644
--- a/gui/src/client/api.ts
+++ b/gui/src/client/api.ts
@@ -7,6 +7,7 @@ import {
   ApiClient,
   BaseImgParams,
   BlendParams,
+  ChainPipeline,
   FilterResponse,
   HighresParams,
   ImageResponse,
@@ -430,6 +431,16 @@ export function makeClient(root: string, token: Maybe<string> = undefined, f = f
       }
     };
   },
+  async chain(chain: ChainPipeline): Promise<ImageResponse> {
+    const url = makeApiUrl(root, 'chain');
+    const body = JSON.stringify(chain);
+
+    // eslint-disable-next-line no-return-await
+    return await parseRequest(url, {
+      body,
+      method: 'POST',
+    });
+  },
   async ready(key: string): Promise<ReadyResponse> {
     const path = makeApiUrl(root, 'ready');
     path.searchParams.append('output', key);
diff --git a/gui/src/client/local.ts b/gui/src/client/local.ts
index 97f785a8..273e5168 100644
--- a/gui/src/client/local.ts
+++ b/gui/src/client/local.ts
@@ -39,6 +39,9 @@ export const LOCAL_CLIENT = {
   async outpaint(model, params, upscale) {
     throw new NoServerError();
   },
+  async chain(chain) {
+    throw new NoServerError();
+  },
   async noises() {
     throw new NoServerError();
   },
diff --git a/gui/src/client/types.ts b/gui/src/client/types.ts
index b9d22495..d1912d7b 100644
--- a/gui/src/client/types.ts
+++ b/gui/src/client/types.ts
@@ -162,6 +162,22 @@ export interface HighresParams {
   highresStrength: number;
 }
 
+export interface Txt2ImgStage {
+  name: string;
+  type: 'source-txt2img';
+  params: Txt2ImgParams;
+}
+
+export interface Img2ImgStage {
+  name: string;
+  type: 'blend-img2img';
+  params: Img2ImgParams;
+}
+
+export interface ChainPipeline {
+  stages: Array<Txt2ImgStage | Img2ImgStage>;
+}
+
 /**
  * Output image data within the response.
  */
@@ -354,6 +370,8 @@ export interface ApiClient {
    */
   blend(model: ModelParams, params: BlendParams, upscale?: UpscaleParams): Promise<ImageResponse>;
 
+  chain(chain: ChainPipeline): Promise<ImageResponse>;
+
   /**
    * Check whether job has finished and its output is ready.
   */
diff --git a/gui/src/client/utils.ts b/gui/src/client/utils.ts
new file mode 100644
index 00000000..d1d69141
--- /dev/null
+++ b/gui/src/client/utils.ts
@@ -0,0 +1,42 @@
+import { ChainPipeline, HighresParams, ModelParams, Txt2ImgParams, UpscaleParams } from './types.js';
+
+export interface PipelineVariable {
+  parameter: 'prompt' | 'cfg' | 'seed' | 'steps';
+  input: string;
+  values: Array<string>;
+}
+
+export interface PipelineGrid {
+  enabled: boolean;
+  columns: PipelineVariable;
+  rows: PipelineVariable;
+}
+
+// eslint-disable-next-line max-params
+export function buildPipelineForTxt2ImgGrid(grid: PipelineGrid, model: ModelParams, params: Txt2ImgParams, upscale?: UpscaleParams, highres?: HighresParams): ChainPipeline {
+  const pipeline: ChainPipeline = {
+    stages: [],
+  };
+
+  let i = 0;
+
+  for (const column of grid.columns.values) {
+    for (const row of grid.rows.values) {
+      pipeline.stages.push({
+        name: `cell-${i}`,
+        type: 'source-txt2img',
+        params: {
+          ...params,
+          [grid.columns.parameter]: column,
+          [grid.rows.parameter]: row,
+        },
+      });
+
+      i += 1;
+    }
+  }
+
+  // TODO: add final grid stage
+
+  return pipeline;
+}
diff --git a/gui/src/components/card/ErrorCard.tsx b/gui/src/components/card/ErrorCard.tsx
index fe683e67..9bb6e455 100644
--- a/gui/src/components/card/ErrorCard.tsx
+++ b/gui/src/components/card/ErrorCard.tsx
@@ -1,4 +1,4 @@
-import { mustExist } from '@apextoaster/js-utils';
+import { Maybe, doesExist, mustExist } from '@apextoaster/js-utils';
 import { Delete, Replay } from '@mui/icons-material';
 import { Alert, Box, Card, CardContent, IconButton, Tooltip } from '@mui/material';
 import { Stack } from '@mui/system';
@@ -15,7 +15,7 @@ import { ClientContext, ConfigContext, OnnxState, StateContext } from '../../sta
 export interface ErrorCardProps {
   image: ImageResponse;
   ready: ReadyResponse;
-  retry: RetryParams;
+  retry: Maybe<RetryParams>;
 }
 
 export function ErrorCard(props: ErrorCardProps) {
@@ -30,8 +30,11 @@ export function ErrorCard(props: ErrorCardProps) {
 
   async function retryImage() {
     removeHistory(image);
-    const { image: nextImage, retry: nextRetry } = await client.retry(retryParams);
-    pushHistory(nextImage, nextRetry);
+
+    if (doesExist(retryParams)) {
+      const { image: nextImage, retry: nextRetry } = await client.retry(retryParams);
+      pushHistory(nextImage, nextRetry);
+    }
   }
 
   const retry = useMutation(retryImage);
diff --git a/gui/src/components/control/VariableControl.tsx b/gui/src/components/control/VariableControl.tsx
new file mode 100644
index 00000000..cd159954
--- /dev/null
+++ b/gui/src/components/control/VariableControl.tsx
@@ -0,0 +1,107 @@
+import { doesExist, mustExist } from '@apextoaster/js-utils';
+import { Checkbox, FormControl, InputLabel, MenuItem, Select, Stack, TextField } from '@mui/material';
+import * as React from 'react';
+import { useContext } from 'react';
+import { useStore } from 'zustand';
+
+import { PipelineGrid } from '../../client/utils.js';
+import { OnnxState, StateContext } from '../../state.js';
+
+export interface VariableControlProps {
+  selectGrid: (state: OnnxState) => PipelineGrid;
+  setGrid: (grid: Partial<PipelineGrid>) => void;
+}
+
+export type VariableKey = 'prompt' | 'steps' | 'seed';
+
+export function VariableControl(props: VariableControlProps) {
+  const store = mustExist(useContext(StateContext));
+  const grid = useStore(store, props.selectGrid);
+
+  return <Stack direction='column' spacing={2}>
+    <Stack direction='row' spacing={2}>
+      <FormControl>
+        <InputLabel id='variable-grid'>Grid Mode</InputLabel>
+        <Checkbox checked={grid.enabled} onChange={() => props.setGrid({
+          enabled: grid.enabled === false,
+        })} />
+      </FormControl>
+    </Stack>
+    <Stack direction='row' spacing={2}>
+      <FormControl>
+        <InputLabel id='variable-columns'>Columns</InputLabel>
+        <TextField value={grid.columns.input} onChange={(event) => props.setGrid({
+          columns: {
+            parameter: grid.columns.parameter,
+            input: event.target.value,
+            values: rangeSplit(grid.columns.parameter, event.target.value),
+          },
+        })} />
+      </FormControl>
+    </Stack>
+    <Stack direction='row' spacing={2}>
+      <FormControl>
+        <InputLabel id='variable-rows'>Rows</InputLabel>
+        <TextField value={grid.rows.input} onChange={(event) => props.setGrid({
+          rows: {
+            parameter: grid.rows.parameter,
+            input: event.target.value,
+            values: rangeSplit(grid.rows.parameter, event.target.value),
+          },
+        })} />
+      </FormControl>
+    </Stack>
+  </Stack>;
+}
+
+export function rangeSplit(parameter: string, value: string): Array<string> {
+  // string values
+  if (parameter === 'prompt') {
+    return value.split('\n');
+  }
+
+  return value.split(',').map((it) => it.trim()).flatMap((it) => expandRanges(it));
+}
+
+export const EXPR_STRICT_NUMBER = /^[0-9]+$/;
+export const EXPR_NUMBER_RANGE = /^([0-9]+)-([0-9]+)$/;
+
+export function expandRanges(range: string): Array<string> {
+  if (EXPR_STRICT_NUMBER.test(range)) {
+    // entirely numeric, return without parsing
+    return [range];
+  }
+
+  if (EXPR_NUMBER_RANGE.test(range)) {
+    const match = EXPR_NUMBER_RANGE.exec(range);
+    if (doesExist(match)) {
+      const [_full, startStr, endStr] = Array.from(match);
+      const start = parseInt(startStr, 10);
+      const end = parseInt(endStr, 10);
+
+      return new Array(end - start).fill(0).map((_value, idx) => (idx + start).toFixed(0));
+    }
+  }
+
+  return [];
+}
diff --git a/gui/src/components/tab/Txt2Img.tsx b/gui/src/components/tab/Txt2Img.tsx
index 76e669ce..d5347926 100644
--- a/gui/src/components/tab/Txt2Img.tsx
+++ b/gui/src/components/tab/Txt2Img.tsx
@@ -15,15 +15,27 @@ import { ModelControl } from '../control/ModelControl.js';
 import { UpscaleControl } from '../control/UpscaleControl.js';
 import { NumericField } from '../input/NumericField.js';
 import { Profiles } from '../Profiles.js';
+import { VariableControl } from '../control/VariableControl.js';
+import { PipelineGrid, buildPipelineForTxt2ImgGrid } from '../../client/utils.js';
 
 export function Txt2Img() {
   const { params } = mustExist(useContext(ConfigContext));
 
   async function generateImage() {
     const state = store.getState();
-    const { image, retry } = await client.txt2img(model, selectParams(state), selectUpscale(state), selectHighres(state));
+    const grid = selectVariable(state);
+    const params2 = selectParams(state);
+    const upscale = selectUpscale(state);
+    const highres = selectHighres(state);
 
-    pushHistory(image, retry);
+    if (grid.enabled) {
+      const chain = buildPipelineForTxt2ImgGrid(grid, model, params2, upscale, highres);
+      const image = await client.chain(chain);
+      pushHistory(image);
+    } else {
+      const { image, retry } = await client.txt2img(model, params2, upscale, highres);
+      pushHistory(image, retry);
+    }
   }
 
   const client = 
mustExist(useContext(ClientContext)); @@ -33,7 +45,7 @@ export function Txt2Img() { }); const store = mustExist(useContext(StateContext)); - const { pushHistory, setHighres, setModel, setParams, setUpscale } = useStore(store, selectActions, shallow); + const { pushHistory, setHighres, setModel, setParams, setUpscale, setVariable } = useStore(store, selectActions, shallow); const { height, width } = useStore(store, selectReactParams, shallow); const model = useStore(store, selectModel); @@ -79,6 +91,7 @@ export function Txt2Img() { +