diff --git a/api/onnx_web/diffusers/pipelines/panorama_xl.py b/api/onnx_web/diffusers/pipelines/panorama_xl.py index c5ea69df..e55a5870 100644 --- a/api/onnx_web/diffusers/pipelines/panorama_xl.py +++ b/api/onnx_web/diffusers/pipelines/panorama_xl.py @@ -450,29 +450,30 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix logger.debug("running region prompt: %s, %s, %s, %s, %s, %s", top, left, bottom, right, mult, prompt) # convert coordinates to latent space - h_start = top // 8 - h_end = bottom // 8 - w_start = left // 8 - w_end = right // 8 + h_start = left // 8 + h_end = right // 8 + w_start = top // 8 + w_end = bottom // 8 # get the latents corresponding to the current view coordinates - latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] + latents_for_region = latents[:, :, h_start:h_end, w_start:w_end] + logger.trace("region latent shape: %s", latents_for_region.shape) # expand the latents if we are doing classifier free guidance - latent_model_input = ( - np.concatenate([latents_for_view] * 2) + latent_region_input = ( + np.concatenate([latents_for_region] * 2) if do_classifier_free_guidance - else latents_for_view + else latents_for_region ) - latent_model_input = self.scheduler.scale_model_input( - torch.from_numpy(latent_model_input), t + latent_region_input = self.scheduler.scale_model_input( + torch.from_numpy(latent_region_input), t ) - latent_model_input = latent_model_input.cpu().numpy() + latent_region_input = latent_region_input.cpu().numpy() # predict the noise residual timestep = np.array([t], dtype=timestep_dtype) noise_pred = self.unet( - sample=latent_model_input, + sample=latent_region_input, timestep=timestep, encoder_hidden_states=region_embeds[i], text_embeds=add_region_embeds[i], @@ -498,12 +499,12 @@ class StableDiffusionXLPanoramaPipelineMixin(StableDiffusionXLImg2ImgPipelineMix scheduler_output = self.scheduler.step( torch.from_numpy(noise_pred), t, - torch.from_numpy(latents_for_view), + torch.from_numpy(latents_for_region), **extra_step_kwargs, ) - latents_view_denoised = scheduler_output.prev_sample.numpy() + latents_region_denoised = scheduler_output.prev_sample.numpy() - value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised * mult + value[:, :, h_start:h_end, w_start:w_end] += latents_region_denoised * mult count[:, :, h_start:h_end, w_start:w_end] += mult # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113