remove some duplicate code
This commit is contained in:
parent
3c17fb8d50
commit
7cfe619b4a
|
@ -48,118 +48,6 @@ class OnnxStableDiffusionHighresPipeline(OnnxStableDiffusionBasePipeline):
|
||||||
|
|
||||||
self.upscaler = upscaler
|
self.upscaler = upscaler
|
||||||
|
|
||||||
def _encode_prompt(
    self,
    prompt: Union[str, List[str]],
    num_images_per_prompt: Optional[int],
    do_classifier_free_guidance: bool,
    negative_prompt: Optional[str],
    prompt_embeds: Optional[np.ndarray] = None,
    negative_prompt_embeds: Optional[np.ndarray] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states and pooled output.

    Args:
        prompt (`str` or `List[str]`):
            prompt to be encoded. May be `None` when `prompt_embeds` is given.
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt. This
            implementation does not read the value; the embeddings are not
            repeated here.
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`):
            The prompt or prompts not to guide the image generation. Ignored
            when `do_classifier_free_guidance` is false or when
            `negative_prompt_embeds` is given.
        prompt_embeds (`np.ndarray`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text
            inputs, *e.g.* prompt weighting. If not provided, text embeddings
            will be generated from `prompt` input argument.
        negative_prompt_embeds (`np.ndarray`, *optional*):
            Pre-generated negative text embeddings. If not provided,
            negative_prompt_embeds will be generated from `negative_prompt`
            input argument.

    Returns:
        `tuple`: `(prompt_embeds, text_pooler_out)`. With classifier free
        guidance, the negative entries are concatenated in front of the
        positive ones. `text_pooler_out` is `None` whenever any of the
        embeddings were supplied by the caller, because the pooled output
        is only available for embeddings computed by the text encoder here.

    Raises:
        ValueError: if neither `prompt` nor `prompt_embeds` is usable, or if
            the `negative_prompt` list does not match the batch size.
        TypeError: if `prompt` and `negative_prompt` are both given but have
            different types.
    """
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    elif prompt_embeds is not None:
        batch_size = prompt_embeds.shape[0]
    else:
        # fail with a clear message instead of an AttributeError on None.shape
        raise ValueError(
            "Provide either `prompt` (a `str` or a list of `str`) or `prompt_embeds`."
        )

    # Pooled outputs only exist for embeddings computed below; keep the names
    # bound so that caller-supplied embeddings do not hit an unbound local.
    text_pooler_out = None
    negative_pooled_embeds = None

    if prompt_embeds is None:
        # get prompt text embeddings
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(
            prompt, padding="max_length", return_tensors="np"
        ).input_ids

        if not np.array_equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        prompt_embeds, text_pooler_out, *_ = self.text_encoder(
            input_ids=text_input_ids.astype(np.int32),
        )

    if not do_classifier_free_guidance:
        return prompt_embeds, text_pooler_out

    # get unconditional embeddings for classifier free guidance
    if negative_prompt_embeds is None:
        uncond_tokens: List[str]

        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            # only compare types when a prompt was actually passed; with
            # `prompt_embeds` alone, `prompt` is None and any negative prompt
            # would otherwise be rejected
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt] * batch_size
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # pad the negative prompt to the same sequence length as the prompt
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="np",
        )

        negative_prompt_embeds, negative_pooled_embeds, *_ = self.text_encoder(
            input_ids=uncond_input.input_ids.astype(np.int32),
        )

    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

    if text_pooler_out is not None and negative_pooled_embeds is not None:
        text_pooler_out = np.concatenate([negative_pooled_embeds, text_pooler_out])
    else:
        # a pooled batch covering only half of the embeddings would have the
        # wrong batch size, so report it as unavailable instead
        text_pooler_out = None

    return prompt_embeds, text_pooler_out
|
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def text2img(
|
def text2img(
|
||||||
self,
|
self,
|
||||||
|
@ -181,67 +69,6 @@ class OnnxStableDiffusionHighresPipeline(OnnxStableDiffusionBasePipeline):
|
||||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||||
callback_steps: int = 1,
|
callback_steps: int = 1,
|
||||||
):
|
):
|
||||||
r"""
|
|
||||||
Function invoked when calling the pipeline for generation.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
prompt (`str` or `List[str]`, *optional*):
|
|
||||||
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
|
|
||||||
instead.
|
|
||||||
image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`):
|
|
||||||
`Image`, or tensor representing an image batch which will be upscaled. *
|
|
||||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
|
||||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
|
||||||
expense of slower inference.
|
|
||||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
|
||||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
|
||||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
|
||||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
|
||||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
|
||||||
usually at the expense of lower image quality.
|
|
||||||
negative_prompt (`str` or `List[str]`, *optional*):
|
|
||||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
|
||||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
|
|
||||||
is less than `1`).
|
|
||||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
|
||||||
The number of images to generate per prompt.
|
|
||||||
eta (`float`, *optional*, defaults to 0.0):
|
|
||||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
|
||||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
|
||||||
generator (`np.random.RandomState`, *optional*):
|
|
||||||
One or a list of [numpy generator(s)](TODO) to make generation deterministic.
|
|
||||||
latents (`np.ndarray`, *optional*):
|
|
||||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
|
||||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
|
||||||
tensor will be generated by sampling using the supplied random `generator`.
|
|
||||||
prompt_embeds (`np.ndarray`, *optional*):
|
|
||||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
|
||||||
provided, text embeddings will be generated from `prompt` input argument.
|
|
||||||
negative_prompt_embeds (`np.ndarray`, *optional*):
|
|
||||||
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
|
||||||
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
|
||||||
argument.
|
|
||||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
|
||||||
The output format of the generated image. Choose between
|
|
||||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
|
||||||
return_dict (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
|
||||||
plain tuple.
|
|
||||||
callback (`Callable`, *optional*):
|
|
||||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
|
||||||
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
|
||||||
callback_steps (`int`, *optional*, defaults to 1):
|
|
||||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
|
||||||
called at every step.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
|
||||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
|
|
||||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
|
||||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
|
||||||
(nsfw) content, according to the `safety_checker`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# check inputs. Raise error if not correct
|
# check inputs. Raise error if not correct
|
||||||
self.check_inputs(
|
self.check_inputs(
|
||||||
prompt,
|
prompt,
|
||||||
|
|
Loading…
Reference in New Issue