feat(api): enable optimizations for SD pipelines based on env vars (#155)
commit ab6462d095
parent ff57527274
@@ -5,6 +5,7 @@ import torch
 from diffusers import StableDiffusionUpscalePipeline
 from PIL import Image
 
+from ..diffusion.load import optimize_pipeline
 from ..diffusion.pipeline_onnx_stable_diffusion_upscale import (
     OnnxStableDiffusionUpscalePipeline,
 )
@@ -52,6 +53,8 @@ def load_stable_diffusion(
         if not server.show_progress:
             pipe.set_progress_bar_config(disable=True)
 
+        optimize_pipeline(server, pipe)
+
         server.cache.set("diffusion", cache_key, pipe)
         run_gc([device])
 
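With this change the Stable Diffusion upscale loader goes through the same optimize_pipeline helper as the main pipelines, so a single ONNX_WEB_OPTIMIZATIONS setting covers both. A minimal sketch of the equivalent direct diffusers calls for one flag (the model ID here is illustrative, not part of the commit):

    from diffusers import StableDiffusionUpscalePipeline

    pipe = StableDiffusionUpscalePipeline.from_pretrained(
        "stabilityai/stable-diffusion-x4-upscaler"  # illustrative model ID
    )
    # What optimize_pipeline does when "attention-slicing" is configured:
    pipe.enable_attention_slicing()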
@@ -17,6 +17,7 @@ from diffusers import (
     KDPM2DiscreteScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
+    StableDiffusionPipeline,
 )
 
 try:
@@ -87,6 +88,32 @@ def get_tile_latents(
     return full_latents[:, :, y:yt, x:xt]
 
 
+def optimize_pipeline(
+    server: ServerContext,
+    pipe: StableDiffusionPipeline,
+) -> None:
+    if "attention-slicing" in server.optimizations:
+        logger.debug("enabling attention slicing on SD pipeline")
+        pipe.enable_attention_slicing()
+
+    if "vae-slicing" in server.optimizations:
+        logger.debug("enabling VAE slicing on SD pipeline")
+        pipe.enable_vae_slicing()
+
+    if "sequential-cpu-offload" in server.optimizations:
+        logger.debug("enabling sequential CPU offload on SD pipeline")
+        pipe.enable_sequential_cpu_offload()
+    elif "model-cpu-offload" in server.optimizations:
+        # TODO: check for accelerate
+        logger.debug("enabling model CPU offload on SD pipeline")
+        pipe.enable_model_cpu_offload()
+
+    if "memory-efficient-attention" in server.optimizations:
+        # TODO: check for xformers
+        logger.debug("enabling memory efficient attention for SD pipeline")
+        pipe.enable_xformers_memory_efficient_attention()
+
+
 def load_pipeline(
     server: ServerContext,
     pipeline: DiffusionPipeline,
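The helper is a thin dispatch from flag names to diffusers' built-in memory optimizations. A minimal sketch of driving it directly, assuming the package root is onnx_web; the model ID and the stub server object are illustrative (the annotation is not enforced, so any object with an optimizations list works):

    from diffusers import StableDiffusionPipeline

    from onnx_web.diffusion.load import optimize_pipeline


    class StubServer:
        # Stand-in for ServerContext; only the attribute optimize_pipeline reads.
        optimizations = ["attention-slicing", "vae-slicing"]


    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5"  # illustrative model ID
    )
    optimize_pipeline(StubServer(), pipe)
    # Attention slicing and VAE slicing are now enabled; the CPU offload and
    # xformers branches are skipped because their flags are not in the list.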
@@ -151,6 +178,8 @@ def load_pipeline(
         if not server.show_progress:
             pipe.set_progress_bar_config(disable=True)
 
+        optimize_pipeline(server, pipe)
+
         if device is not None and hasattr(pipe, "to"):
             pipe = pipe.to(device.torch_str())
 
@@ -28,6 +28,7 @@ class ServerContext:
         cache: ModelCache = None,
         cache_path: str = None,
         show_progress: bool = True,
+        optimizations: List[str] = [],
     ) -> None:
         self.bundle_path = bundle_path
         self.model_path = model_path
@@ -42,6 +43,7 @@
         self.cache = cache or ModelCache(num_workers)
         self.cache_path = cache_path or path.join(model_path, ".cache")
         self.show_progress = show_progress
+        self.optimizations = optimizations
 
     @classmethod
     def from_environ(cls):
@@ -64,6 +66,7 @@
             image_format=environ.get("ONNX_WEB_IMAGE_FORMAT", "png"),
             cache=ModelCache(limit=cache_limit),
             show_progress=get_boolean(environ, "ONNX_WEB_SHOW_PROGRESS", True),
+            optimizations=environ.get("ONNX_WEB_OPTIMIZATIONS", "").split(","),
         )
 
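The environment parsing is a plain comma split, so multiple optimizations are passed as a comma-separated list with no spaces. A short sketch of the resulting values (the variable contents are illustrative):

    from os import environ

    environ["ONNX_WEB_OPTIMIZATIONS"] = "attention-slicing,model-cpu-offload"
    flags = environ.get("ONNX_WEB_OPTIMIZATIONS", "").split(",")
    assert flags == ["attention-slicing", "model-cpu-offload"]

    # When the variable is unset, "".split(",") yields [""], a single empty
    # string that matches none of the flag names, so no optimization is applied.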