chore(docs): explain model optimizations

2023-02-18 16:06:05 -06:00 · 2023-02-18 16:06:05 -06:00 · bfdb071c2d
parent e0a62ccbb5
commit bfdb071c2d
3 changed files with 42 additions and 8 deletions
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@ -92,28 +92,28 @@ def optimize_pipeline(
    server: ServerContext,
    pipe: StableDiffusionPipeline,
 ) -> None:
-    if "attention-slicing" in server.optimizations:
+    if "diffusers-attention-slicing" in server.optimizations:
        logger.debug("enabling attention slicing on SD pipeline")
        try:
            pipe.enable_attention_slicing()
        except Exception as e:
            logger.warning("error while enabling attention slicing: %s", e)

-    if "vae-slicing" in server.optimizations:
+    if "diffusers-vae-slicing" in server.optimizations:
        logger.debug("enabling VAE slicing on SD pipeline")
        try:
            pipe.enable_vae_slicing()
        except Exception as e:
            logger.warning("error while enabling VAE slicing: %s", e)

-    if "sequential-cpu-offload" in server.optimizations:
+    if "diffusers-cpu-offload-sequential" in server.optimizations:
        logger.debug("enabling sequential CPU offload on SD pipeline")
        try:
            pipe.enable_sequential_cpu_offload()
        except Exception as e:
            logger.warning("error while enabling sequential CPU offload: %s", e)

-    elif "model-cpu-offload" in server.optimizations:
+    elif "diffusers-cpu-offload-model" in server.optimizations:
        # TODO: check for accelerate
        logger.debug("enabling model CPU offload on SD pipeline")
        try:
@ -121,7 +121,7 @@ def optimize_pipeline(
        except Exception as e:
            logger.warning("error while enabling model CPU offload: %s", e)

-    if "memory-efficient-attention" in server.optimizations:
+    if "diffusers-memory-efficient-attention" in server.optimizations:
        # TODO: check for xformers
        logger.debug("enabling memory efficient attention for SD pipeline")
        try:
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@ -107,13 +107,13 @@ class DeviceParams:
            sess.enable_mem_pattern = False
            sess.enable_mem_reuse = False

-        if "onnx-optimization-disable" in self.optimizations:
+        if "onnx-graph-disable" in self.optimizations:
            logger.debug("disabling all ONNX graph optimizations")
            sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
-        elif "onnx-optimization-basic" in self.optimizations:
+        elif "onnx-graph-basic" in self.optimizations:
            logger.debug("enabling basic ONNX graph optimizations")
            sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
-        elif "onnx-optimization-all" in self.optimizations:
+        elif "onnx-graph-all" in self.optimizations:
            logger.debug("enabling all ONNX graph optimizations")
            sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

--- a/docs/server-admin.md
+++ b/docs/server-admin.md
@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
  - [Configuration](#configuration)
    - [Debug Mode](#debug-mode)
    - [Environment Variables](#environment-variables)
+    - [Pipeline Optimizations](#pipeline-optimizations)
    - [Server Parameters](#server-parameters)
  - [Containers](#containers)
    - [CPU](#cpu)
@ -73,6 +74,39 @@ Others:
 - `ONNX_WEB_SHOW_PROGRESS`
  - show progress bars in the logs
  - disabling this can reduce noise in server logs, especially when logging to a file
+- `ONNX_WEB_OPTIMIZATIONS`
+  - comma-delimited list of optimizations to enable; see [Pipeline Optimizations](#pipeline-optimizations) for the available names
+
+### Pipeline Optimizations
+
+- `diffusers-*`
+  - `diffusers-attention-slicing`
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
+  - `diffusers-cpu-offload-*`
+    - `diffusers-cpu-offload-sequential`
+      - not available for most ONNX pipelines
+      - https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
+    - `diffusers-cpu-offload-model`
+      - not available for most ONNX pipelines
+      - https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
+  - `diffusers-memory-efficient-attention`
+    - requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
+  - `diffusers-vae-slicing`
+    - not available for most ONNX pipelines
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
+- `onnx-*`
+  - `onnx-low-memory`
+    - disable ONNX features that allocate more memory than is strictly required or keep memory after use
+  - `onnx-graph-*`
+    - `onnx-graph-disable`
+      - disable all ONNX graph optimizations
+    - `onnx-graph-basic`
+      - enable basic ONNX graph optimizations
+    - `onnx-graph-all`
+      - enable all ONNX graph optimizations
+  - `onnx-deterministic-compute`
+    - enable ONNX deterministic compute

 ### Server Parameters