1
0
Fork 0

chore(docs): explain model optimizations

This commit is contained in:
Sean Sube 2023-02-18 16:06:05 -06:00
parent e0a62ccbb5
commit bfdb071c2d
Signed by: ssube
GPG Key ID: 3EED7B957D362AF1
3 changed files with 42 additions and 8 deletions

View File

@ -92,28 +92,28 @@ def optimize_pipeline(
server: ServerContext, server: ServerContext,
pipe: StableDiffusionPipeline, pipe: StableDiffusionPipeline,
) -> None: ) -> None:
if "attention-slicing" in server.optimizations: if "diffusers-attention-slicing" in server.optimizations:
logger.debug("enabling attention slicing on SD pipeline") logger.debug("enabling attention slicing on SD pipeline")
try: try:
pipe.enable_attention_slicing() pipe.enable_attention_slicing()
except Exception as e: except Exception as e:
logger.warning("error while enabling attention slicing: %s", e) logger.warning("error while enabling attention slicing: %s", e)
if "vae-slicing" in server.optimizations: if "diffusers-vae-slicing" in server.optimizations:
logger.debug("enabling VAE slicing on SD pipeline") logger.debug("enabling VAE slicing on SD pipeline")
try: try:
pipe.enable_vae_slicing() pipe.enable_vae_slicing()
except Exception as e: except Exception as e:
logger.warning("error while enabling VAE slicing: %s", e) logger.warning("error while enabling VAE slicing: %s", e)
if "sequential-cpu-offload" in server.optimizations: if "diffusers-cpu-offload-sequential" in server.optimizations:
logger.debug("enabling sequential CPU offload on SD pipeline") logger.debug("enabling sequential CPU offload on SD pipeline")
try: try:
pipe.enable_sequential_cpu_offload() pipe.enable_sequential_cpu_offload()
except Exception as e: except Exception as e:
logger.warning("error while enabling sequential CPU offload: %s", e) logger.warning("error while enabling sequential CPU offload: %s", e)
elif "model-cpu-offload" in server.optimizations: elif "diffusers-cpu-offload-model" in server.optimizations:
# TODO: check for accelerate # TODO: check for accelerate
logger.debug("enabling model CPU offload on SD pipeline") logger.debug("enabling model CPU offload on SD pipeline")
try: try:
@ -121,7 +121,7 @@ def optimize_pipeline(
except Exception as e: except Exception as e:
logger.warning("error while enabling model CPU offload: %s", e) logger.warning("error while enabling model CPU offload: %s", e)
if "memory-efficient-attention" in server.optimizations: if "diffusers-memory-efficient-attention" in server.optimizations:
# TODO: check for xformers # TODO: check for xformers
logger.debug("enabling memory efficient attention for SD pipeline") logger.debug("enabling memory efficient attention for SD pipeline")
try: try:

View File

@ -107,13 +107,13 @@ class DeviceParams:
sess.enable_mem_pattern = False sess.enable_mem_pattern = False
sess.enable_mem_reuse = False sess.enable_mem_reuse = False
if "onnx-optimization-disable" in self.optimizations: if "onnx-graph-disable" in self.optimizations:
logger.debug("disabling all ONNX graph optimizations") logger.debug("disabling all ONNX graph optimizations")
sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
elif "onnx-optimization-basic" in self.optimizations: elif "onnx-graph-basic" in self.optimizations:
logger.debug("enabling basic ONNX graph optimizations") logger.debug("enabling basic ONNX graph optimizations")
sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
elif "onnx-optimization-all" in self.optimizations: elif "onnx-graph-all" in self.optimizations:
logger.debug("enabling all ONNX graph optimizations") logger.debug("enabling all ONNX graph optimizations")
sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

View File

@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
- [Configuration](#configuration) - [Configuration](#configuration)
- [Debug Mode](#debug-mode) - [Debug Mode](#debug-mode)
- [Environment Variables](#environment-variables) - [Environment Variables](#environment-variables)
- [Pipeline Optimizations](#pipeline-optimizations)
- [Server Parameters](#server-parameters) - [Server Parameters](#server-parameters)
- [Containers](#containers) - [Containers](#containers)
- [CPU](#cpu) - [CPU](#cpu)
@ -73,6 +74,39 @@ Others:
- `ONNX_WEB_SHOW_PROGRESS` - `ONNX_WEB_SHOW_PROGRESS`
- show progress bars in the logs - show progress bars in the logs
- disabling this can reduce noise in server logs, especially when logging to a file - disabling this can reduce noise in server logs, especially when logging to a file
- `ONNX_WEB_OPTIMIZATIONS`
- comma-delimited list of optimizations to enable
### Pipeline Optimizations
- `diffusers-*`
- `diffusers-attention-slicing`
- https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
- `diffusers-cpu-offload-*`
- `diffusers-cpu-offload-sequential`
- not available for ONNX pipelines, which includes most of the pipelines in this project
- https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
- `diffusers-cpu-offload-model`
- not available for ONNX pipelines, which includes most of the pipelines in this project
- https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
- `diffusers-memory-efficient-attention`
- requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
- https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
- `diffusers-vae-slicing`
- not available for ONNX pipelines, which includes most of the pipelines in this project
- https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
- `onnx-*`
- `onnx-low-memory`
- disable ONNX features that allocate more memory than is strictly required, or that keep memory allocated after use
- `onnx-graph-*`
- `onnx-graph-disable`
- disable all ONNX graph optimizations
- `onnx-graph-basic`
- enable basic ONNX graph optimizations
- `onnx-graph-all`
- enable all ONNX graph optimizations
- `onnx-deterministic-compute`
- enable ONNX deterministic compute
### Server Parameters ### Server Parameters