chore(docs): explain model optimizations
This commit is contained in:
parent
e0a62ccbb5
commit
bfdb071c2d
|
@ -92,28 +92,28 @@ def optimize_pipeline(
|
|||
server: ServerContext,
|
||||
pipe: StableDiffusionPipeline,
|
||||
) -> None:
|
||||
if "attention-slicing" in server.optimizations:
|
||||
if "diffusers-attention-slicing" in server.optimizations:
|
||||
logger.debug("enabling attention slicing on SD pipeline")
|
||||
try:
|
||||
pipe.enable_attention_slicing()
|
||||
except Exception as e:
|
||||
logger.warning("error while enabling attention slicing: %s", e)
|
||||
|
||||
if "vae-slicing" in server.optimizations:
|
||||
if "diffusers-vae-slicing" in server.optimizations:
|
||||
logger.debug("enabling VAE slicing on SD pipeline")
|
||||
try:
|
||||
pipe.enable_vae_slicing()
|
||||
except Exception as e:
|
||||
logger.warning("error while enabling VAE slicing: %s", e)
|
||||
|
||||
if "sequential-cpu-offload" in server.optimizations:
|
||||
if "diffusers-cpu-offload-sequential" in server.optimizations:
|
||||
logger.debug("enabling sequential CPU offload on SD pipeline")
|
||||
try:
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
except Exception as e:
|
||||
logger.warning("error while enabling sequential CPU offload: %s", e)
|
||||
|
||||
elif "model-cpu-offload" in server.optimizations:
|
||||
elif "diffusers-cpu-offload-model" in server.optimizations:
|
||||
# TODO: check for accelerate
|
||||
logger.debug("enabling model CPU offload on SD pipeline")
|
||||
try:
|
||||
|
@ -121,7 +121,7 @@ def optimize_pipeline(
|
|||
except Exception as e:
|
||||
logger.warning("error while enabling model CPU offload: %s", e)
|
||||
|
||||
if "memory-efficient-attention" in server.optimizations:
|
||||
if "diffusers-memory-efficient-attention" in server.optimizations:
|
||||
# TODO: check for xformers
|
||||
logger.debug("enabling memory efficient attention for SD pipeline")
|
||||
try:
|
||||
|
|
|
@ -107,13 +107,13 @@ class DeviceParams:
|
|||
sess.enable_mem_pattern = False
|
||||
sess.enable_mem_reuse = False
|
||||
|
||||
if "onnx-optimization-disable" in self.optimizations:
|
||||
if "onnx-graph-disable" in self.optimizations:
|
||||
logger.debug("disabling all ONNX graph optimizations")
|
||||
sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
|
||||
elif "onnx-optimization-basic" in self.optimizations:
|
||||
elif "onnx-graph-basic" in self.optimizations:
|
||||
logger.debug("enabling basic ONNX graph optimizations")
|
||||
sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||
elif "onnx-optimization-all" in self.optimizations:
|
||||
elif "onnx-graph-all" in self.optimizations:
|
||||
logger.debug("enabling all ONNX graph optimizations")
|
||||
sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
|
|||
- [Configuration](#configuration)
|
||||
- [Debug Mode](#debug-mode)
|
||||
- [Environment Variables](#environment-variables)
|
||||
- [Pipeline Optimizations](#pipeline-optimizations)
|
||||
- [Server Parameters](#server-parameters)
|
||||
- [Containers](#containers)
|
||||
- [CPU](#cpu)
|
||||
|
@ -73,6 +74,39 @@ Others:
|
|||
- `ONNX_WEB_SHOW_PROGRESS`
|
||||
- show progress bars in the logs
|
||||
- disabling this can reduce noise in server logs, especially when logging to a file
|
||||
- `ONNX_WEB_OPTIMIZATIONS`
|
||||
- comma-delimited list of optimizations to enable (see [Pipeline Optimizations](#pipeline-optimizations) for the available names)
|
||||
|
||||
### Pipeline Optimizations
|
||||
|
||||
- `diffusers-*`
|
||||
- `diffusers-attention-slicing`
|
||||
- https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
|
||||
- `diffusers-cpu-offload-*`
|
||||
- `diffusers-cpu-offload-sequential`
|
||||
- not available for most ONNX pipelines
|
||||
- https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
|
||||
- `diffusers-cpu-offload-model`
|
||||
- not available for most ONNX pipelines
|
||||
- https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
|
||||
- `diffusers-memory-efficient-attention`
|
||||
- requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
|
||||
- https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
|
||||
- `diffusers-vae-slicing`
|
||||
- not available for most ONNX pipelines
|
||||
- https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
|
||||
- `onnx-*`
|
||||
- `onnx-low-memory`
|
||||
- disable ONNX features that allocate more memory than is strictly required or keep memory after use
|
||||
- `onnx-graph-*`
|
||||
- `onnx-graph-disable`
|
||||
- disable all ONNX graph optimizations
|
||||
- `onnx-graph-basic`
|
||||
- enable basic ONNX graph optimizations
|
||||
- `onnx-graph-all`
|
||||
- enable all ONNX graph optimizations
|
||||
- `onnx-deterministic-compute`
|
||||
- enable ONNX deterministic compute
|
||||
|
||||
### Server Parameters
|
||||
|
||||
|
|
Loading…
Reference in New Issue