From c2f8fb1d3128a0ff9b0acf67943985afd3a0a562 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Mon, 27 Mar 2023 08:55:01 -0500
Subject: [PATCH] fix(api): combine names for ONNX fp16 optimization

---
 api/onnx_web/convert/__main__.py |  2 +-
 docs/server-admin.md             |  7 +------
 docs/user-guide.md               | 25 +++++++++++++++++--------
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/api/onnx_web/convert/__main__.py b/api/onnx_web/convert/__main__.py
index 8872afd1..42cdb8bd 100644
--- a/api/onnx_web/convert/__main__.py
+++ b/api/onnx_web/convert/__main__.py
@@ -478,7 +478,7 @@ def main() -> int:
     logger.info("CLI arguments: %s", args)
 
     ctx = ConversionContext.from_environ()
-    ctx.half = args.half or "onnx-internal-fp16" in ctx.optimizations
+    ctx.half = args.half or "onnx-fp16" in ctx.optimizations
     ctx.opset = args.opset
     ctx.token = args.token
     logger.info("converting models in %s using %s", ctx.model_path, ctx.training_device)
diff --git a/docs/server-admin.md b/docs/server-admin.md
index 073255ee..6718c44e 100644
--- a/docs/server-admin.md
+++ b/docs/server-admin.md
@@ -102,9 +102,7 @@ Others:
   - `onnx-deterministic-compute`
     - enable ONNX deterministic compute
   - `onnx-fp16`
-    - force 16-bit floating point values when running pipelines
-    - use with https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/python/tools/transformers/models/stable_diffusion#optimize-onnx-pipeline
-      and the `--float16` flag
+    - convert model nodes to 16-bit floating point values internally while leaving 32-bit inputs
   - `onnx-graph-*`
     - `onnx-graph-disable`
       - disable all ONNX graph optimizations
@@ -112,9 +110,6 @@ Others:
       - enable basic ONNX graph optimizations
     - `onnx-graph-all`
       - enable all ONNX graph optimizations
-  - `onnx-internal-fp16`
-    - convert internal model nodes to 16-bit floating point values
-    - does not reduce disk space as much as `onnx-fp16` or `torch-fp16`, but does not incur as many extra conversions
   - `onnx-low-memory`
     - disable ONNX features that allocate more memory than is strictly required or keep memory after use
 - `torch-*`
diff --git a/docs/user-guide.md b/docs/user-guide.md
index 96b0f37f..3481ac0d 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -725,20 +725,29 @@ Some common VAE models include:
 ### Optimizing models for lower memory usage
 
 Running Stable Diffusion with ONNX acceleration uses more memory by default than some other methods, but there are a
-number of optimizations that you can apply to reduce the memory usage.
+number of [server optimizations](server-admin.md#pipeline-optimizations) that you can apply to reduce the memory usage:
+
+- `diffusers-attention-slicing`
+- `onnx-fp16`
+- `onnx-graph-all`
+- `onnx-low-memory`
+- `torch-fp16`
 
 At least 12GB of VRAM is recommended for running all of the models in the extras file, but `onnx-web` should work on
 most 8GB cards and may work on some 6GB cards. 4GB is not supported yet, but [it should be
 possible](https://github.com/ssube/onnx-web/issues/241#issuecomment-1475341043).
 
-- `diffusers-attention-slicing`
-- `onnx-fp16`
-- `onnx-internal-fp16`
-- `onnx-graph-all`
-- `onnx-low-memory`
-- `torch-fp16`
+Based on somewhat limited testing, the model size and memory usage for each optimization level is approximately:
 
-TODO: memory at different optimization levels
+| Optimizations               | Disk Size | Memory Usage - 1 @ 512x512 | Supported Platforms |
+| --------------------------- | --------- | -------------------------- | ------------------- |
+| none                        | 4.0G      | 11.5G                      | all                 |
+| `onnx-fp16`                 | 2.2G      | 9.9G                       | all                 |
+| ORT script                  | 4.0G      | 6.6G                       | CUDA only           |
+| ORT script with `--float16` | 2.1G      | 5.8G                       | CUDA only           |
+| `torch-fp16`                | 2.0G      | 5.9G                       | CUDA only           |
+
+- https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/python/tools/transformers/models/stable_diffusion#cuda-optimizations-for-stable-diffusion
 
 ### Permanently blending additional networks
 
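For illustration of the behavior this patch documents for `onnx-fp16` (convert internal model nodes to fp16 while keeping 32-bit graph inputs), the sketch below uses the `onnx` and `onnxconverter-common` packages. It is not taken from the onnx-web converter, and the model paths are placeholders.

```python
# Sketch only: convert a model's internal nodes to float16 while keeping
# float32 graph inputs/outputs, similar to the `onnx-fp16` description above.
# Assumes onnx and onnxconverter-common are installed; paths are placeholders.
import onnx
from onnxconverter_common import float16

model = onnx.load("unet/model.onnx")

# keep_io_types=True leaves the graph inputs/outputs in float32 and inserts
# casts around the converted fp16 nodes.
model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)

onnx.save(model_fp16, "unet/model.fp16.onnx")
```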