From c2f8fb1d3128a0ff9b0acf67943985afd3a0a562 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Mon, 27 Mar 2023 08:55:01 -0500
Subject: [PATCH] fix(api): combine names for ONNX fp16 optimization

---
 api/onnx_web/convert/__main__.py |  2 +-
 docs/server-admin.md             |  7 +------
 docs/user-guide.md               | 25 +++++++++++++++++--------
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/api/onnx_web/convert/__main__.py b/api/onnx_web/convert/__main__.py
index 8872afd1..42cdb8bd 100644
--- a/api/onnx_web/convert/__main__.py
+++ b/api/onnx_web/convert/__main__.py
@@ -478,7 +478,7 @@ def main() -> int:
     logger.info("CLI arguments: %s", args)
 
     ctx = ConversionContext.from_environ()
-    ctx.half = args.half or "onnx-internal-fp16" in ctx.optimizations
+    ctx.half = args.half or "onnx-fp16" in ctx.optimizations
     ctx.opset = args.opset
     ctx.token = args.token
     logger.info("converting models in %s using %s", ctx.model_path, ctx.training_device)
diff --git a/docs/server-admin.md b/docs/server-admin.md
index 073255ee..6718c44e 100644
--- a/docs/server-admin.md
+++ b/docs/server-admin.md
@@ -102,9 +102,7 @@ Others:
   - `onnx-deterministic-compute`
     - enable ONNX deterministic compute
   - `onnx-fp16`
-    - force 16-bit floating point values when running pipelines
-    - use with https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/python/tools/transformers/models/stable_diffusion#optimize-onnx-pipeline
-      and the `--float16` flag
+    - convert model nodes to 16-bit floating point values internally while leaving 32-bit inputs
   - `onnx-graph-*`
     - `onnx-graph-disable`
       - disable all ONNX graph optimizations
@@ -112,9 +110,6 @@ Others:
       - enable basic ONNX graph optimizations
     - `onnx-graph-all`
       - enable all ONNX graph optimizations
-  - `onnx-internal-fp16`
-    - convert internal model nodes to 16-bit floating point values
-    - does not reduce disk space as much as `onnx-fp16` or `torch-fp16`, but does not incur as many extra conversions
   - `onnx-low-memory`
     - disable ONNX features that allocate more memory than is strictly required or keep memory after use
 - `torch-*`
diff --git a/docs/user-guide.md b/docs/user-guide.md
index 96b0f37f..3481ac0d 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -725,20 +725,29 @@ Some common VAE models include:
 ### Optimizing models for lower memory usage
 
 Running Stable Diffusion with ONNX acceleration uses more memory by default than some other methods, but there are a
-number of optimizations that you can apply to reduce the memory usage.
+number of [server optimizations](server-admin.md#pipeline-optimizations) that you can apply to reduce the memory usage:
+
+- `diffusers-attention-slicing`
+- `onnx-fp16`
+- `onnx-graph-all`
+- `onnx-low-memory`
+- `torch-fp16`
 
 At least 12GB of VRAM is recommended for running all of the models in the extras file, but `onnx-web` should work on
 most 8GB cards and may work on some 6GB cards. 4GB is not supported yet, but [it should be
 possible](https://github.com/ssube/onnx-web/issues/241#issuecomment-1475341043).
 
-- `diffusers-attention-slicing`
-- `onnx-fp16`
-- `onnx-internal-fp16`
-- `onnx-graph-all`
-- `onnx-low-memory`
-- `torch-fp16`
+Based on somewhat limited testing, the model size and memory usage for each optimization level is approximately:
 
-TODO: memory at different optimization levels
+| Optimizations               | Disk Size | Memory Usage - 1 @ 512x512 | Supported Platforms |
+| --------------------------- | --------- | -------------------------- | ------------------- |
+| none                        | 4.0G      | 11.5G                      | all                 |
+| `onnx-fp16`                 | 2.2G      | 9.9G                       | all                 |
+| ORT script                  | 4.0G      | 6.6G                       | CUDA only           |
+| ORT script with `--float16` | 2.1G      | 5.8G                       | CUDA only           |
+| `torch-fp16`                | 2.0G      | 5.9G                       | CUDA only           |
+
+- https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/python/tools/transformers/models/stable_diffusion#cuda-optimizations-for-stable-diffusion
 
 ### Permanently blending additional networks
 
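For illustration of the behavior this patch documents for `onnx-fp16` (convert internal model nodes to fp16 while keeping 32-bit graph inputs), the sketch below uses the `onnx` and `onnxconverter-common` packages. It is not taken from the onnx-web converter, and the model paths are placeholders.

```python
# Sketch only: convert a model's internal nodes to float16 while keeping
# float32 graph inputs/outputs, similar to the `onnx-fp16` description above.
# Assumes onnx and onnxconverter-common are installed; paths are placeholders.
import onnx
from onnxconverter_common import float16

model = onnx.load("unet/model.onnx")

# keep_io_types=True leaves the graph inputs/outputs in float32 and inserts
# casts around the converted fp16 nodes.
model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)

onnx.save(model_fp16, "unet/model.fp16.onnx")
```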