--fast now takes an optional number as an argument to indicate how fast you want it.
The idea is that you can indicate how much quality vs. speed you want.

At the moment:

--fast 2 enables fp16 accumulation if your PyTorch version supports it.
--fast 5 enables fp8 matrix multiplication on fp8 models, plus the optimization above.

--fast without a number enables all optimizations.
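
For reference, here is a minimal standalone sketch of how the new flag parses (the argparse definition is copied from the comfy/cli_args.py diff below; the const=99 sentinel is what makes a bare --fast mean maximum speed):

import argparse

# Same definition as the new line in comfy/cli_args.py.
parser = argparse.ArgumentParser()
parser.add_argument("--fast", metavar="number", type=int, const=99, default=0, nargs="?")

print(parser.parse_args([]).fast)               # 0  -> optimizations disabled
print(parser.parse_args(["--fast", "2"]).fast)  # 2  -> fp16 accumulation
print(parser.parse_args(["--fast"]).fast)       # 99 -> all optimizations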
comfyanonymous committed Feb 28, 2025
1 parent eb45434 commit cf0b549
Showing 3 changed files with 4 additions and 3 deletions.
2 changes: 1 addition & 1 deletion comfy/cli_args.py
@@ -130,7 +130,7 @@ class LatentPreviewMethod(enum.Enum):

 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to aggressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
-parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")
+parser.add_argument("--fast", metavar="number", type=int, const=99, default=0, nargs="?", help="Enable some untested and potentially quality deteriorating optimizations. You can pass a number from 0 to 10 for a bigger speed vs quality tradeoff. Using --fast with no number means maximum speed. 2 or larger enables fp16 accumulation, 5 or larger enables fp8 matrix multiplication.")

 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
3 changes: 2 additions & 1 deletion comfy/model_management.py
@@ -280,9 +280,10 @@ def is_amd():

 PRIORITIZE_FP16 = False # TODO: remove and replace with something that shows exactly which dtype is faster than the other
 try:
-    if is_nvidia() and args.fast:
+    if is_nvidia() and args.fast >= 2:
         torch.backends.cuda.matmul.allow_fp16_accumulation = True
         PRIORITIZE_FP16 = True # TODO: limit to cards where it actually boosts performance
+        logging.info("Enabled fp16 accumulation.")
 except:
     pass
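
The bare except above is load-bearing: on PyTorch builds that predate the allow_fp16_accumulation knob, the attribute assignment raises, and the exact exception type isn't guaranteed across versions. A hedged standalone sketch of the same guard (the helper name is mine, not ComfyUI's):

import logging
import torch

def try_enable_fp16_accumulation() -> bool:
    # Hypothetical helper: attempt to flip the knob and report whether
    # this PyTorch build supports it. The broad catch mirrors the bare
    # except in model_management.py, since the raised type may vary.
    try:
        torch.backends.cuda.matmul.allow_fp16_accumulation = True
        logging.info("Enabled fp16 accumulation.")
        return True
    except Exception:
        return False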

2 changes: 1 addition & 1 deletion comfy/ops.py
@@ -360,7 +360,7 @@ def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_
     if scaled_fp8 is not None:
         return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)

-    if fp8_compute and (fp8_optimizations or args.fast) and not disable_fast_fp8:
+    if fp8_compute and (fp8_optimizations or args.fast >= 5) and not disable_fast_fp8:
         return fp8_ops

     if compute_dtype is None or weight_dtype == compute_dtype:
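
Pulled out as a pure function, the new gate reads as follows (a sketch for illustration; the helper name is hypothetical, the condition itself is verbatim from the diff):

def should_use_fast_fp8(fp8_compute: bool, fp8_optimizations: bool,
                        fast_level: int, disable_fast_fp8: bool) -> bool:
    # fp8 matrix mult now requires --fast >= 5 (or model-side
    # fp8_optimizations) instead of any truthy --fast value.
    return fp8_compute and (fp8_optimizations or fast_level >= 5) and not disable_fast_fp8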
