main.py

import argparse
import uvicorn
from api import app
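
# `app` is the web application defined in api.py (served as an ASGI app by
# uvicorn below); main() attaches the loaded FluxPipeline to app.state.model
# before handing the app to the server.
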
def parse_args():
    parser = argparse.ArgumentParser(description="Launch Flux API server")
    parser.add_argument(
        "-c",
        "--config-path",
        type=str,
        help="Path to the configuration file; if not provided, the model is configured from the command-line arguments",
    )
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=8088,
        help="Port to run the server on",
    )
    parser.add_argument(
        "-H",
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to run the server on",
    )
    parser.add_argument(
        "-f", "--flow-model-path", type=str, help="Path to the flow model"
    )
    parser.add_argument(
        "-t", "--text-enc-path", type=str, help="Path to the text encoder"
    )
    parser.add_argument(
        "-a", "--autoencoder-path", type=str, help="Path to the autoencoder"
    )
    parser.add_argument(
        "-m",
        "--model-version",
        type=str,
        choices=["flux-dev", "flux-schnell"],
        default="flux-dev",
        help="Choose model version",
    )
    parser.add_argument(
        "-F",
        "--flux-device",
        type=str,
        default="cuda:0",
        help="Device to run the flow model on",
    )
    parser.add_argument(
        "-T",
        "--text-enc-device",
        type=str,
        default="cuda:0",
        help="Device to run the text encoder on",
    )
    parser.add_argument(
        "-A",
        "--autoencoder-device",
        type=str,
        default="cuda:0",
        help="Device to run the autoencoder on",
    )
    parser.add_argument(
        "-q",
        "--num-to-quant",
        type=int,
        default=20,
        help="Number of linear layers in the flow transformer (the 'unet') to quantize",
    )
    parser.add_argument(
        "-C",
        "--compile",
        action="store_true",
        default=False,
        help="Compile the flow model with extra optimizations",
    )
    parser.add_argument(
        "-qT",
        "--quant-text-enc",
        type=str,
        default="qfloat8",
        choices=["qint4", "qfloat8", "qint2", "qint8", "bf16"],
        help="Quantize the T5 text encoder to the given dtype; if bf16, it will not be quantized",
        dest="quant_text_enc",
    )
    parser.add_argument(
        "-qA",
        "--quant-ae",
        action="store_true",
        default=False,
        help="Quantize the autoencoder with float8 linear layers; otherwise bfloat16 is used",
        dest="quant_ae",
    )
    parser.add_argument(
        "-OF",
        "--offload-flow",
        action="store_true",
        default=False,
        dest="offload_flow",
        help="Offload the flow model to the CPU when not in use to save memory",
    )
    parser.add_argument(
        "-OA",
        "--no-offload-ae",
        action="store_false",
        default=True,
        dest="offload_ae",
        help="Disable offloading the autoencoder to the CPU when not in use, to increase end-to-end inference speed",
    )
    parser.add_argument(
        "-OT",
        "--no-offload-text-enc",
        action="store_false",
        default=True,
        dest="offload_text_enc",
        help="Disable offloading the text encoder to the CPU when not in use, to increase end-to-end inference speed",
    )
    parser.add_argument(
        "-PF",
        "--prequantized-flow",
        action="store_true",
        default=False,
        dest="prequantized_flow",
        help="Load the flow model from a prequantized checkpoint "
        + "(requires loading the flow model, running a minimum of 24 steps, "
        + "and then saving the state_dict as a safetensors file), "
        + "which reduces the checkpoint size by about 50% and reduces startup time",
    )
    parser.add_argument(
        "-nqfm",
        "--no-quantize-flow-modulation",
        action="store_false",
        default=True,
        dest="quantize_modulation",
        help="Disable quantization of the modulation layers in the flow model; adds ~2GB VRAM usage for a moderate precision improvement",
    )
    parser.add_argument(
        "-qfl",
        "--quantize-flow-embedder-layers",
        action="store_true",
        default=False,
        dest="quantize_flow_embedder_layers",
        help="Quantize the flow embedder layers in the flow model; saves ~512MB VRAM, but the precision loss is very noticeable",
    )
    return parser.parse_args()
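

# Example invocations (illustrative only; the checkpoint paths below are
# placeholders, not files that ship with this repository):
#
#   # Configure everything from the command line:
#   python main.py -m flux-dev -f /path/to/flux-dev.safetensors \
#       -t /path/to/t5-encoder -a /path/to/autoencoder.safetensors -p 8088
#
#   # Or load all settings from a config file:
#   python main.py --config-path /path/to/config.json

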
def main():
    args = parse_args()

    # Lazy imports so the CLI returns quickly instead of waiting for torch to load its modules
    from flux_pipeline import FluxPipeline
    from util import load_config, ModelVersion

    if args.config_path:
        app.state.model = FluxPipeline.load_pipeline_from_config_path(
            args.config_path, flow_model_path=args.flow_model_path
        )
    else:
        model_version = (
            ModelVersion.flux_dev
            if args.model_version == "flux-dev"
            else ModelVersion.flux_schnell
        )
        config = load_config(
            model_version,
            flux_path=args.flow_model_path,
            flux_device=args.flux_device,
            ae_path=args.autoencoder_path,
            ae_device=args.autoencoder_device,
            text_enc_path=args.text_enc_path,
            text_enc_device=args.text_enc_device,
            flow_dtype="float16",
            text_enc_dtype="bfloat16",
            ae_dtype="bfloat16",
            num_to_quant=args.num_to_quant,
            compile_extras=args.compile,
            compile_blocks=args.compile,
            quant_text_enc=(
                None if args.quant_text_enc == "bf16" else args.quant_text_enc
            ),
            quant_ae=args.quant_ae,
            offload_flow=args.offload_flow,
            offload_ae=args.offload_ae,
            offload_text_enc=args.offload_text_enc,
            prequantized_flow=args.prequantized_flow,
            quantize_modulation=args.quantize_modulation,
            quantize_flow_embedder_layers=args.quantize_flow_embedder_layers,
        )
        app.state.model = FluxPipeline.load_pipeline_from_config(config)

    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()