From 062dd871b2e7fd6c1e5a97adf3db7a6e75b67b63 Mon Sep 17 00:00:00 2001
From: angelayi
Date: Thu, 9 Jan 2025 16:13:26 -0800
Subject: [PATCH] [aoti] Remove need for -l in cmake

---
 .github/workflows/runner-cuda-dtype.yml |  2 +-
 README.md                               |  2 +-
 runner/run.cpp                          | 61 ++++++++-------
 torchchat/export.py                     | 77 +++++++++++++++++--------
 4 files changed, 79 insertions(+), 63 deletions(-)

diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml
index 1813f483e..4cfb9ff09 100644
--- a/.github/workflows/runner-cuda-dtype.yml
+++ b/.github/workflows/runner-cuda-dtype.yml
@@ -52,7 +52,7 @@ jobs:
 
           python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2
 
-          ./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
+          ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
 
         done
 
diff --git a/README.md b/README.md
index 1cfb06e22..2448b0b72 100644
--- a/README.md
+++ b/README.md
@@ -341,7 +341,7 @@ torchchat/utils/scripts/build_native.sh aoti
 Then run the compiled executable, with the pt2.
 
 ```bash
-cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
+cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
 ```
 
 ## Mobile Execution
diff --git a/runner/run.cpp b/runner/run.cpp
index f2b8e8e6b..e5c818cfa 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -102,6 +102,7 @@ typedef struct {
 typedef struct {
   Config config;  // the hyperparameters of the architecture (the blueprint)
   RunState state; // buffers for the "wave" of activations in the forward pass
+  std::unordered_map<std::string, std::string> metadata;
 
 #ifdef __AOTI_MODEL__
   torch::inductor::AOTIModelPackageLoader *runner;
@@ -141,20 +142,9 @@ void read_checkpoint(char *checkpoint, Config *config) {
   config->vocab_size = abs(config->vocab_size);
 }
 
-void build_transformer(Transformer *t, char *model_path, int vocab_size,
-                       int seq_len) {
-  // read in the Config and the Weights from the model
-  // read_checkpoint(model_path, &t->config);
-  // allocate the RunState buffers
-  t->config.vocab_size = vocab_size;
-  t->config.seq_len = seq_len;
-  malloc_run_state(&t->state, &t->config);
-
+void build_transformer(Transformer *t, char *model_path) {
 #ifdef __AOTI_MODEL__
   t->runner = new torch::inductor::AOTIModelPackageLoader(model_path);
-  aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu"
-                    ? torch::Device(torch::kCPU)
-                    : torch::Device(torch::kCUDA);
 #else //__ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,
@@ -776,9 +766,6 @@ void error_usage() {
           " -v (optional) vocab size, default is model-specific.\n");
   fprintf(stderr,
           " -l (optional) llama version (2 or 3), default 2.\n");
-  fprintf(
-      stderr,
-      " -d (optional) device(CUDA or CPU) model was exported for\n");
 
   exit(EXIT_FAILURE);
 }
@@ -848,37 +835,35 @@ int main(int argc, char *argv[]) {
       system_prompt = argv[i + 1];
     } else if (argv[i][1] == 'l') {
      llama_ver = atoi(argv[i + 1]);
-#ifdef __AOTI_MODEL__
-    } else if (argv[i][1] == 'd') {
-#ifdef USE_CUDA
-      if (strcasecmp(argv[i + 1], "CUDA") == 0) {
-        aoti_device = torch::Device(torch::kCUDA);
-      } else
-#endif
-          if (strcasecmp(argv[i + 1], "CPU") == 0) {
-        aoti_device = torch::Device(torch::kCPU);
-      } else {
-        fprintf(stderr, "Unknown device %s", argv[i + 1]);
-        exit(1);
-      }
-#endif
     } else {
       error_usage();
     }
   }
 
+  if (model_path == NULL) {
+    fprintf(stderr, "No model_path provided.");
+    error_usage();
+  }
+
+  Transformer transformer;
+  build_transformer(&transformer, model_path);
+
+#ifdef __AOTI_MODEL__
+  auto aoti_metadata = transformer.runner->get_metadata();
+  aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu"
+                    ? torch::Device(torch::kCPU)
+                    : torch::Device(torch::kCUDA);
+  ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
+#else // __ET_MODEL__
   ModelType model_type = get_model_type(llama_ver);
+#endif
+
   if (model_type == UNKNOWN_MODEL) {
     fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.", llama_ver);
     error_usage();
   }
 
-  if (model_path == NULL) {
-    fprintf(stderr, "No model_path provided.");
-    error_usage();
-  }
-
   if (tokenizer_path == NULL) {
     fprintf(stderr, "No tokenizer_path provided.");
     error_usage();
   }
@@ -901,8 +886,12 @@ int main(int argc, char *argv[]) {
     vocab_size = tokenizer->vocab_size();
   }
 
-  Transformer transformer;
-  build_transformer(&transformer, model_path, vocab_size, steps);
+  // read in the Config and the Weights from the model
+  // read_checkpoint(model_path, &t->config);
+  // allocate the RunState buffers
+  transformer.config.vocab_size = vocab_size;
+  transformer.config.seq_len = steps;
+  malloc_run_state(&transformer.state, &transformer.config);
 
   Sampler sampler;
   build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);
diff --git a/torchchat/export.py b/torchchat/export.py
index 979778b7c..e84a344bd 100644
--- a/torchchat/export.py
+++ b/torchchat/export.py
@@ -5,13 +5,13 @@
 # LICENSE file in the root directory of this source tree.
 
 import os
-from typing import Optional
+from typing import Dict, Optional
 
 import torch
+import torch._inductor
 import torch.nn as nn
 from torch.export import Dim
 
-import torch._inductor
 
 from torchchat.cli.builder import (
     _initialize_model,
@@ -39,6 +39,7 @@ def export_for_server(
     output_path: str = "model.pt2",
     dynamic_shapes: bool = False,
     package: bool = True,
+    metadata: Optional[Dict[str, str]] = None,
 ) -> str:
     """
     Export the model using AOT Compile to get a .dso for server use cases.
@@ -67,8 +68,10 @@ def export_for_server(
         dynamic_shapes = None
 
     with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
-        metadata = {}  # TODO: put more metadata here
-        options = {"aot_inductor.package": package, "aot_inductor.metadata": metadata}
+        options = {
+            "aot_inductor.package": package,
+            "aot_inductor.metadata": metadata or {},
+        }
         if not package:
             options = {"aot_inductor.output_path": output_path}
 
@@ -81,6 +84,7 @@ def export_for_server(
 
     if package:
         from torch._inductor.package import package_aoti
+
         path = package_aoti(output_path, path)
         print(f"The generated packaged model can be found at: {path}")
 
@@ -102,13 +106,13 @@ def export_for_server(
     from typing import Any, Dict, Tuple, Union
 
     import executorch.exir as exir
+    from executorch.backends.xnnpack._passes.convert_to_linear import (
+        ConvertToLinearPass,
+    )
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
         XnnpackDynamicallyQuantizedPartitioner,
     )
-    from executorch.backends.xnnpack._passes.convert_to_linear import (
-        ConvertToLinearPass,
-    )
     from executorch.exir import EdgeProgramManager, to_edge
 
     from executorch.exir.capture._config import (
@@ -166,18 +170,22 @@ def __init__(self, attention: Attention):
 
             self.wo = attention.wo
 
-            max_batch_size, n_heads, max_seq_length, head_dim = (
-                attention.kv_cache[0].k_cache.shape
-            )
+            max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[
+                0
+            ].k_cache.shape
             cache_dtype = attention.kv_cache[0].k_cache.dtype
             # The `Attention` module being replaced can have multiple KV caches
             # (denoted by `cache_lanes`). Thus we follow the same setup format
             # as in `Attention.setup_cache`.
             cache_lanes = len(attention.kv_cache)
-            self.kv_cache = nn.ModuleList([
-                CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype)
-                for _ in range(cache_lanes)
-            ])
+            self.kv_cache = nn.ModuleList(
+                [
+                    CustomKVCache(
+                        max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype
+                    )
+                    for _ in range(cache_lanes)
+                ]
+            )
 
             self.n_heads = attention.n_heads
             self.head_dim = attention.head_dim
@@ -215,9 +223,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0):
             return self.wo(output)
 
     def replace_attention_with_custom_sdpa_attention(module: nn.Module):
-        from executorch.extension.llm.custom_ops import (  # noqa
-            sdpa_with_kv_cache,
-        )
+        from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa
 
         for name, child in module.named_children():
             if isinstance(child, Attention):
@@ -238,7 +244,9 @@ def _to_core_aten(
             raise ValueError(
                 f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}"
             )
-        core_aten_ep = export_for_training(model, example_inputs, dynamic_shapes=dynamic_shapes)
+        core_aten_ep = export_for_training(
+            model, example_inputs, dynamic_shapes=dynamic_shapes
+        )
         if verbose:
             logging.info(f"Core ATen graph:\n{core_aten_ep.graph}")
         return core_aten_ep
@@ -350,7 +358,11 @@ def main(args):
 
     print(f"Using device={builder_args.device}")
     set_precision(builder_args.precision)
-    set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path)
+    set_backend(
+        dso=args.output_dso_path,
+        pte=args.output_pte_path,
+        aoti_package=args.output_aoti_package_path,
+    )
 
     builder_args.dso_path = None
     builder_args.pte_path = None
@@ -372,6 +384,7 @@ def main(args):
 
     # TODO: clean this up
     # This mess is because ET does not support _weight_int4pack_mm right now
+    tokenizer_args = None
    if not builder_args.gguf_path:
         # tokenizer needed for quantization so get that here,
         try:
@@ -382,9 +395,8 @@ def main(args):
 
     if builder_args.max_seq_length is None:
         if (
-            (output_dso_path is not None or output_aoti_package_path is not None)
-            and not builder_args.dynamic_shapes
-        ):
+            output_dso_path is not None or output_aoti_package_path is not None
+        ) and not builder_args.dynamic_shapes:
             print("Setting max_seq_length to 300 for DSO export.")
             builder_args.max_seq_length = 300
         elif output_pte_path is not None:
@@ -397,7 +409,8 @@ def main(args):
         quantize,
         tokenizer,
         max_seq_length=builder_args.max_seq_length,
-        support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None,
+        support_tensor_subclass=output_dso_path is None
+        and output_aoti_package_path is None,
     )
     model_to_pte = model
     model_to_dso = model
@@ -435,7 +448,9 @@ def main(args):
     if output_dso_path:
         output_dso_path = str(os.path.abspath(output_dso_path))
         print(f"Exporting model using AOT Inductor to {output_dso_path}")
-        print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.")
+        print(
+            "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead."
+        )
         export_for_server(
             model_to_dso,
             builder_args.device,
@@ -446,11 +461,23 @@ def main(args):
 
     if output_aoti_package_path:
         output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))
-        print(f"Exporting model using AOT Inductor to {output_aoti_package_path}")
+
+        if tokenizer_args is None:
+            tokenizer_type = "0"
+        elif tokenizer_args.is_sentencepiece:
+            tokenizer_type = "2"  # Corresponding to llama2
+        else:
+            tokenizer_type = "3"  # Corresponding to llama3
+
+        metadata = {"tokenizer_type": tokenizer_type}
+        print(
+            "Exporting model using AOT Inductor to " f"{output_aoti_package_path}."
+        )
         export_for_server(
             model_to_aoti_package,
             builder_args.device,
             output_aoti_package_path,
             builder_args.dynamic_shapes,
             package=True,
+            metadata=metadata,
         )
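
Note on the resulting workflow: `torchchat.py export` now records the tokenizer type in the .pt2 package metadata (`{"tokenizer_type": ...}`), and `aoti_run` reads the device from the package's `AOTI_DEVICE_KEY` entry and the model type from `tokenizer_type` via `AOTIModelPackageLoader::get_metadata()`, so neither `-d` nor `-l` is needed for AOTI runs. A minimal sketch of the flow, reusing the commands from the workflow and README hunks above (checkpoint, model, and tokenizer paths are illustrative):

```bash
# Export: the exporter bakes tokenizer_type into the .pt2 package metadata.
python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2

# Run: no -d (device) or -l (llama version) flags; both are inferred from the package metadata.
./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
```

The ExecuTorch (`__ET_MODEL__`) path is unchanged: a .pte file carries no such package metadata, so the runner still derives the model type from the `-l` flag there.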