[aoti] Remove need for -l in cmake call #1159

Open
wants to merge 1 commit into main
2 changes: 1 addition & 1 deletion .github/workflows/runner-cuda-dtype.yml
@@ -52,7 +52,7 @@ jobs:

python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2

./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"

done

2 changes: 1 addition & 1 deletion README.md
@@ -341,7 +341,7 @@ torchchat/utils/scripts/build_native.sh aoti

Then run the compiled executable, with the pt2.
```bash
cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
```

## Mobile Execution
61 changes: 25 additions & 36 deletions runner/run.cpp
@@ -102,6 +102,7 @@ typedef struct {
typedef struct {
Config config; // the hyperparameters of the architecture (the blueprint)
RunState state; // buffers for the "wave" of activations in the forward pass
std::unordered_map<std::string, std::string> metadata;

#ifdef __AOTI_MODEL__
torch::inductor::AOTIModelPackageLoader *runner;
@@ -141,20 +142,9 @@ void read_checkpoint(char *checkpoint, Config *config) {
config->vocab_size = abs(config->vocab_size);
}

void build_transformer(Transformer *t, char *model_path, int vocab_size,
int seq_len) {
// read in the Config and the Weights from the model
// read_checkpoint(model_path, &t->config);
// allocate the RunState buffers
t->config.vocab_size = vocab_size;
t->config.seq_len = seq_len;
malloc_run_state(&t->state, &t->config);

void build_transformer(Transformer *t, char *model_path) {
#ifdef __AOTI_MODEL__
t->runner = new torch::inductor::AOTIModelPackageLoader(model_path);
aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu"
? torch::Device(torch::kCPU)
: torch::Device(torch::kCUDA);
#else //__ET_MODEL__
t->runner = new Module(
/* path to PTE model */ model_path,
@@ -776,9 +766,6 @@ void error_usage() {
" -v <int> (optional) vocab size, default is model-specific.\n");
fprintf(stderr,
" -l <int> (optional) llama version (2 or 3), default 2.\n");
fprintf(
stderr,
" -d <string> (optional) device(CUDA or CPU) model was exported for\n");
exit(EXIT_FAILURE);
}

@@ -848,37 +835,35 @@ int main(int argc, char *argv[]) {
system_prompt = argv[i + 1];
} else if (argv[i][1] == 'l') {
llama_ver = atoi(argv[i + 1]);
#ifdef __AOTI_MODEL__
} else if (argv[i][1] == 'd') {
#ifdef USE_CUDA
if (strcasecmp(argv[i + 1], "CUDA") == 0) {
aoti_device = torch::Device(torch::kCUDA);
} else
#endif
if (strcasecmp(argv[i + 1], "CPU") == 0) {
aoti_device = torch::Device(torch::kCPU);
} else {
fprintf(stderr, "Unknown device %s", argv[i + 1]);
exit(1);
}
#endif
} else {
error_usage();
}
}

if (model_path == NULL) {
fprintf(stderr, "No model_path provided.");
error_usage();
}

Transformer transformer;
build_transformer(&transformer, model_path);

#ifdef __AOTI_MODEL__
auto aoti_metadata = transformer.runner->get_metadata();
aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu"
? torch::Device(torch::kCPU)
: torch::Device(torch::kCUDA);
ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
#else // __ET_MODEL__
ModelType model_type = get_model_type(llama_ver);
#endif

if (model_type == UNKNOWN_MODEL) {
fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.",
llama_ver);
error_usage();
}

if (model_path == NULL) {
fprintf(stderr, "No model_path provided.");
error_usage();
}

if (tokenizer_path == NULL) {
fprintf(stderr, "No tokenizer_path provided.");
error_usage();
@@ -901,8 +886,12 @@ int main(int argc, char *argv[]) {
vocab_size = tokenizer->vocab_size();
}

Transformer transformer;
build_transformer(&transformer, model_path, vocab_size, steps);
// read in the Config and the Weights from the model
// read_checkpoint(model_path, &t->config);
// allocate the RunState buffers
transformer.config.vocab_size = vocab_size;
transformer.config.seq_len = steps;
malloc_run_state(&transformer.state, &transformer.config);

Sampler sampler;
build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);
77 changes: 52 additions & 25 deletions torchchat/export.py
@@ -5,13 +5,13 @@
# LICENSE file in the root directory of this source tree.

import os
from typing import Optional
from typing import Dict, Optional

import torch
import torch._inductor
import torch.nn as nn

from torch.export import Dim
import torch._inductor

from torchchat.cli.builder import (
_initialize_model,
@@ -39,6 +39,7 @@ def export_for_server(
output_path: str = "model.pt2",
dynamic_shapes: bool = False,
package: bool = True,
metadata: Optional[Dict[str, str]] = None,
Contributor:
Suggested change:
-    metadata: Optional[Dict[str, str]] = None,
+    metadata: Dict[str, str] = {},

We can delete `metadata = metadata or {}`.

Contributor:
General advice is not to use mutable structures as default args, because they survive across invocations =>
https://docs.python-guide.org/writing/gotchas/
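
For context, a minimal hypothetical sketch of the gotcha being referenced (not code from this PR): a mutable default argument is evaluated once, when the function is defined, and is then shared by every call that omits it, which is why the `Optional[...] = None` default plus `metadata or {}` is the usual workaround.

```python
def append_item(item, bucket=[]):
    # The default list is created once, at function definition time,
    # and reused by every call that does not pass `bucket`.
    bucket.append(item)
    return bucket

print(append_item("a"))  # ['a']
print(append_item("b"))  # ['a', 'b'] -- the same list survived the first call


def append_item_safe(item, bucket=None):
    # The None-default pattern (as in this PR's signature) avoids the shared state.
    bucket = [] if bucket is None else bucket
    bucket.append(item)
    return bucket

print(append_item_safe("a"))  # ['a']
print(append_item_safe("b"))  # ['b']
```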

) -> str:
"""
Export the model using AOT Compile to get a .dso for server use cases.
@@ -67,8 +68,10 @@ def export_for_server(
dynamic_shapes = None

with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
metadata = {} # TODO: put more metadata here
options = {"aot_inductor.package": package, "aot_inductor.metadata": metadata}
options = {
"aot_inductor.package": package,
"aot_inductor.metadata": metadata or {},
}
if not package:
options = {"aot_inductor.output_path": output_path}

@@ -81,6 +84,7 @@

if package:
from torch._inductor.package import package_aoti

path = package_aoti(output_path, path)

Contributor Author:

Sorry, somehow my editor added a bunch of formatting changes here. Hope it's not too confusing; otherwise I can try to remove them.

Contributor:

No worries, these are good lint fixes.

print(f"The generated packaged model can be found at: {path}")
@@ -102,13 +106,13 @@ def export_for_server(
from typing import Any, Dict, Tuple, Union

import executorch.exir as exir
from executorch.backends.xnnpack._passes.convert_to_linear import (
ConvertToLinearPass,
)

from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
XnnpackDynamicallyQuantizedPartitioner,
)
from executorch.backends.xnnpack._passes.convert_to_linear import (
ConvertToLinearPass,
)
from executorch.exir import EdgeProgramManager, to_edge

from executorch.exir.capture._config import (
@@ -166,18 +170,22 @@ def __init__(self, attention: Attention):

self.wo = attention.wo

max_batch_size, n_heads, max_seq_length, head_dim = (
attention.kv_cache[0].k_cache.shape
)
max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[
0
].k_cache.shape
cache_dtype = attention.kv_cache[0].k_cache.dtype
# The `Attention` module being replaced can have multiple KV caches
# (denoted by `cache_lanes`). Thus we follow the same setup format
# as in `Attention.setup_cache`.
cache_lanes = len(attention.kv_cache)
self.kv_cache = nn.ModuleList([
CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype)
for _ in range(cache_lanes)
])
self.kv_cache = nn.ModuleList(
[
CustomKVCache(
max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype
)
for _ in range(cache_lanes)
]
)

self.n_heads = attention.n_heads
self.head_dim = attention.head_dim
@@ -215,9 +223,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0):
return self.wo(output)

def replace_attention_with_custom_sdpa_attention(module: nn.Module):
from executorch.extension.llm.custom_ops import ( # noqa
sdpa_with_kv_cache,
)
from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa

for name, child in module.named_children():
if isinstance(child, Attention):
@@ -238,7 +244,9 @@ def _to_core_aten(
raise ValueError(
f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}"
)
core_aten_ep = export_for_training(model, example_inputs, dynamic_shapes=dynamic_shapes)
core_aten_ep = export_for_training(
model, example_inputs, dynamic_shapes=dynamic_shapes
)
if verbose:
logging.info(f"Core ATen graph:\n{core_aten_ep.graph}")
return core_aten_ep
@@ -350,7 +358,11 @@ def main(args):

print(f"Using device={builder_args.device}")
set_precision(builder_args.precision)
set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path)
set_backend(
dso=args.output_dso_path,
pte=args.output_pte_path,
aoti_package=args.output_aoti_package_path,
)

builder_args.dso_path = None
builder_args.pte_path = None
@@ -372,6 +384,7 @@ def main(args):

# TODO: clean this up
# This mess is because ET does not support _weight_int4pack_mm right now
tokenizer_args = None
if not builder_args.gguf_path:
# tokenizer needed for quantization so get that here,
try:
@@ -382,9 +395,8 @@

if builder_args.max_seq_length is None:
if (
(output_dso_path is not None or output_aoti_package_path is not None)
and not builder_args.dynamic_shapes
):
output_dso_path is not None or output_aoti_package_path is not None
) and not builder_args.dynamic_shapes:
print("Setting max_seq_length to 300 for DSO export.")
builder_args.max_seq_length = 300
elif output_pte_path is not None:
@@ -397,7 +409,8 @@
quantize,
tokenizer,
max_seq_length=builder_args.max_seq_length,
support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None,
support_tensor_subclass=output_dso_path is None
and output_aoti_package_path is None,
)
model_to_pte = model
model_to_dso = model
@@ -435,7 +448,9 @@ def main(args):
if output_dso_path:
output_dso_path = str(os.path.abspath(output_dso_path))
print(f"Exporting model using AOT Inductor to {output_dso_path}")
print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.")
print(
"WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead."
)
export_for_server(
model_to_dso,
builder_args.device,
@@ -446,11 +461,23 @@

if output_aoti_package_path:
output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))
print(f"Exporting model using AOT Inductor to {output_aoti_package_path}")

if tokenizer_args is None:
tokenizer_type = "0"
elif tokenizer_args.is_sentencepiece:
tokenizer_type = "2" # Corresponding to llama2
else:
tokenizer_type = "3" # Corresponding to llama3

metadata = {"tokenizer_type": tokenizer_type}
print(
"Exporting model using AOT Inductor to " f"{output_aoti_package_path}."
)
export_for_server(
model_to_aoti_package,
builder_args.device,
output_aoti_package_path,
builder_args.dynamic_shapes,
package=True,
metadata=metadata,
)