diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index b853b0589..3859d8039 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -1373,6 +1373,60 @@ def forward(self, x):
         after_export = model(x)
         self.assertTrue(torch.equal(after_export, ref))
 
+class TestUtils(unittest.TestCase):
+    @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "autoquant requires 2.3+.")
+    def test_get_model_size_autoquant(self, device, dtype):
+        if device != "cuda" and dtype != torch.bfloat16:
+            self.skipTest(f"autoquant currently does not support {device}")
+        if device != "cuda" or not torch.cuda.is_available():
+            self.skipTest(f"autoquant currently does not support {device}")
+        if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
+            if dtype == torch.bfloat16:
+                self.skipTest(f"bfloat16 requires sm80+")
+        m, k, n = 16, 128, 128
+        model = torch.nn.Sequential(
+            torch.nn.ReLU(),
+            torch.nn.Linear(k,n),
+            torch.nn.ReLU(),
+        ).to(device).to(dtype)
+        example_input = torch.randn(m, k, device=device, dtype=dtype)
+        size = torchao.utils.get_model_size_in_bytes(model)
+
+        from torchao.quantization.autoquant import (
+            AQWeightOnlyQuantizedLinearWeight2,
+        )
+        qtensor_class_list = (
+            AQWeightOnlyQuantizedLinearWeight2,
+
+        )
+
+        mod = torchao.autoquant(torch.compile(model), qtensor_class_list = qtensor_class_list)
+        mod(example_input)
+        size2 = torchao.utils.get_model_size_in_bytes(mod)
+        self.assertTrue(size2 < size)
+
+    @parameterized.expand(
+        list(itertools.product(TENSOR_SUBCLASS_APIS, COMMON_DEVICES, COMMON_DTYPES)),
+    )
+    def test_get_model_size_aqt(self, api, test_device, test_dtype):
+        if test_dtype != torch.bfloat16:
+            self.skipTest(f"{api} in {test_dtype} is not supported yet")
+        if test_device != "cuda" or not torch.cuda.is_available():
+            self.skipTest(f"{api} currently does not support {test_device}")
+        k, n = 1024, 1024
+        model = torch.nn.Sequential(
+            torch.nn.ReLU(),
+            torch.nn.Linear(k,n),
+            torch.nn.ReLU(),
+        ).to(test_device).to(test_dtype)
+        size = torchao.utils.get_model_size_in_bytes(model)
+        api(model)
+        size2 = torchao.utils.get_model_size_in_bytes(model)
+        self.assertTrue(size2 < size)
+
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py
index 1f5380a88..ea7200ea6 100644
--- a/torchao/_models/llama/generate.py
+++ b/torchao/_models/llama/generate.py
@@ -13,6 +13,7 @@ import torchao
 import torch._dynamo.config
 import torch._inductor.config
+from torchao.utils import get_model_size_in_bytes
 
 def device_sync(device):
     if "cuda" in device:
@@ -143,21 +144,6 @@ def _load_model(checkpoint_path, device, precision):
 
     return model.eval()
 
-def _get_model_size(model):
-    model_size = 0
-    for name, child in model.named_children():
-        if not isinstance(child, torch.nn.Embedding):
-            for p in itertools.chain(child.parameters(), child.buffers()):
-                # handling for tensor subclasses
-                if isinstance(p, torchao.dtypes.aqt.AffineQuantizedTensor):
-                    layout_tensor = p.layout_tensor
-                    for attr_name in layout_tensor.__tensor_flatten__()[0]:
-                        sub_tensor = getattr(layout_tensor, attr_name)
-                        model_size += sub_tensor.numel() * sub_tensor.element_size()
-                else:
-                    model_size += p.numel() * p.element_size()
-    return model_size
-
 B_INST, E_INST = "[INST]", "[/INST]"
 
 def main(
@@ -226,7 +212,7 @@ def main(
         interactive=False
     )
 
-    model_size = _get_model_size(model) / 1e9
+    model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9
 
     if compile:
         global decode_one_token, prefill
diff --git a/torchao/utils.py b/torchao/utils.py
index 381a30264..991257a9c 100644
--- a/torchao/utils.py
+++ b/torchao/utils.py
@@ -5,6 +5,7 @@ from math import gcd
 from packaging import version
 import torch.nn.utils.parametrize as parametrize
+import itertools
 
 __all__ = [
     "benchmark_model",
@@ -82,14 +83,31 @@ def find_multiple(n: int, *args: Tuple[int]) -> int:
         return n
     return n + k - (n % k)
 
-# https://discuss.pytorch.org/t/finding-model-size/130275
-def get_model_size_in_bytes(model):
-    s = 0
-    for p in model.parameters():
-        s += p.nelement() * p.element_size()
-    for b in model.buffers():
-        s += b.nelement() * b.element_size()
-    return s
+def get_model_size_in_bytes(model, ignore_embeddings=False):
+    """
+    Returns the model size in bytes. The option to ignore embeddings
+    is useful for models with disproportionately large embeddings compared
+    to other model parameters that get quantized/sparsified.
+    """
+    def flat_size(tensor):
+        if hasattr(tensor, "__tensor_flatten__"):
+            size = 0
+            # 0th element is a list of attributes that
+            # hold tensors
+            for attr_name in tensor.__tensor_flatten__()[0]:
+                sub_tensor = getattr(tensor, attr_name)
+                size += flat_size(sub_tensor)
+            return size
+        else:
+            return tensor.numel() * tensor.element_size()
+
+    model_size = 0
+    for name, child in model.named_children():
+        if not (isinstance(child, torch.nn.Embedding) and ignore_embeddings):
+            for p in itertools.chain(child.parameters(recurse=False), child.buffers(recurse=False)):
+                model_size += flat_size(p)
+        model_size += get_model_size_in_bytes(child, ignore_embeddings)
+    return model_size
 
 class UnwrapTensorSubclass(torch.nn.Module):
     def forward(self, *tensors):
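
A brief usage sketch of the helper this diff moves into torchao/utils.py (not part of the patch itself; the toy model and the printed formatting below are illustrative assumptions, only get_model_size_in_bytes and its ignore_embeddings flag come from the diff):

# Sketch: measure a model with and without its embedding weights counted.
import torch
from torchao.utils import get_model_size_in_bytes

# Toy model whose embedding dominates the parameter count (hypothetical sizes).
model = torch.nn.Sequential(
    torch.nn.Embedding(50_000, 256),  # ~51.2 MB in fp32
    torch.nn.Linear(256, 256),        # ~0.26 MB in fp32
)

full_size = get_model_size_in_bytes(model)
# Skips torch.nn.Embedding children, mirroring how generate.py now reports model size.
no_embedding_size = get_model_size_in_bytes(model, ignore_embeddings=True)

print(f"full: {full_size / 1e6:.1f} MB, without embeddings: {no_embedding_size / 1e6:.2f} MB")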