diff --git a/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip b/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip
index 0414ae753..fb5b5f893 100644
Binary files a/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip and b/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip differ
diff --git a/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip b/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip
index e9de22c3a..d7658f1a1 100644
Binary files a/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip and b/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip differ
diff --git a/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip b/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip
index 8794d7b2c..07c133c83 100644
Binary files a/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip and b/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip differ
diff --git a/main/_modules/torchao/dtypes/affine_quantized_tensor.html b/main/_modules/torchao/dtypes/affine_quantized_tensor.html
index aca58d818..21fd598e9 100644
--- a/main/_modules/torchao/dtypes/affine_quantized_tensor.html
+++ b/main/_modules/torchao/dtypes/affine_quantized_tensor.html
@@ -431,17 +431,17 @@

 aten = torch.ops.aten

 ###############################
-# Base Layout Tensor Subclass #
+# Base Tensor Impl Subclass #
 ###############################
-class AQTLayout(TorchAOBaseTensor):
+class AQTTensorImpl(TorchAOBaseTensor):
     """
-    Base class for the layout tensor for `AffineQuantizedTensor`
+    Base class for the tensor impl for `AffineQuantizedTensor`
     """
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Get the plain (unpacked) Tensor for the layout Tensor
+        """Get the plain (unpacked) Tensor for the tensor impl

         Returns data, scale and zero_point
-        Can be overwritten if other types of AQTLayout Tensor has different numbers of plain tensors
+        Can be overwritten if other types of AQTTensorImpl has different numbers of plain tensors
         """
         pass
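The renamed base class keeps the same two-method contract: `get_plain()` unpacks the stored payload back to `(data, scale, zero_point)`, and `from_plain()` packs those tensors for the layout. A minimal sketch of a class satisfying that contract; the class and its fields are illustrative stand-ins, not torchao code:

```python
from typing import Tuple

import torch

# Illustrative sketch of the AQTTensorImpl contract described in the hunk
# above; MyPlainImpl and its fields are hypothetical, not torchao APIs.
class MyPlainImpl:
    """Stores the quantized payload unpacked, as three plain tensors."""

    def __init__(self, int_data, scale, zero_point, layout_type):
        self.int_data = int_data
        self.scale = scale
        self.zero_point = zero_point
        self.layout_type = layout_type

    def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # A plain impl has no packing to undo, so it returns its tensors as-is.
        return self.int_data, self.scale, self.zero_point

    @classmethod
    def from_plain(cls, int_data, scale, zero_point, layout_type):
        # A packed impl would reorder or compress int_data here.
        return cls(int_data, scale, zero_point, layout_type)
```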

@@ -456,7 +456,7 @@
         zero_point: torch.Tensor,
         layout_type: LayoutType,
     ):
-        """ Construct a Layout from data, scale, zero_point and the layout_type"""
+        """ Construct a TensorImpl from data, scale, zero_point and the layout_type"""
         pass

     def __repr__(self):

@@ -511,7 +511,7 @@
       regardless of the internal representation's type or orientation.

     fields:
-      layout_tensor (AQTLayout): tensor that serves as a general layout storage for the quantized data,
+      tensor_impl (AQTTensorImpl): tensor that serves as a general tensor impl storage for the quantized data,
          e.g. storing plain tensors (int_data, scale, zero_point) or packed formats depending on device
          and operator/kernel
       block_size (Tuple[int, ...]): granularity of quantization, this means the size of the tensor elements that's sharing the same qparam

@@ -531,7 +531,7 @@
     @staticmethod
     def __new__(
         cls,
-        layout_tensor: AQTLayout,
+        tensor_impl: AQTTensorImpl,
         block_size: Tuple[int, ...],
         shape: torch.Size,
         quant_min: Optional[Union[int, float]] = None,

@@ -541,9 +541,9 @@
         strides=None,
     ):
         kwargs = {}
-        kwargs["device"] = layout_tensor.device
+        kwargs["device"] = tensor_impl.device
         kwargs["layout"] = (
-            kwargs.get("layout") if kwargs.get("layout", False) else layout_tensor.layout
+            kwargs.get("layout") if kwargs.get("layout", False) else tensor_impl.layout
         )
         kwargs["dtype"] = dtype
         if strides is not None:

@@ -553,7 +553,7 @@
     def __init__(
         self,
-        layout_tensor: AQTLayout,
+        tensor_impl: AQTTensorImpl,
         block_size: Tuple[int, ...],
         shape: torch.Size,
         quant_min: Optional[Union[int, float]] = None,

@@ -562,7 +562,7 @@
         dtype=None,
         strides=None,
     ):
-        self.layout_tensor = layout_tensor
+        self.tensor_impl = tensor_impl
         self.block_size = block_size
         self.quant_min = quant_min
         self.quant_max = quant_max

@@ -570,12 +570,12 @@
     def __repr__(self):
         return (
-            f"{self.__class__.__name__}(layout_tensor={self.layout_tensor}, block_size={self.block_size}, "
+            f"{self.__class__.__name__}(tensor_impl={self.tensor_impl}, block_size={self.block_size}, "
             f"shape={self.shape}, device={self.device}, dtype={self.dtype}, requires_grad={self.requires_grad})"
         )

     def _quantization_type(self):
-        return f"shape={self.shape}, block_size={self.block_size}, device={self.device}, layout_type={self.layout_type}, layout_tensor_dtype={self.layout_tensor.dtype}, quant_min={self.quant_min}, quant_max={self.quant_max}"
+        return f"shape={self.shape}, block_size={self.block_size}, device={self.device}, layout_type={self.layout_type}, tensor_impl_dtype={self.tensor_impl.dtype}, quant_min={self.quant_min}, quant_max={self.quant_max}"

     def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
         if output_dtype is None:

@@ -583,10 +583,10 @@
         from torchao.dtypes.floatx import FloatxTensorCoreLayoutType
         if isinstance(self.layout_type, FloatxTensorCoreLayoutType):
-            int_data, scale = self.layout_tensor.get_plain()
+            int_data, scale = self.tensor_impl.get_plain()
             return dequantize_affine_floatx(int_data, scale, self.layout_type.ebits, self.layout_type.mbits, output_dtype=output_dtype)
         else:
-            data, scale, zero_point = self.layout_tensor.get_plain()
+            data, scale, zero_point = self.tensor_impl.get_plain()
             dq = dequantize_affine(
                 data,
                 self.block_size,
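`dequantize` unpacks via `get_plain()` and applies the inverse affine map. Stripped of block granularity and zero-point domains, the round trip is just the arithmetic from the class docstring; a hand-rolled per-tensor sketch, not the torchao block-wise primitives:

```python
import torch

# Per-tensor int8 affine round trip: quantized = round(float / scale) + zero_point
# and float ≈ (quantized - zero_point) * scale. Reference math only.
x = torch.randn(4, 8)
scale = x.abs().max() / 127.0
zero_point = 0
q = torch.clamp(torch.round(x / scale) + zero_point, -128, 127).to(torch.int8)
x_hat = (q.to(torch.float32) - zero_point) * scale
print((x - x_hat).abs().max())  # quantization error, bounded by ~scale/2
```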

@@ -612,16 +612,16 @@
         raise QuantizedLinearNotImplementedError("No specialized dispatch found for quantized linear op")

     def __tensor_flatten__(self):
-        return ["layout_tensor"], [self.block_size, self.shape, self.quant_min, self.quant_max, self.zero_point_domain, self.dtype]
+        return ["tensor_impl"], [self.block_size, self.shape, self.quant_min, self.quant_max, self.zero_point_domain, self.dtype]

     @classmethod
     def __tensor_unflatten__(
         cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride
     ):
-        layout_tensor = tensor_data_dict["layout_tensor"]
+        tensor_impl = tensor_data_dict["tensor_impl"]
         block_size, shape, quant_min, quant_max, zero_point_domain, dtype = tensor_attributes
         return cls(
-            layout_tensor,
+            tensor_impl,
             block_size,
             shape if outer_size is None else outer_size,
             quant_min,
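The `__tensor_flatten__`/`__tensor_unflatten__` pair is the traceable-wrapper-subclass protocol: flatten names the inner tensors and returns the static attributes, unflatten rebuilds the wrapper from both. A self-contained toy showing the round trip the compiler performs; the `Wrapper` class is illustrative, not the torchao tensor:

```python
import torch

# Toy wrapper demonstrating the flatten/unflatten round trip relied on by
# torch.compile and serialization; not the AffineQuantizedTensor class.
class Wrapper:
    def __init__(self, inner: torch.Tensor, tag: str):
        self.inner, self.tag = inner, tag

    def __tensor_flatten__(self):
        # inner-tensor attribute names, plus the non-tensor attributes
        return ["inner"], [self.tag]

    @classmethod
    def __tensor_unflatten__(cls, tensor_data_dict, attrs, outer_size, outer_stride):
        return cls(tensor_data_dict["inner"], attrs[0])

w = Wrapper(torch.ones(2), "int8")
names, ctx = w.__tensor_flatten__()
rebuilt = Wrapper.__tensor_unflatten__({n: getattr(w, n) for n in names}, ctx, None, None)
assert rebuilt.tag == "int8" and torch.equal(rebuilt.inner, w.inner)
```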

@@ -669,10 +669,10 @@
         # Note: output will be uint8 tensor for sub byte tensors for now
         data = layout_type.post_process(data)
-        layout_tensor_ctr = get_layout_tensor_constructor(type(layout_type))
-        layout_tensor = layout_tensor_ctr(data, scale, zero_point, layout_type)
+        tensor_impl_ctr = get_tensor_impl_constructor(type(layout_type))
+        tensor_impl = tensor_impl_ctr(data, scale, zero_point, layout_type)
         return cls(
-            layout_tensor,
+            tensor_impl,
             block_size,
             original_shape,
             quant_min,

@@ -704,10 +704,10 @@
         int_data = layout_type.post_process(int_data)
-        layout_tensor_ctr = get_layout_tensor_constructor(type(layout_type))
-        layout_tensor = layout_tensor_ctr(int_data, scale, zero_point, layout_type)
+        tensor_impl_ctr = get_tensor_impl_constructor(type(layout_type))
+        tensor_impl = tensor_impl_ctr(int_data, scale, zero_point, layout_type)
         return cls(
-            layout_tensor,
+            tensor_impl,
             block_size,
             original_shape,
             quant_min,

@@ -790,10 +790,10 @@
         floatx_unpacked = quantize_affine_floatx(input_float, scale, ebits, mbits)
         floatx_packed = layout_type.post_process(floatx_unpacked)
-        layout_tensor_ctr = get_layout_tensor_constructor(type(layout_type))
-        layout_tensor = layout_tensor_ctr(floatx_packed, scale, None, layout_type)
+        tensor_impl_ctr = get_tensor_impl_constructor(type(layout_type))
+        tensor_impl = tensor_impl_ctr(floatx_packed, scale, None, layout_type)
         return cls(
-            layout_tensor,
+            tensor_impl,
             block_size,
             original_shape,
             dtype=input_float.dtype

@@ -801,13 +801,13 @@
     @property
     def layout_type(self) -> LayoutType:
-        return self.layout_tensor.layout_type
+        return self.tensor_impl.layout_type

     def to(self, *args, **kwargs):
         kwargs = self._get_to_kwargs(*args, **kwargs)
         device = kwargs.pop("device")
         return self.__class__(
-            self.layout_tensor.to(device),
+            self.tensor_impl.to(device),
             self.block_size,
             self.shape,
             self.quant_min,

@@ -818,7 +818,7 @@
     def _apply_fn_to_data(self, fn):
         return self.__class__(
-            fn(self.layout_tensor),
+            fn(self.tensor_impl),
             self.block_size,
             self.shape,
             self.quant_min,
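Both `to()` and `_apply_fn_to_data` follow the same wrapper rule: apply the operation to the inner impl, then rebuild the wrapper with the quantization metadata unchanged. User code reaches this path through the normal Module API; a hedged sketch, assuming torchao is installed and a CUDA device is available:

```python
import torch
from torchao.quantization import quantize_, int8_weight_only

# quantize_ swaps the linear weight for an AffineQuantizedTensor; the
# subsequent .to("cuda") flows through the tensor_impl.to(device) path above.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024))
quantize_(model, int8_weight_only())
model = model.to("cuda")  # wrapper rebuilt around a device-moved impl
```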

@@ -844,10 +844,10 @@
 ######################################################
-# LayoutType and Layout Tensor Subclass Registration #
+# LayoutType and TensorImpl Subclass Registration #
 ######################################################

-register_layout_cls = AffineQuantizedTensor.register_layout_cls
-get_layout_tensor_constructor = AffineQuantizedTensor.get_layout_tensor_constructor
+register_layout = AffineQuantizedTensor.register_layout
+get_tensor_impl_constructor = AffineQuantizedTensor.get_tensor_impl_constructor

 @dataclass(frozen=True)
 class SemiSparseLayoutType(LayoutType):
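These two aliases are the extension points used throughout the rest of the file: `@register_layout(SomeLayoutType)` binds an impl class to its layout type, and `get_tensor_impl_constructor` looks that binding up again at construction time. A standalone mimic of the registry pattern; the dictionary and tiny classes below are illustrative, not torchao's implementation:

```python
# Standalone mimic of the registration/lookup pair aliased above; not
# torchao code, just the same dispatch shape.
LAYOUT_TO_IMPL = {}

def register_layout(layout_type_cls):
    """Bind a LayoutType class to the impl class that packs data for it."""
    def decorator(impl_cls):
        LAYOUT_TO_IMPL[layout_type_cls] = impl_cls
        return impl_cls
    return decorator

def get_tensor_impl_constructor(layout_type_cls):
    return LAYOUT_TO_IMPL[layout_type_cls].from_plain

class PlainLayoutType:
    pass

@register_layout(PlainLayoutType)
class PlainImpl:
    def __init__(self, data, scale, zero_point, layout_type):
        self.data, self.scale = data, scale
        self.zero_point, self.layout_type = zero_point, layout_type

    @classmethod
    def from_plain(cls, data, scale, zero_point, layout_type):
        return cls(data, scale, zero_point, layout_type)

ctor = get_tensor_impl_constructor(PlainLayoutType)  # -> PlainImpl.from_plain
```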

@@ -928,10 +928,10 @@
     return w_24.t()

-@register_layout_cls(PlainLayoutType)
-class PlainAQTLayout(AQTLayout):
+@register_layout(PlainLayoutType)
+class PlainAQTTensorImpl(AQTTensorImpl):
     """
-    Layout storage class for plain layout for affine quantized tensor, it stores int_data, scale, zero_point
+    TensorImpl storage class for plain layout for affine quantized tensor, it stores int_data, scale, zero_point
     tensors directly as plain tensors.

     fields:

@@ -1025,12 +1025,12 @@
             )
         elif dim == 1:
             assert len(self.scale.shape) == 1, f"slice dim==1 only works when len(scale.shape) == 1 currently, got: {self.scale.shape}"
-            return PlainAQTLayout(aten.slice.Tensor(self.int_data, dim, start, end, step), self.scale.view(-1), self.zero_point.view(-1), self.layout_type)
+            return PlainAQTTensorImpl(aten.slice.Tensor(self.int_data, dim, start, end, step), self.scale.view(-1), self.zero_point.view(-1), self.layout_type)
         else:
-            raise NotImplementedError(f"PlainAQTLayout dispatch: attempting to run {func}, with dim={dim}, that is not supported")
+            raise NotImplementedError(f"PlainAQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported")

         raise NotImplementedError(
-            f"PlainAQTLayout dispatch: attempting to run {func}, this is not supported"
+            f"PlainAQTTensorImpl dispatch: attempting to run {func}, this is not supported"
         )

     __torch_function__ = torch._C._disabled_torch_function_impl

@@ -1052,10 +1052,10 @@
         assert isinstance(layout_type, PlainLayoutType)
         return cls(int_data, scale, zero_point, layout_type)

-@register_layout_cls(SemiSparseLayoutType)
-class SemiSparseAQTLayout(PlainAQTLayout):
+@register_layout(SemiSparseLayoutType)
+class SemiSparseAQTTensorImpl(PlainAQTTensorImpl):
     """
-    Layout storage class for semi_sparse_cusparselt layout for affine quantized tensor
+    TensorImpl storage class for semi_sparse_cusparselt layout for affine quantized tensor
     """
     @classmethod
     def __torch_dispatch__(cls, func, types, args, kwargs):

@@ -1067,7 +1067,7 @@
             )

         raise NotImplementedError(
-            f"SparseAQTLayout dispatch: attempting to run {func}, this is not supported"
+            f"SparseAQTTensorImpl dispatch: attempting to run {func}, this is not supported"
         )

     def get_plain(self):

@@ -1092,8 +1092,8 @@
         int_data_compressed = torch._cslt_compress(int_data)
         return cls(int_data_compressed, scale, zero_point, layout_type)

-@register_layout_cls(BlockSparseLayoutType)
-class BlockSparseAQTLayout(PlainAQTLayout):
+@register_layout(BlockSparseLayoutType)
+class BlockSparseAQTTensorImpl(PlainAQTTensorImpl):
     bsr_crow_indices: Optional[torch.Tensor]
     bsr_col_indices: Optional[torch.Tensor]
     bsr_values: Optional[torch.Tensor]

@@ -1229,13 +1229,13 @@
         return args[0].bsr_values.shape[0]

     raise NotImplementedError(
-        f"BlockSparseAQTLayout dispatch: attempting to run {func}, this is not supported"
+        f"BlockSparseAQTTensorImpl dispatch: attempting to run {func}, this is not supported"
     )

-@register_layout_cls(MarlinSparseLayoutType)
-class MarlinSparseAQTLayout(AQTLayout):
+@register_layout(MarlinSparseLayoutType)
+class MarlinSparseAQTTensorImpl(AQTTensorImpl):
     """
-    Layout storage class for sparse_marlin_24 layout for affine quantized tensor.
+    TensorImpl storage class for sparse_marlin_24 layout for affine quantized tensor.

     Can be used with 4 bits and 8 bits quantization.

@@ -1302,7 +1302,7 @@
             )

         raise NotImplementedError(
-            f"MarlinSparseAQTLayout dispatch: attempting to run {func}, this is not supported"
+            f"MarlinSparseAQTTensorImpl dispatch: attempting to run {func}, this is not supported"
         )

     def __tensor_flatten__(self):

@@ -1402,10 +1402,10 @@
         return self

-@register_layout_cls(Float8LayoutType)
-class Float8AQTLayout(AQTLayout):
+@register_layout(Float8LayoutType)
+class Float8AQTTensorImpl(AQTTensorImpl):
     """
-    Layout storage class for float8 layout for affine quantized tensor
+    TensorImpl storage class for float8 tensor impl for affine quantized tensor
     """
     float8_data: torch.Tensor
     scale: torch.Tensor

@@ -1492,12 +1492,12 @@
             )
         elif dim == 1:
             assert len(self.scale.shape) == 1, f"slice dim==1 only works when len(scale.shape) == 1 currently, got: {self.scale.shape}"
-            return Float8AQTLayout(aten.slice.Tensor(self.float8_data, dim, start, end, step), self.scale, None, self.layout_type)
+            return Float8AQTTensorImpl(aten.slice.Tensor(self.float8_data, dim, start, end, step), self.scale, None, self.layout_type)
         else:
-            raise NotImplementedError(f"Float8AQTLayout dispatch: attempting to run {func}, with dim={dim}, that is not supported")
+            raise NotImplementedError(f"Float8AQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported")
     else:
         raise NotImplementedError(
-            f"Float8AQTLayout dispatch: attempting to run {func}, this is not supported"
+            f"Float8AQTTensorImpl dispatch: attempting to run {func}, this is not supported"
         )

     __torch_function__ = torch._C._disabled_torch_function_impl

@@ -1516,9 +1516,9 @@
         zero_point: Optional[torch.Tensor],
         layout_type: LayoutType,
     ):
-        """ Main entrypoint for constructing Float8Layout Tensor"""
-        assert _is_float8_type(data.dtype), f"Float8 Layout must be constructed from float8 dtype but got {data.dtype}"
-        assert isinstance(layout_type, Float8LayoutType), f"Float8 Layout must be constructed from Float8LayoutType but got {layout_type}"
+        """ Main entrypoint for constructing Float8TensorImpl"""
+        assert _is_float8_type(data.dtype), f"Float8 TensorImpl must be constructed from float8 dtype but got {data.dtype}"
+        assert isinstance(layout_type, Float8LayoutType), f"Float8 TensorImpl must be constructed from Float8LayoutType but got {layout_type}"
         return cls(data, scale, False, layout_type)

     def __repr__(self):
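`from_plain` here asserts its input is already a float8 tensor, so callers cast before constructing. A small sketch of producing a payload that passes the dtype check; the amax-based scale is an arbitrary illustration, not torchao's scale selection:

```python
import torch

# Cast to float8 so a check like _is_float8_type(data.dtype) above would pass.
x = torch.randn(64, 64)
scale = x.abs().amax() / torch.finfo(torch.float8_e4m3fn).max
data = (x / scale).to(torch.float8_e4m3fn)
assert data.dtype == torch.float8_e4m3fn
x_hat = data.to(torch.float32) * scale  # dequantize for comparison
```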

@@ -1531,10 +1531,10 @@
                 f"layout_type={layout_type})")

-@register_layout_cls(TensorCoreTiledLayoutType)
-class TensorCoreTiledAQTLayout(AQTLayout):
+@register_layout(TensorCoreTiledLayoutType)
+class TensorCoreTiledAQTTensorImpl(AQTTensorImpl):
     """
-    Layout storage class for tensor_core_tiled layout for affine quantized tensor, this is for int4 only,
+    TensorImpl storage class for tensor_core_tiled tensor impl for affine quantized tensor, this is for int4 only,
     it stores the original tensor of dimension [n][k] (int32 dtype) as packed weight of 4-d tensor of
     dimension: [n / 8][k / (inner_k_tiles * 16)][32][inner_k_tiles / 2]

@@ -1610,7 +1610,7 @@
         kwargs = self._get_to_kwargs(*args, **kwargs)
         device = kwargs["device"]
         if not is_device("cuda", device):
-            raise ValueError(f"TensorCoreTiledAQTLayout is only available for cuda device, can't convert to {device}")
+            raise ValueError(f"TensorCoreTiledAQTTensorImpl is only available for cuda device, can't convert to {device}")
         return self.__class__(
             self.packed_weight.to(device),
             self.scale_and_zero.to(device),

@@ -1645,7 +1645,7 @@
             return return_and_correct_aliasing(func, args, kwargs, args[0])

         raise NotImplementedError(
-            f"TensorCoreTiledAQTLayout dispatch: attempting to run {func}, this is not supported"
+            f"TensorCoreTiledAQTTensorImpl dispatch: attempting to run {func}, this is not supported"
         )

     __torch_function__ = torch._C._disabled_torch_function_impl

@@ -1691,14 +1691,14 @@
 def _aqt_is_int8(aqt):
     """Check if an AffineQuantizedTensor is int8 quantized Tensor"""
     return (
-        aqt.layout_tensor.dtype == torch.int8 and
+        aqt.tensor_impl.dtype == torch.int8 and
         (aqt.quant_min is None or aqt.quant_min == -128) and
         (aqt.quant_max is None or aqt.quant_max == 127)
     )

 def _aqt_is_int8_reduced_range(aqt):
     return (
-        aqt.layout_tensor.dtype == torch.int8 and
+        aqt.tensor_impl.dtype == torch.int8 and
         aqt.quant_min == -127 and
         (aqt.quant_max is None or aqt.quant_max == 127)
     )

@@ -1707,7 +1707,7 @@
     """Check if an AffineQuantizedTensor is uint4 quantized Tensor"""
     # TODO: use torch.uint4
     return (
-        aqt.layout_tensor.dtype == torch.int32 and
+        aqt.tensor_impl.dtype == torch.int32 and
         aqt.quant_min == 0 and
         aqt.quant_max == 15
     )

@@ -1744,10 +1744,10 @@
     # value of a float 16, (which results in a value of inf even if multiplying
     # by the other scale would bring it within the expected range)
-    x_vals_int8 = input_tensor.layout_tensor.int_data
-    x_scales = input_tensor.layout_tensor.scale
-    w_vals_int8_t = weight_tensor.layout_tensor.int_data.contiguous().t()
-    w_scales = weight_tensor.layout_tensor.scale
+    x_vals_int8 = input_tensor.tensor_impl.int_data
+    x_scales = input_tensor.tensor_impl.scale
+    w_vals_int8_t = weight_tensor.tensor_impl.int_data.contiguous().t()
+    w_scales = weight_tensor.tensor_impl.scale
     tmp = x_vals_int8.reshape(-1, x_vals_int8.shape[-1])
     y_dot_scaled = int_scaled_matmul(tmp, w_vals_int8_t, x_scales.reshape(-1, 1))
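`int_scaled_matmul` fuses the int8 product with the row-wise activation scales; multiplying by the weight scales afterwards finishes dequantization, since `(x * sx) @ (w * sw).t() == (x @ w.t()) * sx * sw` when the scales broadcast over rows and columns. The same math with stock ops, as a reference rather than the fused kernel:

```python
import torch

# Reference math for dynamic int8 x int8 linear: integer dot product first,
# then per-token and per-channel scales applied to the result.
x_int8 = torch.randint(-128, 128, (4, 16), dtype=torch.int8)
w_int8 = torch.randint(-128, 128, (8, 16), dtype=torch.int8)  # (out, in)
x_scales = torch.rand(4, 1)   # per token
w_scales = torch.rand(1, 8)   # per output channel
y_dot = torch.mm(x_int8.float(), w_int8.float().t())  # exact: sums fit in fp32
y = y_dot * x_scales * w_scales
```

The float cast is safe here because int8-by-int8 dot products over short reduction lengths stay well inside float32's exact integer range.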

@@ -1775,10 +1775,10 @@
     )

 def _linear_int8_act_int8_weight_semi_structured_sparse_impl(input_tensor, weight_tensor, bias):
-    x_vals_int8 = input_tensor.layout_tensor.int_data
-    x_scales = input_tensor.layout_tensor.scale
-    w_vals_int8 = weight_tensor.layout_tensor.int_data
-    w_scales = weight_tensor.layout_tensor.scale
+    x_vals_int8 = input_tensor.tensor_impl.int_data
+    x_scales = input_tensor.tensor_impl.scale
+    w_vals_int8 = weight_tensor.tensor_impl.int_data
+    w_scales = weight_tensor.tensor_impl.scale
     tmp = x_vals_int8.reshape(-1, x_vals_int8.shape[-1])
     # we fuse one of the scalar matrix multiplications (w_scales) into the sparse mm
     y_dot_bf16_w_scales_fused = torch._cslt_sparse_mm(

@@ -1807,10 +1807,10 @@
 def _linear_int8_act_int8_weight_block_sparse_impl(input_tensor, weight_tensor, bias):
-    x_vals_int8 = input_tensor.layout_tensor.int_data
-    x_scales = input_tensor.layout_tensor.scale
-    w_vals = weight_tensor.layout_tensor
-    w_scales = weight_tensor.layout_tensor.scale
+    x_vals_int8 = input_tensor.tensor_impl.int_data
+    x_scales = input_tensor.tensor_impl.scale
+    w_vals = weight_tensor.tensor_impl
+    w_scales = weight_tensor.tensor_impl.scale
     tmp = x_vals_int8.reshape(-1, x_vals_int8.shape[-1])
     tmp_t = tmp.t()

@@ -1836,7 +1836,7 @@
         # input is native bfloat16 tensor
         not is_traceable_wrapper_subclass(input_tensor) and
         input_tensor.dtype == torch.bfloat16 and
-        # weight is uint4, group quantized tensor_core_tiled layout affine quantized tensor
+        # weight is uint4, group quantized tensor_core_tiled tensor impl affine quantized tensor
         isinstance(weight_tensor, AffineQuantizedTensor) and
         _aqt_is_tensor_core_tile_uint4(weight_tensor) and
         weight_tensor.dtype == torch.bfloat16 and

@@ -1858,8 +1858,8 @@
     act_mat = input_tensor
     # weight is packed from padded (out_features, in_features) weight tensor
     # (same dimension requirement as F.linear weight)
-    packed_weight = weight_tensor.layout_tensor.packed_weight
-    scale_and_zero = weight_tensor.layout_tensor.scale_and_zero
+    packed_weight = weight_tensor.tensor_impl.packed_weight
+    scale_and_zero = weight_tensor.tensor_impl.scale_and_zero

     orig_act_size = act_mat.size()
     orig_dtype = act_mat.dtype

@@ -1902,11 +1902,11 @@
 def _linear_fp_act_int8_weight_impl(input_tensor, weight_tensor, bias):
     # TODO: enable cpu and mps efficient path
     # is_cpu and is_mps only, some issue with is_contiguous() currently
-    # return torch.ops.aten._weight_int8pack_mm(input_tensor.contiguous(), w_vals_int8_t, weight_tensor.layout_tensor.scale)
+    # return torch.ops.aten._weight_int8pack_mm(input_tensor.contiguous(), w_vals_int8_t, weight_tensor.tensor_impl.scale)

     # per channel int8 weight only quantizated mm
-    w_vals_int8_t = weight_tensor.layout_tensor.int_data.t()
-    scale = weight_tensor.layout_tensor.scale
+    w_vals_int8_t = weight_tensor.tensor_impl.int_data.t()
+    scale = weight_tensor.tensor_impl.scale
     orig_dtype = input_tensor.dtype
     m = torch.mm(
         input_tensor.reshape(-1, input_tensor.shape[-1]),

@@ -1960,8 +1960,8 @@
         weight.layout_type.ebits,
         weight.layout_type.mbits,
         act_reshaped,
-        weight.layout_tensor.packed_floatx_data,
-        weight.layout_tensor.scale,
+        weight.tensor_impl.packed_floatx_data,
+        weight.tensor_impl.scale,
         splitK=splitK,
     )

@@ -1979,7 +1979,7 @@
         return (
             isinstance(aqt, AffineQuantizedTensor) and
             isinstance(aqt.layout_type, Float8LayoutType)
-            and aqt.layout_tensor.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]
+            and aqt.tensor_impl.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]
             and (aqt.shape == aqt.block_size or _is_rowwise_scaled(aqt))
         )
     return check_aqt(input_tensor) and check_aqt(weight_tensor)

@@ -2004,14 +2004,14 @@
     out_shape = get_out_shape(input_tensor.shape, weight_tensor.shape)

     # Weight tensor preprocessing
-    w_layout = weight_tensor.layout_tensor
-    assert not w_layout.transposed, "Weight tensor must be contiguous"
-    w_data = w_layout.float8_data
-    w_scale = w_layout.scale
+    w_tensor_impl = weight_tensor.tensor_impl
+    assert not w_tensor_impl.transposed, "Weight tensor must be contiguous"
+    w_data = w_tensor_impl.float8_data
+    w_scale = w_tensor_impl.scale

     # Input tensor preprocessing
-    inpt_data = input_tensor.layout_tensor.float8_data
-    input_scale = input_tensor.layout_tensor.scale
+    inpt_data = input_tensor.tensor_impl.float8_data
+    input_scale = input_tensor.tensor_impl.scale

     # Handle case where input tensor is more than 2D
     inpt_data = inpt_data.reshape(-1, inpt_data.shape[-1])
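The reshape on `inpt_data` is the usual fold-leading-dims trick: collapse every batch dimension into rows, run a 2-D matmul kernel, then restore the original shape. In isolation, with a plain matmul standing in for the float8 kernel:

```python
import torch

# Fold leading dims -> 2-D matmul -> unfold, as done for the fp8 path above.
x = torch.randn(2, 5, 16)   # (batch, seq, in_features)
w_t = torch.randn(16, 8)    # weight transposed to (in, out)
out_shape = x.shape[:-1] + (w_t.shape[-1],)
y = (x.reshape(-1, x.shape[-1]) @ w_t).reshape(out_shape)
assert y.shape == (2, 5, 8)
```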

@@ -2047,7 +2047,7 @@
         # weight is float8 quantized affine quantized tensor
         isinstance(weight_tensor, AffineQuantizedTensor) and
         isinstance(weight_tensor.layout_type, Float8LayoutType)
-        and weight_tensor.layout_tensor.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]
+        and weight_tensor.tensor_impl.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]
         and (weight_tensor.shape == weight_tensor.block_size or _is_rowwise_scaled(weight_tensor))
     )

@@ -2074,11 +2074,11 @@
     assert isinstance(weight_tensor, AffineQuantizedTensor)

-    sparse_w_int4 = weight_tensor.layout_tensor.int_data
-    scale = weight_tensor.layout_tensor.scale
-    meta = weight_tensor.layout_tensor.meta
-    original_shape = weight_tensor.layout_tensor.original_shape
-    num_bits = weight_tensor.layout_tensor.num_bits
+    sparse_w_int4 = weight_tensor.tensor_impl.int_data
+    scale = weight_tensor.tensor_impl.scale
+    meta = weight_tensor.tensor_impl.meta
+    original_shape = weight_tensor.tensor_impl.original_shape
+    num_bits = weight_tensor.tensor_impl.num_bits

     # Folds batch dimension into the first dimension
     input_2d = input_tensor.view(-1, input_tensor.shape[-1])

@@ -2225,7 +2225,7 @@
     tensor = args[0]
     shape = tensor.shape[::-1]
     new = tensor.__class__(
-        tensor.layout_tensor.t(), transposed_block_size, shape, tensor.quant_min, tensor.quant_max, tensor.zero_point_domain, dtype=tensor.dtype, strides=tensor.stride()
+        tensor.tensor_impl.t(), transposed_block_size, shape, tensor.quant_min, tensor.quant_max, tensor.zero_point_domain, dtype=tensor.dtype, strides=tensor.stride()
     )
     return return_and_correct_aliasing(func, args, kwargs, new)

@@ -2243,7 +2243,7 @@
     # with slice, some shape dimension might be smaller than block_size dimension, so
     # we need to make sure there is no overflow
     block_size = (min(shape[0], block_size[0]), min(shape[1], block_size[1]))
-    new = self.__class__(aten.slice.Tensor(self.layout_tensor, dim, start, end, step), block_size, shape, self.quant_min, self.quant_max, self.zero_point_domain, dtype=self.dtype, strides=self.stride())
+    new = self.__class__(aten.slice.Tensor(self.tensor_impl, dim, start, end, step), block_size, shape, self.quant_min, self.quant_max, self.zero_point_domain, dtype=self.dtype, strides=self.stride())
     return return_and_correct_aliasing(func, args, kwargs, new)

 # this is needed for DTensor.from_local() and for flattening tensor

@@ -2252,12 +2252,12 @@
     self, shape = args

     if tuple(self.shape) == tuple(shape):
-        return self.__class__(self.layout_tensor, self.block_size, self.shape, self.quant_min, self.quant_max, self.zero_point_domain, dtype=self.dtype, strides=self.stride())
+        return self.__class__(self.tensor_impl, self.block_size, self.shape, self.quant_min, self.quant_max, self.zero_point_domain, dtype=self.dtype, strides=self.stride())

     if len(shape) == 1 and shape[0] == -1:
         assert len(self.block_size) == 2 and self.block_size[0] == 1
         block_size = (self.block_size[1],)
-        return self.__class__(self.layout_tensor, block_size, (self.numel(),), self.quant_min, self.quant_max, self.zero_point_domain, dtype=self.dtype, strides=self.stride())
+        return self.__class__(self.tensor_impl, block_size, (self.numel(),), self.quant_min, self.quant_max, self.zero_point_domain, dtype=self.dtype, strides=self.stride())

     raise ValueError(f"{self.__class__.__name__} only supports .view() with same shape or shape=[-1]")

diff --git a/main/_modules/torchao/quantization/quant_api.html b/main/_modules/torchao/quantization/quant_api.html
index 7b1a27f86..397e00468 100644
--- a/main/_modules/torchao/quantization/quant_api.html
+++ b/main/_modules/torchao/quantization/quant_api.html
@@ -1266,7 +1266,7 @@
     e.g. fp6_e3_m2, fp6_e2_m3, ...
     The packing format and kernels are from the fp6-llm paper: https://arxiv.org/abs/2401.14112
     github repo: https://github.com/usyd-fsalab/fp6_llm, now renamed to quant-llm
-    For more details for packing please see: :class:`~torchao.dtypes.fpx.FpxTensorCoreAQTLayout`
+    For more details for packing please see: :class:`~torchao.dtypes.fpx.FpxTensorCoreAQTTensorImpl`

     This is experimental, will be merged with `to_affine_quantized_floatx` in the future

diff --git a/main/_sources/tutorials/template_tutorial.rst.txt b/main/_sources/tutorials/template_tutorial.rst.txt
index fe58a2570..87a9c7808 100644
--- a/main/_sources/tutorials/template_tutorial.rst.txt
+++ b/main/_sources/tutorials/template_tutorial.rst.txt
@@ -66,11 +66,11 @@ Example code (the output below is generated automatically):

 .. code-block:: none

-    tensor([[0.8493, 0.0526, 0.5841],
-            [0.6383, 0.5932, 0.8083],
-            [0.3087, 0.3515, 0.4735],
-            [0.8996, 0.7762, 0.1826],
-            [0.2607, 0.2312, 0.7631]])
+    tensor([[0.7804, 0.8663, 0.7150],
+            [0.4530, 0.6350, 0.2086],
+            [0.9097, 0.1238, 0.0825],
+            [0.5196, 0.2840, 0.3932],
+            [0.3891, 0.3960, 0.5983]])
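The only change to the tutorial page is the regenerated `torch.rand` output: the tutorial never seeds the RNG, so every docs build prints different values. Seeding would make the printed tensor stable across builds:

```python
import torch

# Unseeded torch.rand differs on every docs build, hence the churn above;
# a fixed seed makes the printed output reproducible.
torch.manual_seed(0)
print(torch.rand(5, 3))
```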

diff --git a/main/generated/torchao.dtypes.AffineQuantizedTensor.html b/main/generated/torchao.dtypes.AffineQuantizedTensor.html
index 64b401016..2129c6bbc 100644
--- a/main/generated/torchao.dtypes.AffineQuantizedTensor.html
+++ b/main/generated/torchao.dtypes.AffineQuantizedTensor.html
@@ -390,7 +390,7 @@
-class torchao.dtypes.AffineQuantizedTensor(layout_tensor: AQTLayout, block_size: Tuple[int, ...], shape: Size, quant_min: Optional[Union[int, float]] = None, quant_max: Optional[Union[int, float]] = None, zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT, dtype=None, strides=None)[source]
+class torchao.dtypes.AffineQuantizedTensor(tensor_impl: AQTTensorImpl, block_size: Tuple[int, ...], shape: Size, quant_min: Optional[Union[int, float]] = None, quant_max: Optional[Union[int, float]] = None, zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT, dtype=None, strides=None)[source]
 Affine quantized tensor subclass. Affine quantization means we quantize the floating point tensor with an affine transformation:

     quantized_tensor = float_tensor / scale + zero_point
@@ -402,7 +402,7 @@
 fields:
-    layout_tensor (AQTLayout): tensor that serves as a general layout storage for the quantized data,
-        e.g. storing plain tensors (int_data, scale, zero_point) or packed formats depending on device
+    tensor_impl (AQTTensorImpl): tensor that serves as a general tensor impl storage for the quantized data,
+        e.g. storing plain tensors (int_data, scale, zero_point) or packed formats depending on device
         and operator/kernel
     block_size (Tuple[int, …]): granularity of quantization, this means the size of the tensor elements that’s sharing the same qparam
         e.g. when size is the same as the input tensor dimension, we are using per tensor quantization
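In user code these tensors come from the quantization API rather than the constructor, and the renamed field is then visible on the resulting weight. A hedged sketch, assuming torchao is installed; the printed values are indicative only:

```python
import torch
from torchao.dtypes import AffineQuantizedTensor
from torchao.quantization import quantize_, int8_weight_only

# Quantize a toy model, then inspect the documented fields; tensor_impl is
# the renamed storage field described above.
m = torch.nn.Sequential(torch.nn.Linear(32, 32))
quantize_(m, int8_weight_only())
w = m[0].weight
assert isinstance(w, AffineQuantizedTensor)
print(w.block_size, w.tensor_impl.dtype)  # e.g. (1, 32) and torch.int8
```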

diff --git a/main/searchindex.js b/main/searchindex.js
index f4f3c13f2..e5cd55d43 100644
--- a/main/searchindex.js
+++ b/main/searchindex.js
@@ -1 +1 @@
[single-line minified Sphinx search index, regenerated: the "layout_tensor"/"aqtlayout" index terms are replaced by "tensor_impl"/"aqttensorimpl", and the template tutorial's printed tensor values are updated to match the diffs above]
"second": 36, "ipynb": 36}, "objects": {"torchao.dtypes": [[6, 0, 1, "", "AffineQuantizedTensor"], [7, 2, 1, "", "to_affine_quantized_floatx"], [8, 2, 1, "", "to_affine_quantized_floatx_static"], [9, 2, 1, "", "to_affine_quantized_intx"], [10, 2, 1, "", "to_affine_quantized_intx_static"], [11, 2, 1, "", "to_nf4"]], "torchao.dtypes.AffineQuantizedTensor": [[6, 1, 1, "", "dequantize"], [6, 1, 1, "", "to"]], "torchao.quantization": [[12, 0, 1, "", "Int4WeightOnlyGPTQQuantizer"], [13, 0, 1, "", "Int4WeightOnlyQuantizer"], [14, 0, 1, "", "SmoothFakeDynQuantMixin"], [15, 0, 1, "", "SmoothFakeDynamicallyQuantizedLinear"], [16, 2, 1, "", "int4_weight_only"], [17, 2, 1, "", "int8_dynamic_activation_int4_weight"], [18, 2, 1, "", "int8_dynamic_activation_int8_weight"], [19, 2, 1, "", "int8_weight_only"], [20, 2, 1, "", "quantize_"], [21, 2, 1, "", "smooth_fq_linear_to_inference"], [22, 2, 1, "", "swap_linear_with_smooth_fq_linear"]], "torchao.quantization.SmoothFakeDynQuantMixin": [[14, 1, 1, "", "set_debug_x_absmax"]], "torchao.quantization.SmoothFakeDynamicallyQuantizedLinear": [[15, 1, 1, "", "forward"], [15, 1, 1, "", "from_float"], [15, 1, 1, "", "set_debug_x_absmax"], [15, 1, 1, "", "to_inference"]], "torchao": [[4, 3, 0, "-", "sparsity"]], "torchao.sparsity": [[23, 0, 1, "", "PerChannelNormObserver"], [24, 0, 1, "", "WandaSparsifier"], [25, 2, 1, "", "apply_fake_sparsity"]], "torchao.sparsity.PerChannelNormObserver": [[23, 1, 1, "", "forward"]], "torchao.sparsity.WandaSparsifier": [[24, 1, 1, "", "prepare"], [24, 1, 1, "", "squash_mask"], [24, 1, 1, "", "update_mask"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"]}, "titleterms": {"torchao": [0, 1, 2, 3, 4, 27], "dtype": [0, 5], "api": [1, 27], "refer": [1, 27], "python": 1, "kernel": [2, 29], "quantiz": [3, 20, 30], "sparsiti": [4, 33], "affinequantizedtensor": 6, "to_affine_quantized_floatx": 7, "to_affine_quantized_floatx_stat": 8, "to_affine_quantized_intx": 9, "to_affine_quantized_intx_stat": 10, "to_nf4": 11, "int4weightonlygptqquant": 12, "int4weightonlyquant": 13, "smoothfakedynquantmixin": 14, "smoothfakedynamicallyquantizedlinear": 15, "int4_weight_onli": 16, "int8_dynamic_activation_int4_weight": 17, "int8_dynamic_activation_int8_weight": 18, "int8_weight_onli": 19, "smooth_fq_linear_to_infer": 21, "swap_linear_with_smooth_fq_linear": 22, "perchannelnormobserv": 23, "wandasparsifi": 24, "apply_fake_spars": 25, "get": 26, "start": 26, "welcom": 27, "document": 27, "overview": [28, 36], "perform": 29, "serial": 31, "deseri": 31, "flow": 31, "what": 31, "happen": 31, "when": 31, "an": 31, "optim": 31, "model": 31, "comput": [32, 35], "time": [32, 35], "templat": 36, "tutori": 36, "step": 36, "option": 36, "addit": 36, "exercis": 36, "conclus": 36, "further": 36, "read": 36}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1, "sphinx": 56}}) \ No newline at end of file diff --git a/main/tutorials/template_tutorial.html b/main/tutorials/template_tutorial.html index 73bb5d6c7..95941d51c 100644 --- 
a/main/tutorials/template_tutorial.html +++ b/main/tutorials/template_tutorial.html @@ -413,11 +413,11 @@

Steps

print(x)

-tensor([[0.8493, 0.0526, 0.5841],
-        [0.6383, 0.5932, 0.8083],
-        [0.3087, 0.3515, 0.4735],
-        [0.8996, 0.7762, 0.1826],
-        [0.2607, 0.2312, 0.7631]])
+tensor([[0.7804, 0.8663, 0.7150],
+        [0.4530, 0.6350, 0.2086],
+        [0.9097, 0.1238, 0.0825],
+        [0.5196, 0.2840, 0.3932],
+        [0.3891, 0.3960, 0.5983]])
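
Note on the hunk above: the changed values are expected churn, not a content fix. The template tutorial produces this output with an unseeded torch.rand call, so every documentation build prints a fresh 5x3 random tensor and re-diffs the rendered HTML. A minimal sketch of how the output could be made reproducible, assuming byte-stable builds were the goal (the seed value 0 is an arbitrary illustration, not something the tutorial currently sets):

    import torch

    # Pin the RNG state so repeated doc builds print identical values
    # and the rendered tutorial HTML stops changing between builds.
    torch.manual_seed(0)

    x = torch.rand(5, 3)  # same 5x3 shape as the output shown in the hunk
    print(x)              # deterministic: identical tensor on every run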