From 76b6e36e24133050ba4116db02c5dd56c1d475cb Mon Sep 17 00:00:00 2001
From: Keyan Pishdadian
Date: Thu, 10 Oct 2024 12:28:19 -0400
Subject: [PATCH] Change torchao quantization types from int to size_t and
 preface vars with "preferred_"

Differential Revision: D63873383

Pull Request resolved: https://github.com/pytorch/ao/pull/1041
---
 ...se_lowbit_weight_1x1x32_f32_neondot-impl.h |  6 ++---
 ...se_lowbit_weight_1x4x16_f32_neondot-impl.h |  6 ++---
 ...se_lowbit_weight_1x8x16_f32_neondot-impl.h |  6 ++---
 ...ion_prepare_activation_data_1xk_f32-impl.h |  2 +-
 .../kernels/cpu/aarch64/linear/linear.h       | 13 ++++++-----
 .../benchmark_linear_8bit_act_xbit_weight.cpp | 16 ++++++-------
 .../Linear8BitActXBitWeightOperator.h         | 14 +++++------
 .../examples/separate_function_wrappers.cpp   | 12 +++++-----
 .../examples/stateful_class_wrapper.cpp      |  4 ++--
 .../linear_8bit_act_xbit_weight.cpp           | 12 +++++-----
 .../linear_8bit_act_xbit_weight.h             | 23 ++++++++++---------
 .../op_linear_8bit_act_xbit_weight-impl.h     |  8 +++----
 .../test_linear_8bit_act_xbit_weight.cpp      | 12 +++++-----
 13 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
index 73c3fa500..b870725ee 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
@@ -181,7 +181,7 @@ void kernel_impl(
 // The groupi_zero is only present if has_weight_zeros = true.
 // Returns number of bytes required for weight_data
-int inline weight_data_size_impl(
+size_t inline weight_data_size_impl(
     int n,
     int k,
     int group_size,
@@ -270,7 +270,7 @@ void prepare_weight_data_impl(
 // Activation functions
 template
-int torchao::kernels::cpu::aarch64::linear::
+size_t torchao::kernels::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot::
         activation_data_size(int m, int k, int group_size) {
   return torchao::kernels::cpu::aarch64::linear::
@@ -297,7 +297,7 @@ void torchao::kernels::cpu::aarch64::linear::
 // Weight functions
 template
-int torchao::kernels::cpu::aarch64::linear::
+size_t torchao::kernels::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot::
         weight_data_size(int n, int k, int group_size) {
   return torchao::kernels::cpu::aarch64::linear::
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
index a97013580..15417e026 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
@@ -248,7 +248,7 @@ void kernel_impl(
 // Prepares weight data for kernel_impl.
 // Returns number of bytes required for weight_data
-int inline weight_data_size_impl(
+size_t inline weight_data_size_impl(
     int n,
     int k,
     int group_size,
@@ -397,7 +397,7 @@ void prepare_weight_data_impl(
 // Activation functions
 template
-int torchao::kernels::cpu::aarch64::linear::
+size_t torchao::kernels::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot::
         activation_data_size(int m, int k, int group_size) {
   return torchao::kernels::cpu::aarch64::linear::
@@ -424,7 +424,7 @@ void torchao::kernels::cpu::aarch64::linear::
 // Weight functions
 template
-int torchao::kernels::cpu::aarch64::linear::
+size_t torchao::kernels::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot::
         weight_data_size(int n, int k, int group_size) {
   return torchao::kernels::cpu::aarch64::linear::
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
index d1d904219..a6f11a175 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
@@ -333,7 +333,7 @@ void kernel_impl(
 // Prepares weight data for kernel_impl.
 // Returns number of bytes required for weight_data
-int inline weight_data_size_impl(
+size_t inline weight_data_size_impl(
     int n,
     int k,
     int group_size,
@@ -483,7 +483,7 @@ void prepare_weight_data_impl(
 // Activation functions
 template
-int torchao::kernels::cpu::aarch64::linear::
+size_t torchao::kernels::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot::
         activation_data_size(int m, int k, int group_size) {
   return torchao::kernels::cpu::aarch64::linear::
@@ -510,7 +510,7 @@ void torchao::kernels::cpu::aarch64::linear::
 // Weight functions
 template
-int torchao::kernels::cpu::aarch64::linear::
+size_t torchao::kernels::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot::
         weight_data_size(int n, int k, int group_size) {
   return torchao::kernels::cpu::aarch64::linear::
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
index b32b33e58..f5def72dc 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
@@ -25,7 +25,7 @@ namespace channelwise_8bit_activation_prepare_activation_data_1xk_f32::
 // The groupi_qvals_sum is only present if has_weight_zeros = true.
 // Returns number of bytes required for activation_data
-int inline activation_data_size_impl(
+size_t inline activation_data_size_impl(
     int m,
     int k,
     // Ignored if has_weight_zeros = false
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
index 447e42b9f..c6843124b 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
@@ -9,13 +9,14 @@
 #if defined(__aarch64__) || defined(__ARM_NEON)
 
 #include <arm_neon.h>
+#include <stddef.h>
 
 namespace torchao::kernels::cpu::aarch64::linear {
 namespace channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot {
 
 template
-int activation_data_size(int m, int k, int group_size);
+size_t activation_data_size(int m, int k, int group_size);
 
 template
 void prepare_activation_data(
@@ -28,7 +29,7 @@ void prepare_activation_data(
     const float* activations);
 
 template
-int weight_data_size(int n, int k, int group_size);
+size_t weight_data_size(int n, int k, int group_size);
 
 template
 void prepare_weight_data(
@@ -65,7 +66,7 @@ void kernel(
 namespace channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot {
 
 template
-int activation_data_size(int m, int k, int group_size);
+size_t activation_data_size(int m, int k, int group_size);
 
 template
 void prepare_activation_data(
@@ -78,7 +79,7 @@ void prepare_activation_data(
     const float* activations);
 
 template
-int weight_data_size(int n, int k, int group_size);
+size_t weight_data_size(int n, int k, int group_size);
 
 template
 void prepare_weight_data(
@@ -115,7 +116,7 @@ void kernel(
 namespace channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot {
 
 template
-int activation_data_size(int m, int k, int group_size);
+size_t activation_data_size(int m, int k, int group_size);
 
 template
 void prepare_activation_data(
@@ -128,7 +129,7 @@ void prepare_activation_data(
     const float* activations);
 
 template
-int weight_data_size(int n, int k, int group_size);
+size_t weight_data_size(int n, int k, int group_size);
 
 template
 void prepare_weight_data(
diff --git a/torchao/experimental/ops/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp b/torchao/experimental/ops/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
index 0edad27cc..2efd42517 100644
--- a/torchao/experimental/ops/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
+++ b/torchao/experimental/ops/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
@@ -24,12 +24,12 @@ UKernelConfig get_ukernel_config() {
   config.nr = 8;
 
   config.activation_data_size_fn = &ukernel::activation_data_size;
-  config.activation_data_alignment = 16; // size of neon register
+  config.preferred_activation_data_alignment = 16; // size of neon register
   config.prepare_activation_data_fn = &ukernel::prepare_activation_data;
 
   config.weight_data_size_fn = &ukernel::weight_data_size;
-  config.weight_data_alignment = 16; // size of neon register
+  config.preferred_weight_data_alignment = 16; // size of neon register
   config.prepare_weight_data_fn = &ukernel::prepare_weight_data;
 
   config.kernel_fn =
@@ -85,13 +85,13 @@ static void linear_8bit_act_xbit_weight(benchmark::State& state) {
   // Pack test case weights
   size_t packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
-  size_t packed_weight_data_alignment =
-      get_packed_weight_data_alignment(ukernel_config);
+  size_t preferred_packed_weight_data_alignment =
+      get_preferred_packed_weight_data_alignment(ukernel_config);
 
   std::vector> packed_weight_data;
   for (int i = 0; i < test_cases.size(); i++) {
     packed_weight_data.emplace_back(torchao::make_aligned_byte_ptr(
-        packed_weight_data_alignment, packed_weight_data_size));
+        preferred_packed_weight_data_alignment, packed_weight_data_size));
     pack_weight_data_operator(
         ukernel_config,
         pack_weight_data_tiling_params,
@@ -112,11 +112,11 @@ static void linear_8bit_act_xbit_weight(benchmark::State& state) {
       m,
       k,
       group_size);
-  size_t activation_data_buffer_alignment =
-      get_activation_data_buffer_alignment(ukernel_config);
+  size_t preferred_activation_data_buffer_alignment =
+      get_preferred_activation_data_buffer_alignment(ukernel_config);
 
   auto activation_data_buffer = torchao::make_aligned_byte_ptr(
-      activation_data_buffer_alignment, activation_data_buffer_size);
+      preferred_activation_data_buffer_alignment, activation_data_buffer_size);
 
   auto output = std::vector<float>(m * n);
   for (auto _ : state) {
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/Linear8BitActXBitWeightOperator.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/Linear8BitActXBitWeightOperator.h
index bba120a4f..7d4e28f44 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/Linear8BitActXBitWeightOperator.h
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/Linear8BitActXBitWeightOperator.h
@@ -17,7 +17,7 @@ class Linear8BitActXBitWeightOperator {
  private:
   torchao::aligned_byte_ptr packed_weight_data_{nullptr, nullptr};
   int packed_weight_data_size_{0};
-  int packed_weight_data_alignment_{0};
+  int preferred_packed_weight_data_alignment_{0};
 
   torchao::aligned_byte_ptr activation_data_buffer_{nullptr, nullptr};
@@ -107,13 +107,13 @@ class Linear8BitActXBitWeightOperator {
     // Pack weight data
     auto packed_weight_data_size =
         get_packed_weight_data_size(ukernel_config_, n_, k_, group_size_);
-    auto packed_weight_data_alignment =
-        get_packed_weight_data_alignment(ukernel_config_);
+    auto preferred_packed_weight_data_alignment =
+        get_preferred_packed_weight_data_alignment(ukernel_config_);
 
     packed_weight_data_size_ = packed_weight_data_size;
-    packed_weight_data_alignment_ = packed_weight_data_alignment;
+    preferred_packed_weight_data_alignment_ = preferred_packed_weight_data_alignment;
     packed_weight_data_ = torchao::make_aligned_byte_ptr(
-        packed_weight_data_alignment, packed_weight_data_size);
+        preferred_packed_weight_data_alignment, packed_weight_data_size);
 
     pack_weight_data_operator(
         ukernel_config_,
@@ -136,7 +136,7 @@
         k_,
         group_size_);
     auto activation_data_buffer_alignment =
-        get_activation_data_buffer_alignment(ukernel_config_);
+        get_preferred_activation_data_buffer_alignment(ukernel_config_);
 
     activation_data_buffer_ = torchao::make_aligned_byte_ptr(
         activation_data_buffer_alignment, activation_data_buffer_size);
@@ -168,7 +168,7 @@
         k_,
         group_size_);
     auto activation_data_buffer_alignment =
-        get_activation_data_buffer_alignment(ukernel_config_);
+        get_preferred_activation_data_buffer_alignment(ukernel_config_);
     activation_data_buffer_ = torchao::make_aligned_byte_ptr(
         activation_data_buffer_alignment, activation_data_buffer_size);
   }
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/separate_function_wrappers.cpp b/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/separate_function_wrappers.cpp
index 56233e22c..961c03e98 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/separate_function_wrappers.cpp
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/separate_function_wrappers.cpp
@@ -34,12 +34,12 @@ UKernelConfig get_ukernel_config() {
   config.nr = 8;
 
   config.activation_data_size_fn = &ukernel::activation_data_size;
-  config.activation_data_alignment = 16; // size of neon register
+  config.preferred_activation_data_alignment = 16; // size of neon register
   config.prepare_activation_data_fn = &ukernel::prepare_activation_data;
 
   config.weight_data_size_fn = &ukernel::weight_data_size;
-  config.weight_data_alignment = 16; // size of neon register
+  config.preferred_weight_data_alignment = 16; // size of neon register
   config.prepare_weight_data_fn = &ukernel::prepare_weight_data;
 
   config.kernel_fn =
@@ -67,10 +67,10 @@ torchao::aligned_byte_ptr pack_weight_data_operator(
   auto packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
-  auto packed_weight_data_alignment =
-      get_packed_weight_data_alignment(ukernel_config);
+  auto preferred_packed_weight_data_alignment =
+      get_preferred_packed_weight_data_alignment(ukernel_config);
   auto packed_weight_data = torchao::make_aligned_byte_ptr(
-      packed_weight_data_alignment, packed_weight_data_size);
+      preferred_packed_weight_data_alignment, packed_weight_data_size);
 
   pack_weight_data_operator(
       ukernel_config,
@@ -118,7 +118,7 @@ void linear_operator(
   auto activation_data_buffer_size = get_activation_data_buffer_size(
       ukernel_config, tiling_params_, scheduling_policy_, m, k, group_size);
   auto activation_data_buffer_alignment =
-      get_activation_data_buffer_alignment(ukernel_config);
+      get_preferred_activation_data_buffer_alignment(ukernel_config);
 
   auto activation_data_buffer = torchao::make_aligned_byte_ptr(
       activation_data_buffer_alignment, activation_data_buffer_size);
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/stateful_class_wrapper.cpp b/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/stateful_class_wrapper.cpp
index 647d10d13..a45c32811 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/stateful_class_wrapper.cpp
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/examples/stateful_class_wrapper.cpp
@@ -34,12 +34,12 @@ UKernelConfig get_ukernel_config() {
   config.nr = 8;
 
   config.activation_data_size_fn = &ukernel::activation_data_size;
-  config.activation_data_alignment = 16; // size of neon register
+  config.preferred_activation_data_alignment = 16; // size of neon register
   config.prepare_activation_data_fn = &ukernel::prepare_activation_data;
 
   config.weight_data_size_fn = &ukernel::weight_data_size;
-  config.weight_data_alignment = 16; // size of neon register
+  config.preferred_weight_data_alignment = 16; // size of neon register
   config.prepare_weight_data_fn = &ukernel::prepare_weight_data;
 
   config.kernel_fn =
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp b/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
index f41144485..e2fcbaa2f 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
@@ -117,7 +117,7 @@ LinearTilingParams get_default_linear_tiling_params(
 
 namespace internal {
 
-inline int
+inline size_t
 get_activation_data_buffer_size_with_tile_schedule_policy_single_mc_parallel_nc(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
@@ -128,7 +128,7 @@ get_activation_data_buffer_size_with_tile_schedule_policy_single_mc_parallel_nc(
       tiling_params.mc_by_mr * ukernel_config.mr, k, group_size);
 }
 
-inline int
+inline size_t
 get_activation_data_buffer_size_with_tile_schedule_policy_parallel_mc_parallel_nc(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
@@ -162,7 +162,7 @@ inline void linear_operator_with_tile_schedule_policy_single_mc_parallel_nc(
   int nc = std::min(n, tiling_params.nc_by_nr * ukernel_config.nr);
   int num_mc_panels = (m + mc - 1) / mc;
   int num_nc_panels = (n + nc - 1) / nc;
-  int weight_data_size = ukernel_config.weight_data_size_fn(nr, k, group_size);
+  size_t weight_data_size = ukernel_config.weight_data_size_fn(nr, k, group_size);
 
   for (int mc_tile_idx = 0; mc_tile_idx < num_mc_panels; mc_tile_idx++) {
     int m_idx = mc_tile_idx * mc;
@@ -223,8 +223,8 @@ inline void linear_operator_with_tile_schedule_policy_parallel_mc_parallel_nc(
   int num_mc_panels = (m + mc - 1) / mc;
   int num_nc_panels = (n + nc - 1) / nc;
 
-  int weight_data_size = ukernel_config.weight_data_size_fn(nr, k, group_size);
-  int activation_data_size =
+  size_t weight_data_size = ukernel_config.weight_data_size_fn(nr, k, group_size);
+  size_t activation_data_size =
       ukernel_config.activation_data_size_fn(mr, k, group_size);
 
   torchao::parallel_1d(0, num_mc_panels, [&](int64_t idx) {
@@ -332,7 +332,7 @@ void linear_operator(
   }
 }
 
-int get_activation_data_buffer_size(
+size_t get_activation_data_buffer_size(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
     LinearTileSchedulingPolicy scheduling_policy,
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
index 9788cf8dc..6ec098314 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
@@ -6,18 +6,19 @@
 #pragma once
 
 #include
+#include <stddef.h>
 
 namespace torchao::ops::linear_8bit_act_xbit_weight {
 
 struct UKernelConfig {
-  using activation_data_size_fn_type = int (*)(int m, int k, int group_size);
+  using activation_data_size_fn_type = size_t (*)(int m, int k, int group_size);
   using prepare_activation_data_fn_type = void (*)(
       void* activation_data,
       int m,
       int k,
       int group_size,
       const float* activations);
-  using weight_data_size_fn_type = int (*)(int n, int k, int group_size);
+  using weight_data_size_fn_type = size_t (*)(int n, int k, int group_size);
   using prepare_weight_data_fn_type = void (*)(
       void* weight_data,
       int n,
@@ -40,11 +41,11 @@ struct UKernelConfig {
       float clamp_max);
 
   activation_data_size_fn_type activation_data_size_fn{nullptr};
-  // activation_data_alignment is only a preferred alignment for
+  // preferred_activation_data_alignment is only a preferred alignment for
   // performance reasons. Integration surfaces are not required to
   // respect this alignment, and the ukernel must behave correctly no matter
   // how the prepared_activation_data byte-array is aligned
-  int activation_data_alignment{0};
+  size_t preferred_activation_data_alignment{0};
   prepare_activation_data_fn_type prepare_activation_data_fn{nullptr};
 
   weight_data_size_fn_type weight_data_size_fn{nullptr};
@@ -52,7 +53,7 @@ struct UKernelConfig {
   // performance reasons. Integration surfaces are not required to
   // respect this alignment, and the ukernel must behave correctly no matter
   // how the prepared_weight_data byte-array is aligned
-  int weight_data_alignment{0};
+  size_t preferred_weight_data_alignment{0};
   prepare_weight_data_fn_type prepare_weight_data_fn{nullptr};
 
   kernel_fn_type kernel_fn{nullptr};
@@ -70,7 +71,7 @@ PackWeightDataTilingParams get_default_pack_weight_data_tiling_params(
     int n,
     int target_panels_per_thread = 1);
 
-inline int get_packed_weight_data_size(
+inline size_t get_packed_weight_data_size(
     const UKernelConfig& ukernel_config,
     int n,
     int k,
@@ -78,9 +79,9 @@ inline int get_packed_weight_data_size(
   return ukernel_config.weight_data_size_fn(n, k, group_size);
 }
 
-inline int get_packed_weight_data_alignment(
+inline size_t get_preferred_packed_weight_data_alignment(
     const UKernelConfig& ukernel_config) {
-  return ukernel_config.weight_data_alignment;
+  return ukernel_config.preferred_weight_data_alignment;
 }
 
 void pack_weight_data_operator(
@@ -113,7 +114,7 @@ enum class LinearTileSchedulingPolicy {
   parallel_mc_parallel_nc
 };
 
-int get_activation_data_buffer_size(
+size_t get_activation_data_buffer_size(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
     LinearTileSchedulingPolicy scheduling_policy,
@@ -121,9 +122,9 @@ int get_activation_data_buffer_size(
     int k,
     int group_size);
 
-inline int get_activation_data_buffer_alignment(
+inline size_t get_preferred_activation_data_buffer_alignment(
     const UKernelConfig& ukernel_config) {
-  return ukernel_config.activation_data_alignment;
+  return ukernel_config.preferred_activation_data_alignment;
 }
 
 void linear_operator(
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
index 80772c7c1..ba732d526 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
@@ -47,12 +47,12 @@ get_ukernel_config() {
   config.nr = 8;
 
   config.activation_data_size_fn = &ukernel::activation_data_size;
-  config.activation_data_alignment = 16; // size of neon register
+  config.preferred_activation_data_alignment = 16; // size of neon register
   config.prepare_activation_data_fn = &ukernel::prepare_activation_data;
 
   config.weight_data_size_fn = &ukernel::weight_data_size;
-  config.weight_data_alignment = 16; // size of neon register
+  config.preferred_weight_data_alignment = 16; // size of neon register
   config.prepare_weight_data_fn = &ukernel::prepare_weight_data;
 
   config.kernel_fn =
@@ -116,7 +116,7 @@ Tensor pack_weights_cpu(
   auto packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
-  Tensor packed_weights = torch::empty({packed_weight_data_size}, torch::kInt8);
+  Tensor packed_weights = torch::empty({static_cast<int64_t>(packed_weight_data_size)}, torch::kInt8);
   pack_weight_data_operator(
       ukernel_config,
       pack_weight_tiling_params,
@@ -182,7 +182,7 @@ Tensor pack_weights_meta(
   auto packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
-  return torch::empty({packed_weight_data_size}).to("meta");
+  return torch::empty({static_cast<int64_t>(packed_weight_data_size)}).to("meta");
 }
 
 #endif // USE_ATEN
diff --git a/torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp b/torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp
index 11eb28239..4f6e8cbe5 100644
--- a/torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp
+++ b/torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp
@@ -26,12 +26,12 @@ UKernelConfig get_ukernel_config() {
   config.nr = 8;
 
   config.activation_data_size_fn = &ukernel::activation_data_size;
-  config.activation_data_alignment = 16; // size of neon register
+  config.preferred_activation_data_alignment = 16; // size of neon register
   config.prepare_activation_data_fn = &ukernel::prepare_activation_data;
 
   config.weight_data_size_fn = &ukernel::weight_data_size;
-  config.weight_data_alignment = 16; // size of neon register
+  config.preferred_weight_data_alignment = 16; // size of neon register
   config.prepare_weight_data_fn = &ukernel::prepare_weight_data;
 
   config.kernel_fn =
@@ -70,10 +70,10 @@ void test_linear_8bit_act_xbit_weight(int m, int n, int k, int group_size) {
       get_default_pack_weight_data_tiling_params(ukernel_config, n);
   auto packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
-  auto packed_weight_data_alignment =
-      get_packed_weight_data_alignment(ukernel_config);
+  auto preferred_packed_weight_data_alignment =
+      get_preferred_packed_weight_data_alignment(ukernel_config);
   auto packed_weight_data = torchao::make_aligned_byte_ptr(
-      packed_weight_data_alignment, packed_weight_data_size);
+      preferred_packed_weight_data_alignment, packed_weight_data_size);
 
   pack_weight_data_operator(
       ukernel_config,
@@ -98,7 +98,7 @@ void test_linear_8bit_act_xbit_weight(int m, int n, int k, int group_size) {
       k,
       group_size);
   auto activation_data_buffer_alignment =
-      get_activation_data_buffer_alignment(ukernel_config);
+      get_preferred_activation_data_buffer_alignment(ukernel_config);
   auto activation_data_buffer = torchao::make_aligned_byte_ptr(
       activation_data_buffer_alignment, activation_data_buffer_size);
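
Editor's addendum (not part of the patch): the functions touched here return buffer sizes in bytes, and the int -> size_t change matters because a byte count like n * (k / group_size) * bytes-per-group can exceed INT_MAX even when n, k, and group_size each fit comfortably in an int. The "preferred_" renames follow the same intent as the UKernelConfig comments above: the alignment fields are performance hints that integration surfaces may ignore, not correctness requirements. Below is a minimal standalone sketch of the overflow failure mode; weight_bytes and the 20-bytes-per-packed-group figure are hypothetical illustrations, not the actual torchao packing math.

    #include <stddef.h>
    #include <iostream>

    // Hypothetical stand-in for a weight_data_size-style computation:
    // n rows, k / group_size groups per row, bytes_per_group bytes per
    // packed group. Promoting the first operand to size_t makes the whole
    // product evaluate in size_t rather than in int.
    size_t weight_bytes(int n, int k, int group_size, int bytes_per_group) {
      return static_cast<size_t>(n) * (k / group_size) * bytes_per_group;
    }

    int main() {
      // Each dimension fits in int, but the byte count is
      // 100000 * (100000 / 32) * 20 = 6,250,000,000 > INT_MAX (~2.1e9).
      size_t bytes = weight_bytes(/*n=*/100000, /*k=*/100000,
                                  /*group_size=*/32, /*bytes_per_group=*/20);
      std::cout << "size_t result: " << bytes << "\n";  // 6250000000
      // Narrowing back to int (what an int-returning API would in effect do)
      // silently yields a wrong value on typical 64-bit targets:
      std::cout << "as int:        " << static_cast<int>(bytes) << "\n";
    }

Note that returning size_t alone would not be enough: if the multiplication itself were done in int, the overflow would already have happened before the widening return conversion, which is why the promotion belongs on the first operand of the product.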