From 60ffb86f28f69c4d485c88cb93ce2b5b6f9f39ea Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:19:59 -0700 Subject: [PATCH] Add arm neon guards Differential Revision: D63347224 Pull Request resolved: https://github.com/pytorch/ao/pull/980 --- .../cpu/aarch64/benchmarks/benchmark_bitpacking.cpp | 4 ++++ .../cpu/aarch64/benchmarks/benchmark_quantization.cpp | 4 ++++ .../kernels/cpu/aarch64/bitpacking/bitpack.h | 5 +++++ .../experimental/kernels/cpu/aarch64/bitpacking/uint1.h | 4 ++++ .../experimental/kernels/cpu/aarch64/bitpacking/uint2.h | 5 +++++ .../experimental/kernels/cpu/aarch64/bitpacking/uint3.h | 5 +++++ .../experimental/kernels/cpu/aarch64/bitpacking/uint4.h | 5 +++++ .../experimental/kernels/cpu/aarch64/bitpacking/uint5.h | 5 +++++ ...ion_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h | 5 +++++ ...ion_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h | 5 +++++ ...ion_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h | 5 +++++ ...bit_activation_prepare_activation_data_1xk_f32-impl.h | 5 +++++ torchao/experimental/kernels/cpu/aarch64/linear/linear.h | 5 +++++ .../kernels/cpu/aarch64/quantization/quantize.cpp | 4 ++++ .../kernels/cpu/aarch64/quantization/quantize.h | 4 ++++ .../kernels/cpu/aarch64/reduction/compute_sum.cpp | 4 ++++ .../kernels/cpu/aarch64/reduction/find_min_and_max.cpp | 4 ++++ .../kernels/cpu/aarch64/reduction/reduction.h | 4 ++++ .../kernels/cpu/aarch64/tests/test_bitpacking.cpp | 4 ++++ .../kernels/cpu/aarch64/tests/test_linear.cpp | 4 ++++ .../kernels/cpu/aarch64/tests/test_quantization.cpp | 4 ++++ .../kernels/cpu/aarch64/tests/test_reduction.cpp | 4 ++++ .../experimental/kernels/cpu/aarch64/tests/test_utils.h | 5 +++++ .../op_linear_8bit_act_xbit_weight-impl.h | 9 ++++++++- 24 files changed, 111 insertions(+), 1 deletion(-) diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp index 16096a6c4..178215595 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -796,3 +798,5 @@ BENCHMARK(benchmark_unpack_uint5_values)->ArgsProduct({{128}, {8, 64, 128}}); // Run the benchmark BENCHMARK_MAIN(); + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp index 868f01648..7c81b963d 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -39,3 +41,5 @@ BENCHMARK(benchmark_quantize) // Run the benchmark BENCHMARK_MAIN(); + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h index ae5a716a5..7029b7e49 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -503,3 +506,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_lowbit_values( } // namespace bitpacking } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h index 0a16c7398..78d0f76e8 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h @@ -5,6 +5,8 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) #include #include @@ -140,3 +142,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint1_values( } // namespace internal } // namespace bitpacking } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h index 985dfd9a7..d036c6ebc 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -130,3 +133,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_64_uint2_values( } // namespace internal } // namespace bitpacking } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h index b76b146ba..41cc1d0b1 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -329,3 +332,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint3_values( } // namespace internal } // namespace bitpacking } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h index d4d3f391f..3b1352d91 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -68,3 +71,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_32_uint4_values( } // namespace internal } // namespace bitpacking } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h index 0e8e101ea..0eceb56b7 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -353,3 +356,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint5_values( } // namespace internal } // namespace bitpacking } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h index 19d4fe5bd..73c3fa500 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -363,3 +366,5 @@ void torchao::kernels::cpu::aarch64::linear:: clamp_min, clamp_max); } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h index 2fcd8d131..a97013580 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -489,3 +492,5 @@ void torchao::kernels::cpu::aarch64::linear:: clamp_min, clamp_max); } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h index 4974e909d..d1d904219 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -575,3 +578,5 @@ void torchao::kernels::cpu::aarch64::linear:: clamp_min, clamp_max); } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h index a67e2b0d1..b32b33e58 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -119,3 +122,5 @@ void prepare_activation_data_impl( } // namespace // channelwise_8bit_activation_prepare_activation_data_1xk_f32::internal } // namespace torchao::kernels::cpu::aarch64::linear + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h index cf3af21b5..447e42b9f 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include namespace torchao::kernels::cpu::aarch64::linear { @@ -164,3 +167,5 @@ void kernel( #include #include #include + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp index 523fd9360..65416fdf1 100644 --- a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -111,3 +113,5 @@ void torchao::kernels::cpu::aarch64::quantization::quantize( vst1_s8(qvals + i, vec_qval_s8_01); } } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h index a8214cc44..777bdc6e2 100644 --- a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h +++ b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h @@ -5,6 +5,8 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) #include // These methods are here temporarily @@ -53,3 +55,5 @@ void quantize( } // namespace cpu } // namespace kernels } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp index 3aa7f4a5d..3a41307cb 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -25,3 +27,5 @@ int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum( } return res; } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp index 1516f3cef..89707eb0a 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include @@ -44,3 +46,5 @@ void torchao::kernels::cpu::aarch64::reduction::find_min_and_max( i += 1; } } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h index f027c8530..53e6ad5c4 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h @@ -5,6 +5,8 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) #include #include @@ -26,3 +28,5 @@ int32_t compute_sum(const int8_t* vals, int size); } // namespace cpu } // namespace kernels } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp index 581c3b3e3..92dceb16e 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -662,3 +664,5 @@ TEST_BITPACKING_128_LOWBIT_VALUES(2); TEST_BITPACKING_128_LOWBIT_VALUES(3); TEST_BITPACKING_128_LOWBIT_VALUES(4); TEST_BITPACKING_128_LOWBIT_VALUES(5); + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp index 22a2ed0f8..47902be72 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -347,3 +349,5 @@ TEST( /*m=*/7, /*k=*/64, /*n=*/n, /*group_size=*/16); } } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp index 74fc5ef52..bb19528de 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -68,3 +70,5 @@ TEST(test_quantize, ExpectedOutput) { } } } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp index 16eb87fbb..0720f2dcf 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp @@ -4,6 +4,8 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -58,3 +60,5 @@ TEST(test_compute_sum, SizeSmallerThan16) { int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); EXPECT_EQ(sum, expected_sum); } + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h index b9b03c777..c3dc431c0 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h @@ -5,6 +5,9 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -272,3 +275,5 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case { }; } // namespace torchao + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h index b40e53a59..51a02d264 100644 --- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h +++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h @@ -5,7 +5,11 @@ // LICENSE file in the root directory of this source tree. #pragma once + +#if defined(__aarch64__) || defined(__ARM_NEON) #include +#endif // defined(__aarch64__) || defined(__ARM_NEON) + #include #include #include @@ -32,9 +36,11 @@ using RuntimeContext = torch::executor::KernelRuntimeContext; namespace { template -inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_config() { +inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig +get_ukernel_config() { torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig config; +#if defined(__aarch64__) || defined(__ARM_NEON) namespace ukernel = torchao::kernels::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot; config.mr = 1; @@ -51,6 +57,7 @@ inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_conf &ukernel::prepare_weight_data; config.kernel_fn = &ukernel::kernel; +#endif // defined(__aarch64__) || defined(__ARM_NEON) return config; }