Add arm neon guards

Differential Revision: D63347224 Pull Request resolved: #980
pytorch · Sep 30, 2024 · 60ffb86 · 60ffb86
1 parent b983f7d
commit 60ffb86
Show file tree

Hide file tree

Showing 24 changed files with 111 additions and 1 deletion.
diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <benchmark/benchmark.h>
 
@@ -796,3 +798,5 @@ BENCHMARK(benchmark_unpack_uint5_values)->ArgsProduct({{128}, {8, 64, 128}});
 
 // Run the benchmark
 BENCHMARK_MAIN();
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <benchmark/benchmark.h>
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -39,3 +41,5 @@ BENCHMARK(benchmark_quantize)
 
 // Run the benchmark
 BENCHMARK_MAIN();
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h>
@@ -503,3 +506,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_lowbit_values(
 
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h
@@ -5,6 +5,8 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -140,3 +142,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint1_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -130,3 +133,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_64_uint2_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -329,3 +332,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint3_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -68,3 +71,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_32_uint4_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -353,3 +356,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint5_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h b/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
@@ -363,3 +366,5 @@ void torchao::kernels::cpu::aarch64::linear::
                   clamp_min,
                   clamp_max);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -489,3 +492,5 @@ void torchao::kernels::cpu::aarch64::linear::
                   clamp_min,
                   clamp_max);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -575,3 +578,5 @@ void torchao::kernels::cpu::aarch64::linear::
                   clamp_min,
                   clamp_max);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...els/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h b/...els/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
@@ -119,3 +122,5 @@ void prepare_activation_data_impl(
 } // namespace
   // channelwise_8bit_activation_prepare_activation_data_1xk_f32::internal
 } // namespace torchao::kernels::cpu::aarch64::linear
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 
 namespace torchao::kernels::cpu::aarch64::linear {
@@ -164,3 +167,5 @@ void kernel(
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h>
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <algorithm>
 #include <cassert>
@@ -111,3 +113,5 @@ void torchao::kernels::cpu::aarch64::quantization::quantize(
     vst1_s8(qvals + i, vec_qval_s8_01);
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h
@@ -5,6 +5,8 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 
 // These methods are here temporarily
@@ -53,3 +55,5 @@ void quantize(
 } // namespace cpu
 } // namespace kernels
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
 
@@ -25,3 +27,5 @@ int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum(
   }
   return res;
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
 
@@ -44,3 +46,5 @@ void torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
     i += 1;
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h
@@ -5,6 +5,8 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #include <utility>
 
@@ -26,3 +28,5 @@ int32_t compute_sum(const int8_t* vals, int size);
 } // namespace cpu
 } // namespace kernels
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
@@ -662,3 +664,5 @@ TEST_BITPACKING_128_LOWBIT_VALUES(2);
 TEST_BITPACKING_128_LOWBIT_VALUES(3);
 TEST_BITPACKING_128_LOWBIT_VALUES(4);
 TEST_BITPACKING_128_LOWBIT_VALUES(5);
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
@@ -347,3 +349,5 @@ TEST(
         /*m=*/7, /*k=*/64, /*n=*/n, /*group_size=*/16);
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
@@ -68,3 +70,5 @@ TEST(test_quantize, ExpectedOutput) {
     }
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -58,3 +60,5 @@ TEST(test_compute_sum, SizeSmallerThan16) {
   int expected_sum = std::accumulate(vals.begin(), vals.end(), 0);
   EXPECT_EQ(sum, expected_sum);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
@@ -272,3 +275,5 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case {
 };
 
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
@@ -5,7 +5,11 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <torchao/experimental/kernels/cpu/aarch64/linear/linear.h>
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h>
 #include <optional>
 #include <vector>
@@ -32,9 +36,11 @@ using RuntimeContext = torch::executor::KernelRuntimeContext;
 namespace {
 
 template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
-inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_config() {
+inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig
+get_ukernel_config() {
   torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig config;
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
   namespace ukernel = torchao::kernels::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
   config.mr = 1;
@@ -51,6 +57,7 @@ inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_conf
       &ukernel::prepare_weight_data<weight_nbit, has_weight_zeros>;
   config.kernel_fn =
       &ukernel::kernel<weight_nbit, has_weight_zeros, has_bias, has_clamp>;
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
 
   return config;
 }