From 60ffb86f28f69c4d485c88cb93ce2b5b6f9f39ea Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 30 Sep 2024 11:19:59 -0700
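Subject: [PATCH] Add ARM NEON guards

Wrap the aarch64 kernel sources, headers, tests, and benchmarks in
`#if defined(__aarch64__) || defined(__ARM_NEON)` so their contents are
compiled out when neither macro is defined. In that case
get_ukernel_config() returns a UKernelConfig with none of its fields
assigned by that function.

Differential Revision: D63347224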

Pull Request resolved: https://github.com/pytorch/ao/pull/980
---
 .../cpu/aarch64/benchmarks/benchmark_bitpacking.cpp      | 4 ++++
 .../cpu/aarch64/benchmarks/benchmark_quantization.cpp    | 4 ++++
 .../kernels/cpu/aarch64/bitpacking/bitpack.h             | 5 +++++
 .../experimental/kernels/cpu/aarch64/bitpacking/uint1.h  | 4 ++++
 .../experimental/kernels/cpu/aarch64/bitpacking/uint2.h  | 5 +++++
 .../experimental/kernels/cpu/aarch64/bitpacking/uint3.h  | 5 +++++
 .../experimental/kernels/cpu/aarch64/bitpacking/uint4.h  | 5 +++++
 .../experimental/kernels/cpu/aarch64/bitpacking/uint5.h  | 5 +++++
 ...ion_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h | 5 +++++
 ...ion_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h | 5 +++++
 ...ion_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h | 5 +++++
 ...bit_activation_prepare_activation_data_1xk_f32-impl.h | 5 +++++
 torchao/experimental/kernels/cpu/aarch64/linear/linear.h | 5 +++++
 .../kernels/cpu/aarch64/quantization/quantize.cpp        | 4 ++++
 .../kernels/cpu/aarch64/quantization/quantize.h          | 4 ++++
 .../kernels/cpu/aarch64/reduction/compute_sum.cpp        | 4 ++++
 .../kernels/cpu/aarch64/reduction/find_min_and_max.cpp   | 4 ++++
 .../kernels/cpu/aarch64/reduction/reduction.h            | 4 ++++
 .../kernels/cpu/aarch64/tests/test_bitpacking.cpp        | 4 ++++
 .../kernels/cpu/aarch64/tests/test_linear.cpp            | 4 ++++
 .../kernels/cpu/aarch64/tests/test_quantization.cpp      | 4 ++++
 .../kernels/cpu/aarch64/tests/test_reduction.cpp         | 4 ++++
 .../experimental/kernels/cpu/aarch64/tests/test_utils.h  | 5 +++++
 .../op_linear_8bit_act_xbit_weight-impl.h                | 9 ++++++++-
 24 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp
index 16096a6c4..178215595 100644
--- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <benchmark/benchmark.h>
 
@@ -796,3 +798,5 @@ BENCHMARK(benchmark_unpack_uint5_values)->ArgsProduct({{128}, {8, 64, 128}});
 
 // Run the benchmark
 BENCHMARK_MAIN();
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp
index 868f01648..7c81b963d 100644
--- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <benchmark/benchmark.h>
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -39,3 +41,5 @@ BENCHMARK(benchmark_quantize)
 
 // Run the benchmark
 BENCHMARK_MAIN();
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
index ae5a716a5..7029b7e49 100644
--- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
+++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h>
@@ -503,3 +506,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_lowbit_values(
 
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h
index 0a16c7398..78d0f76e8 100644
--- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h
+++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h
@@ -5,6 +5,8 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -140,3 +142,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint1_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h
index 985dfd9a7..d036c6ebc 100644
--- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h
+++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -130,3 +133,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_64_uint2_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h
index b76b146ba..41cc1d0b1 100644
--- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h
+++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -329,3 +332,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint3_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h
index d4d3f391f..3b1352d91 100644
--- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h
+++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -68,3 +71,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_32_uint4_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h
index 0e8e101ea..0eceb56b7 100644
--- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h
+++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
 
@@ -353,3 +356,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint5_values(
 } // namespace internal
 } // namespace bitpacking
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
index 19d4fe5bd..73c3fa500 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
@@ -363,3 +366,5 @@ void torchao::kernels::cpu::aarch64::linear::
                   clamp_min,
                   clamp_max);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
index 2fcd8d131..a97013580 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -489,3 +492,5 @@ void torchao::kernels::cpu::aarch64::linear::
                   clamp_min,
                   clamp_max);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
index 4974e909d..d1d904219 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -575,3 +578,5 @@ void torchao::kernels::cpu::aarch64::linear::
                   clamp_min,
                   clamp_max);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
index a67e2b0d1..b32b33e58 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
@@ -119,3 +122,5 @@ void prepare_activation_data_impl(
 } // namespace
   // channelwise_8bit_activation_prepare_activation_data_1xk_f32::internal
 } // namespace torchao::kernels::cpu::aarch64::linear
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
index cf3af21b5..447e42b9f 100644
--- a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
+++ b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 
 namespace torchao::kernels::cpu::aarch64::linear {
@@ -164,3 +167,5 @@ void kernel(
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h>
 #include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h>
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
index 523fd9360..65416fdf1 100644
--- a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <algorithm>
 #include <cassert>
@@ -111,3 +113,5 @@ void torchao::kernels::cpu::aarch64::quantization::quantize(
     vst1_s8(qvals + i, vec_qval_s8_01);
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h
index a8214cc44..777bdc6e2 100644
--- a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h
+++ b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h
@@ -5,6 +5,8 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 
 // These methods are here temporarily
@@ -53,3 +55,5 @@ void quantize(
 } // namespace cpu
 } // namespace kernels
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
index 3aa7f4a5d..3a41307cb 100644
--- a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
 
@@ -25,3 +27,5 @@ int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum(
   }
   return res;
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
index 1516f3cef..89707eb0a 100644
--- a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
 
@@ -44,3 +46,5 @@ void torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
     i += 1;
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h
index f027c8530..53e6ad5c4 100644
--- a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h
+++ b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h
@@ -5,6 +5,8 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #include <utility>
 
@@ -26,3 +28,5 @@ int32_t compute_sum(const int8_t* vals, int size);
 } // namespace cpu
 } // namespace kernels
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp
index 581c3b3e3..92dceb16e 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
@@ -662,3 +664,5 @@ TEST_BITPACKING_128_LOWBIT_VALUES(2);
 TEST_BITPACKING_128_LOWBIT_VALUES(3);
 TEST_BITPACKING_128_LOWBIT_VALUES(4);
 TEST_BITPACKING_128_LOWBIT_VALUES(5);
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
index 22a2ed0f8..47902be72 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
@@ -347,3 +349,5 @@ TEST(
         /*m=*/7, /*k=*/64, /*n=*/n, /*group_size=*/16);
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp
index 74fc5ef52..bb19528de 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
@@ -68,3 +70,5 @@ TEST(test_quantize, ExpectedOutput) {
     }
   }
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp
index 16eb87fbb..0720f2dcf 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp
@@ -4,6 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <arm_neon.h>
 #include <gtest/gtest.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
@@ -58,3 +60,5 @@ TEST(test_compute_sum, SizeSmallerThan16) {
   int expected_sum = std::accumulate(vals.begin(), vals.end(), 0);
   EXPECT_EQ(sum, expected_sum);
 }
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
index b9b03c777..c3dc431c0 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
@@ -5,6 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
 #include <cassert>
@@ -272,3 +275,5 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case {
 };
 
 } // namespace torchao
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
index b40e53a59..51a02d264 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
@@ -5,7 +5,11 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+
+#if defined(__aarch64__) || defined(__ARM_NEON)
 #include <torchao/experimental/kernels/cpu/aarch64/linear/linear.h>
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
+
 #include <torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h>
 #include <optional>
 #include <vector>
@@ -32,9 +36,11 @@ using RuntimeContext = torch::executor::KernelRuntimeContext;
 namespace {
 
 template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
-inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_config() {
+inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig
+get_ukernel_config() {
   torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig config;
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
   namespace ukernel = torchao::kernels::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
   config.mr = 1;
@@ -51,6 +57,7 @@ inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_conf
       &ukernel::prepare_weight_data<weight_nbit, has_weight_zeros>;
   config.kernel_fn =
       &ukernel::kernel<weight_nbit, has_weight_zeros, has_bias, has_clamp>;
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
 
   return config;
 }