Skip to content

Commit

Permalink
Add arm neon guards
Browse files Browse the repository at this point in the history
Differential Revision: D63347224

Pull Request resolved: #980
  • Loading branch information
metascroy authored Sep 30, 2024
1 parent b983f7d commit 60ffb86
Show file tree
Hide file tree
Showing 24 changed files with 111 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <benchmark/benchmark.h>

Expand Down Expand Up @@ -796,3 +798,5 @@ BENCHMARK(benchmark_unpack_uint5_values)->ArgsProduct({{128}, {8, 64, 128}});

// Run the benchmark
BENCHMARK_MAIN();

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <benchmark/benchmark.h>
#include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
Expand Down Expand Up @@ -39,3 +41,5 @@ BENCHMARK(benchmark_quantize)

// Run the benchmark
BENCHMARK_MAIN();

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h>
Expand Down Expand Up @@ -503,3 +506,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_lowbit_values(

} // namespace bitpacking
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
4 changes: 4 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/bitpacking/uint1.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>

Expand Down Expand Up @@ -140,3 +142,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint1_values(
} // namespace internal
} // namespace bitpacking
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/bitpacking/uint2.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>

Expand Down Expand Up @@ -130,3 +133,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_64_uint2_values(
} // namespace internal
} // namespace bitpacking
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>

Expand Down Expand Up @@ -329,3 +332,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint3_values(
} // namespace internal
} // namespace bitpacking
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>

Expand Down Expand Up @@ -68,3 +71,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_32_uint4_values(
} // namespace internal
} // namespace bitpacking
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h>

Expand Down Expand Up @@ -353,3 +356,5 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_uint5_values(
} // namespace internal
} // namespace bitpacking
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
#include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
#include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
Expand Down Expand Up @@ -363,3 +366,5 @@ void torchao::kernels::cpu::aarch64::linear::
clamp_min,
clamp_max);
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
#include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
Expand Down Expand Up @@ -489,3 +492,5 @@ void torchao::kernels::cpu::aarch64::linear::
clamp_min,
clamp_max);
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
#include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h>
#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
Expand Down Expand Up @@ -575,3 +578,5 @@ void torchao::kernels::cpu::aarch64::linear::
clamp_min,
clamp_max);
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
#include <cassert>
Expand Down Expand Up @@ -119,3 +122,5 @@ void prepare_activation_data_impl(
} // namespace
// channelwise_8bit_activation_prepare_activation_data_1xk_f32::internal
} // namespace torchao::kernels::cpu::aarch64::linear

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/linear/linear.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>

namespace torchao::kernels::cpu::aarch64::linear {
Expand Down Expand Up @@ -164,3 +167,5 @@ void kernel(
#include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h>
#include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h>
#include <torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h>

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -111,3 +113,5 @@ void torchao::kernels::cpu::aarch64::quantization::quantize(
vst1_s8(qvals + i, vec_qval_s8_01);
}
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>

// These methods are here temporarily
Expand Down Expand Up @@ -53,3 +55,5 @@ void quantize(
} // namespace cpu
} // namespace kernels
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
#include <cassert>

Expand All @@ -25,3 +27,5 @@ int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum(
}
return res;
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
#include <cassert>

Expand Down Expand Up @@ -44,3 +46,5 @@ void torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
i += 1;
}
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include <utility>

Expand All @@ -26,3 +28,5 @@ int32_t compute_sum(const int8_t* vals, int size);
} // namespace cpu
} // namespace kernels
} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <gtest/gtest.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
Expand Down Expand Up @@ -662,3 +664,5 @@ TEST_BITPACKING_128_LOWBIT_VALUES(2);
TEST_BITPACKING_128_LOWBIT_VALUES(3);
TEST_BITPACKING_128_LOWBIT_VALUES(4);
TEST_BITPACKING_128_LOWBIT_VALUES(5);

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <gtest/gtest.h>
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
Expand Down Expand Up @@ -347,3 +349,5 @@ TEST(
/*m=*/7, /*k=*/64, /*n=*/n, /*group_size=*/16);
}
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <gtest/gtest.h>
#include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
Expand Down Expand Up @@ -68,3 +70,5 @@ TEST(test_quantize, ExpectedOutput) {
}
}
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <arm_neon.h>
#include <gtest/gtest.h>
#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
Expand Down Expand Up @@ -58,3 +60,5 @@ TEST(test_compute_sum, SizeSmallerThan16) {
int expected_sum = std::accumulate(vals.begin(), vals.end(), 0);
EXPECT_EQ(sum, expected_sum);
}

#endif // defined(__aarch64__) || defined(__ARM_NEON)
5 changes: 5 additions & 0 deletions torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h>
#include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
#include <cassert>
Expand Down Expand Up @@ -272,3 +275,5 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case {
};

} // namespace torchao

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(__aarch64__) || defined(__ARM_NEON)
#include <torchao/experimental/kernels/cpu/aarch64/linear/linear.h>
#endif // defined(__aarch64__) || defined(__ARM_NEON)

#include <torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h>
#include <optional>
#include <vector>
Expand All @@ -32,9 +36,11 @@ using RuntimeContext = torch::executor::KernelRuntimeContext;
namespace {

template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_config() {
inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig
get_ukernel_config() {
torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig config;

#if defined(__aarch64__) || defined(__ARM_NEON)
namespace ukernel = torchao::kernels::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
config.mr = 1;
Expand All @@ -51,6 +57,7 @@ inline torchao::ops::linear_8bit_act_xbit_weight::UKernelConfig get_ukernel_conf
&ukernel::prepare_weight_data<weight_nbit, has_weight_zeros>;
config.kernel_fn =
&ukernel::kernel<weight_nbit, has_weight_zeros, has_bias, has_clamp>;
#endif // defined(__aarch64__) || defined(__ARM_NEON)

return config;
}
Expand Down

0 comments on commit 60ffb86

Please sign in to comment.