Skip to content

Commit

Permalink
Detect ARM CPU features for host target and in runtime (Backport to r…
Browse files Browse the repository at this point in the history
…elease/18.x) (#8343)

Detect ARM CPU features for host target and in runtime (#8298)

Adds feature detection for ARM CPUs to the runtime library and to
the host target feature computation. Supports Windows, macOS,
Linux, iOS, and Android.

Also fix bug in Type::max() and Type::min() for float16.

Fixes #4727
Fixes #6106
Fixes #7901
Fixes #7979
Fixes #8340

Co-authored-by: Alex Reinking <[email protected]>
  • Loading branch information
steven-johnson and alexreinking authored Jul 17, 2024
1 parent 41bc134 commit 8c651b4
Show file tree
Hide file tree
Showing 14 changed files with 422 additions and 42 deletions.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,8 @@ RUNTIME_CPP_COMPONENTS = \
hexagon_dma_pool \
hexagon_host \
ios_io \
linux_aarch64_cpu_features \
linux_arm_cpu_features \
linux_clock \
linux_host_cpu_count \
linux_yield \
Expand All @@ -839,6 +841,8 @@ RUNTIME_CPP_COMPONENTS = \
msan \
msan_stubs \
opencl \
osx_aarch64_cpu_features \
osx_arm_cpu_features \
osx_clock \
osx_get_symbol \
osx_host_cpu_count \
Expand Down Expand Up @@ -873,6 +877,7 @@ RUNTIME_CPP_COMPONENTS = \
wasm_cpu_features \
webgpu_dawn \
webgpu_emscripten \
windows_aarch64_cpu_features_arm \
windows_clock \
windows_cuda \
windows_d3d12compute_arm \
Expand Down
64 changes: 52 additions & 12 deletions src/LLVM_Runtime_Linker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,31 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
return std::unique_ptr<llvm::Module>(); \
}

// Expands to a `return` that picks the debug or non-debug initmod getter
// for module `mod` at bit width `bits` (token-pasted into the getter name,
// e.g. get_initmod_foo_64_debug). Expects `debug` and `context` to be in
// scope where the macro is expanded; the expansion site supplies the
// terminating semicolon.
#define DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, bits)              \
    return debug ? get_initmod_##mod##_##bits##_debug(context) \
                 : get_initmod_##mod##_##bits(context)

// Defines get_initmod_<mod>(context, bits_64, debug), which dispatches to
// the 64-bit or 32-bit getter for `mod`. The debug/non-debug choice within
// each width is delegated to DECLARE_CPP_INITMOD_LOOKUP_BITS, so the
// hand-written 64-bit if/else that used to precede it (and made the helper
// invocation unreachable) is gone.
#define DECLARE_CPP_INITMOD_LOOKUP(mod)                                                                     \
    std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
        if (bits_64) {                                                                                      \
            DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64);                                                       \
        } else {                                                                                            \
            DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 32);                                                       \
        }                                                                                                   \
    }

// Defines get_initmod_<mod>(context, bits_64, debug) for a module that only
// has 64-bit variants. A 64-bit request is forwarded to
// DECLARE_CPP_INITMOD_LOOKUP_BITS; a 32-bit request raises internal_error.
// The 32-bit branch must not name get_initmod_<mod>_32[_debug]:
// DECLARE_CPP_INITMOD_64 declares only the _64 and _64_debug getters.
#define DECLARE_CPP_INITMOD_LOOKUP_64(mod)                                                                  \
    std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
        if (bits_64) {                                                                                      \
            DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64);                                                       \
        } else {                                                                                            \
            internal_error << "No support for 32-bit initmod: " #mod;                                       \
            return nullptr; /* appease warnings */                                                          \
        }                                                                                                   \
    }

Expand All @@ -70,6 +81,11 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
DECLARE_INITMOD(mod##_64) \
DECLARE_CPP_INITMOD_LOOKUP(mod)

// Declares a C++ runtime module that exists only in 64-bit form: applies
// DECLARE_INITMOD (defined outside this view) to the `_64_debug` and `_64`
// variants of `mod`, then emits the 64-bit-only lookup function via
// DECLARE_CPP_INITMOD_LOOKUP_64.
#define DECLARE_CPP_INITMOD_64(mod) \
DECLARE_INITMOD(mod##_64_debug) \
DECLARE_INITMOD(mod##_64) \
DECLARE_CPP_INITMOD_LOOKUP_64(mod)

// Applies DECLARE_INITMOD to `mod` with an `_ll` suffix appended.
// NOTE(review): presumably used for modules built from LLVM .ll sources --
// confirm against the build rules.
#define DECLARE_LL_INITMOD(mod) \
DECLARE_INITMOD(mod##_ll)

Expand Down Expand Up @@ -183,18 +199,28 @@ DECLARE_NO_INITMOD(metal_objc_x86)
DECLARE_LL_INITMOD(arm)
DECLARE_LL_INITMOD(arm_no_neon)
DECLARE_CPP_INITMOD(arm_cpu_features)
DECLARE_CPP_INITMOD(linux_arm_cpu_features)
DECLARE_CPP_INITMOD(osx_arm_cpu_features)
#else
DECLARE_NO_INITMOD(arm)
DECLARE_NO_INITMOD(arm_no_neon)
DECLARE_NO_INITMOD(arm_cpu_features)
DECLARE_NO_INITMOD(linux_arm_cpu_features)
DECLARE_NO_INITMOD(osx_arm_cpu_features)
#endif // WITH_ARM

// AArch64 runtime modules. With WITH_AARCH64 defined, the generic and the
// Linux/OSX cpu-feature modules are declared through DECLARE_CPP_INITMOD
// and the Windows one through the 64-bit-only DECLARE_CPP_INITMOD_64.
// Otherwise every name goes through DECLARE_NO_INITMOD (defined outside
// this view) so the get_initmod_* names used elsewhere still exist.
#ifdef WITH_AARCH64
DECLARE_LL_INITMOD(aarch64)
DECLARE_CPP_INITMOD(aarch64_cpu_features)
DECLARE_CPP_INITMOD(linux_aarch64_cpu_features)
DECLARE_CPP_INITMOD(osx_aarch64_cpu_features)
DECLARE_CPP_INITMOD_64(windows_aarch64_cpu_features_arm)
#else
DECLARE_NO_INITMOD(aarch64)
DECLARE_NO_INITMOD(aarch64_cpu_features)
DECLARE_NO_INITMOD(linux_aarch64_cpu_features)
DECLARE_NO_INITMOD(osx_aarch64_cpu_features)
DECLARE_NO_INITMOD(windows_aarch64_cpu_features_arm)
#endif // WITH_AARCH64

#ifdef WITH_NVPTX
Expand Down Expand Up @@ -1206,9 +1232,23 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
}
if (t.arch == Target::ARM) {
if (t.bits == 64) {
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
if (t.os == Target::Android || t.os == Target::Linux) {
modules.push_back(get_initmod_linux_aarch64_cpu_features(c, bits_64, debug));
} else if (t.os == Target::OSX || t.os == Target::IOS) {
modules.push_back(get_initmod_osx_aarch64_cpu_features(c, bits_64, debug));
} else if (t.os == Target::Windows) {
modules.push_back(get_initmod_windows_aarch64_cpu_features_arm(c, bits_64, debug));
} else {
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
}
} else {
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
if (t.os == Target::Android || t.os == Target::Linux) {
modules.push_back(get_initmod_linux_arm_cpu_features(c, bits_64, debug));
} else if (t.os == Target::OSX || t.os == Target::IOS) {
modules.push_back(get_initmod_osx_arm_cpu_features(c, bits_64, debug));
} else {
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
}
}
}
if (t.arch == Target::POWERPC) {
Expand Down
122 changes: 115 additions & 7 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,50 @@
#endif

#ifdef _MSC_VER
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <intrin.h>
#include <windows.h>
#endif // _MSC_VER

#ifdef __APPLE__
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif

#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_ASIMDHP
#define HWCAP_ASIMDHP 0
#endif
#ifndef HWCAP_ASIMDDP
#define HWCAP_ASIMDDP 0
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE 0
#endif
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 0
#endif
#endif

namespace Halide {

using std::string;
using std::vector;

namespace {

#ifdef _MSC_VER
static void cpuid(int info[4], int infoType, int extra) {
#if defined(_M_IX86) || defined(_M_AMD64)

void cpuid(int info[4], int infoType, int extra) {
__cpuidex(info, infoType, extra);
}
#else

#if defined(__x86_64__) || defined(__i386__)
#elif defined(__x86_64__) || defined(__i386__)

// CPU feature detection code taken from ispc
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)

Expand All @@ -47,10 +74,10 @@ void cpuid(int info[4], int infoType, int extra) {
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
: "0"(infoType), "2"(extra));
}
#endif

#endif

#if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER)
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)

enum class VendorSignatures {
Unknown,
Expand Down Expand Up @@ -143,6 +170,29 @@ Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_s

#endif // defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)

#ifdef __APPLE__

/** Read the sysctl entry `name` into a value of type T.
 *
 * Returns std::nullopt when sysctlbyname() reports failure (non-zero
 * return); otherwise returns the value that was read. */
template<typename T>
std::optional<T> getsysctl(const char *name) {
    // Value-initialize the destination: if fewer than sizeof(T) bytes are
    // written back, the remaining bytes are zero rather than indeterminate.
    T value{};
    size_t size = sizeof(value);
    if (sysctlbyname(name, &value, &size, nullptr, 0) != 0) {
        return std::nullopt;
    }
    return value;
}

/** True when the integer sysctl entry `name` could be read and is non-zero;
 * false when the lookup yields no value or the value is zero. */
bool sysctl_is_set(const char *name) {
    const std::optional<int> flag = getsysctl<int>(name);
    return flag.has_value() && *flag != 0;
}

bool is_armv7s() {
return getsysctl<cpu_type_t>("hw.cputype") == CPU_TYPE_ARM &&
getsysctl<cpu_subtype_t>("hw.cpusubtype") == CPU_SUBTYPE_ARM_V7S;
}

#endif // __APPLE__

Target calculate_host_target() {
Target::OS os = Target::OSUnknown;
#ifdef __linux__
Expand All @@ -164,8 +214,66 @@ Target calculate_host_target() {
#if __riscv
Target::Arch arch = Target::RISCV;
#else
#if defined(__arm__) || defined(__aarch64__)
#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
Target::Arch arch = Target::ARM;

#ifdef __APPLE__
if (is_armv7s()) {
initial_features.push_back(Target::ARMv7s);
}

if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
initial_features.push_back(Target::ARMDotProd);
}

if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
initial_features.push_back(Target::ARMFp16);
}
#endif

#ifdef __linux__
unsigned long hwcaps = getauxval(AT_HWCAP);
unsigned long hwcaps2 = getauxval(AT_HWCAP2);

if (hwcaps & HWCAP_ASIMDDP) {
initial_features.push_back(Target::ARMDotProd);
}

if (hwcaps & HWCAP_ASIMDHP) {
initial_features.push_back(Target::ARMFp16);
}

if (hwcaps & HWCAP_SVE) {
initial_features.push_back(Target::SVE);
}

if (hwcaps2 & HWCAP2_SVE2) {
initial_features.push_back(Target::SVE2);
}
#endif

#ifdef _MSC_VER

// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)

// This is the strategy used by Google's cpuinfo library for
// detecting fp16 arithmetic support on Windows.
if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::ARMFp16);
}

if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::ARMDotProd);
}

if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::SVE);
}

#endif

#else
#if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
Target::Arch arch = Target::POWERPC;
Expand Down
4 changes: 2 additions & 2 deletions src/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Halide::Expr Type::max() const {
} else {
internal_assert(is_float());
if (bits() == 16) {
return Internal::FloatImm::make(*this, 65504.0);
return Internal::FloatImm::make(*this, (double)float16_t::make_infinity());
} else if (bits() == 32) {
return Internal::FloatImm::make(*this, std::numeric_limits<float>::infinity());
} else if (bits() == 64) {
Expand All @@ -59,7 +59,7 @@ Halide::Expr Type::min() const {
} else {
internal_assert(is_float());
if (bits() == 16) {
return Internal::FloatImm::make(*this, -65504.0);
return Internal::FloatImm::make(*this, (double)float16_t::make_negative_infinity());
} else if (bits() == 32) {
return Internal::FloatImm::make(*this, -std::numeric_limits<float>::infinity());
} else if (bits() == 64) {
Expand Down
9 changes: 8 additions & 1 deletion src/Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,14 @@ void run_with_large_stack(const std::function<void()> &action) {
// Portable bit-counting methods
int popcount64(uint64_t x) {
#ifdef _MSC_VER
#if defined(_WIN64)
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64_EC)
int popcnt = 0;
while (x) {
x &= x - 1;
popcnt++;
}
return popcnt;
#elif defined(_WIN64)
return __popcnt64(x);
#else
return __popcnt((uint32_t)(x >> 32)) + __popcnt((uint32_t)(x & 0xffffffff));
Expand Down
5 changes: 5 additions & 0 deletions src/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ set(RUNTIME_CPP
hexagon_dma_pool
hexagon_host
ios_io
linux_aarch64_cpu_features
linux_arm_cpu_features
linux_clock
linux_host_cpu_count
linux_yield
Expand All @@ -43,6 +45,8 @@ set(RUNTIME_CPP
msan
msan_stubs
opencl
osx_aarch64_cpu_features
osx_arm_cpu_features
osx_clock
osx_get_symbol
osx_host_cpu_count
Expand Down Expand Up @@ -80,6 +84,7 @@ set(RUNTIME_CPP
# webgpu
webgpu_dawn
webgpu_emscripten
windows_aarch64_cpu_features_arm
windows_clock
windows_cuda
windows_d3d12compute_arm
Expand Down
Loading

0 comments on commit 8c651b4

Please sign in to comment.