Detect ARM CPU features for host target and in runtime (Backport to release/18.x) (#8343)

Detect ARM CPU features for host target and in runtime (#8298). Adds feature detection for ARM CPUs to the runtime library and to the host target feature computation. Supports Windows, macOS, Linux, iOS, and Android. Also fixes a bug in Type::max() and Type::min() for float16. Fixes #4727. Fixes #6106. Fixes #7901. Fixes #7979. Fixes #8340. Co-authored-by: Alex Reinking <[email protected]>
halide · Jul 17, 2024 · 8c651b4 · 8c651b4
1 parent 41bc134
commit 8c651b4
Show file tree

Hide file tree

Showing 14 changed files with 422 additions and 42 deletions.
diff --git a/Makefile b/Makefile
@@ -828,6 +828,8 @@ RUNTIME_CPP_COMPONENTS = \
   hexagon_dma_pool \
   hexagon_host \
   ios_io \
+  linux_aarch64_cpu_features \
+  linux_arm_cpu_features \
   linux_clock \
   linux_host_cpu_count \
   linux_yield \
@@ -839,6 +841,8 @@ RUNTIME_CPP_COMPONENTS = \
   msan \
   msan_stubs \
   opencl \
+  osx_aarch64_cpu_features \
+  osx_arm_cpu_features \
   osx_clock \
   osx_get_symbol \
   osx_host_cpu_count \
@@ -873,6 +877,7 @@ RUNTIME_CPP_COMPONENTS = \
   wasm_cpu_features \
   webgpu_dawn \
   webgpu_emscripten \
+  windows_aarch64_cpu_features_arm \
   windows_clock \
   windows_cuda \
   windows_d3d12compute_arm \

diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
@@ -46,20 +46,31 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
         return std::unique_ptr<llvm::Module>();                                                                         \
     }
 
+#define DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, bits)              \
+    do {                                                        \
+        if (debug) {                                            \
+            return get_initmod_##mod##_##bits##_debug(context); \
+        } else {                                                \
+            return get_initmod_##mod##_##bits(context);         \
+        }                                                       \
+    } while (0)
+
 #define DECLARE_CPP_INITMOD_LOOKUP(mod)                                                                     \
     std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
         if (bits_64) {                                                                                      \
-            if (debug) {                                                                                    \
-                return get_initmod_##mod##_64_debug(context);                                               \
-            } else {                                                                                        \
-                return get_initmod_##mod##_64(context);                                                     \
-            }                                                                                               \
+            DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64);                                                       \
+        } else {                                                                                            \
+            DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 32);                                                       \
+        }                                                                                                   \
+    }
+
+#define DECLARE_CPP_INITMOD_LOOKUP_64(mod)                                                                  \
+    std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
+        if (bits_64) {                                                                                      \
+            DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64);                                                       \
         } else {                                                                                            \
-            if (debug) {                                                                                    \
-                return get_initmod_##mod##_32_debug(context);                                               \
-            } else {                                                                                        \
-                return get_initmod_##mod##_32(context);                                                     \
-            }                                                                                               \
+            internal_error << "No support for 32-bit initmod: " #mod;                                       \
+            return nullptr; /* appease warnings */                                                          \
         }                                                                                                   \
     }
 
@@ -70,6 +81,11 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
     DECLARE_INITMOD(mod##_64)       \
     DECLARE_CPP_INITMOD_LOOKUP(mod)
 
+#define DECLARE_CPP_INITMOD_64(mod) \
+    DECLARE_INITMOD(mod##_64_debug) \
+    DECLARE_INITMOD(mod##_64)       \
+    DECLARE_CPP_INITMOD_LOOKUP_64(mod)
+
 #define DECLARE_LL_INITMOD(mod) \
     DECLARE_INITMOD(mod##_ll)
 
@@ -183,18 +199,28 @@ DECLARE_NO_INITMOD(metal_objc_x86)
 DECLARE_LL_INITMOD(arm)
 DECLARE_LL_INITMOD(arm_no_neon)
 DECLARE_CPP_INITMOD(arm_cpu_features)
+DECLARE_CPP_INITMOD(linux_arm_cpu_features)
+DECLARE_CPP_INITMOD(osx_arm_cpu_features)
 #else
 DECLARE_NO_INITMOD(arm)
 DECLARE_NO_INITMOD(arm_no_neon)
 DECLARE_NO_INITMOD(arm_cpu_features)
+DECLARE_NO_INITMOD(linux_arm_cpu_features)
+DECLARE_NO_INITMOD(osx_arm_cpu_features)
 #endif  // WITH_ARM
 
 #ifdef WITH_AARCH64
 DECLARE_LL_INITMOD(aarch64)
 DECLARE_CPP_INITMOD(aarch64_cpu_features)
+DECLARE_CPP_INITMOD(linux_aarch64_cpu_features)
+DECLARE_CPP_INITMOD(osx_aarch64_cpu_features)
+DECLARE_CPP_INITMOD_64(windows_aarch64_cpu_features_arm)
 #else
 DECLARE_NO_INITMOD(aarch64)
 DECLARE_NO_INITMOD(aarch64_cpu_features)
+DECLARE_NO_INITMOD(linux_aarch64_cpu_features)
+DECLARE_NO_INITMOD(osx_aarch64_cpu_features)
+DECLARE_NO_INITMOD(windows_aarch64_cpu_features_arm)
 #endif  // WITH_AARCH64
 
 #ifdef WITH_NVPTX
@@ -1206,9 +1232,23 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
             }
             if (t.arch == Target::ARM) {
                 if (t.bits == 64) {
-                    modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
+                    if (t.os == Target::Android || t.os == Target::Linux) {
+                        modules.push_back(get_initmod_linux_aarch64_cpu_features(c, bits_64, debug));
+                    } else if (t.os == Target::OSX || t.os == Target::IOS) {
+                        modules.push_back(get_initmod_osx_aarch64_cpu_features(c, bits_64, debug));
+                    } else if (t.os == Target::Windows) {
+                        modules.push_back(get_initmod_windows_aarch64_cpu_features_arm(c, bits_64, debug));
+                    } else {
+                        modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
+                    }
                 } else {
-                    modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
+                    if (t.os == Target::Android || t.os == Target::Linux) {
+                        modules.push_back(get_initmod_linux_arm_cpu_features(c, bits_64, debug));
+                    } else if (t.os == Target::OSX || t.os == Target::IOS) {
+                        modules.push_back(get_initmod_osx_arm_cpu_features(c, bits_64, debug));
+                    } else {
+                        modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
+                    }
                 }
             }
             if (t.arch == Target::POWERPC) {

diff --git a/src/Target.cpp b/src/Target.cpp
@@ -21,23 +21,50 @@
 #endif
 
 #ifdef _MSC_VER
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
 #include <intrin.h>
+#include <windows.h>
 #endif  // _MSC_VER
 
+#ifdef __APPLE__
+#include <mach/machine.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
+
+#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP 0
+#endif
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP 0
+#endif
+#ifndef HWCAP_SVE
+#define HWCAP_SVE 0
+#endif
+#ifndef HWCAP2_SVE2
+#define HWCAP2_SVE2 0
+#endif
+#endif
+
 namespace Halide {
 
 using std::string;
 using std::vector;
 
 namespace {
 
-#ifdef _MSC_VER
-static void cpuid(int info[4], int infoType, int extra) {
+#if defined(_M_IX86) || defined(_M_AMD64)
+
+void cpuid(int info[4], int infoType, int extra) {
     __cpuidex(info, infoType, extra);
 }
-#else
 
-#if defined(__x86_64__) || defined(__i386__)
+#elif defined(__x86_64__) || defined(__i386__)
+
 // CPU feature detection code taken from ispc
 // (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)
 
@@ -47,10 +74,10 @@ void cpuid(int info[4], int infoType, int extra) {
         : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
         : "0"(infoType), "2"(extra));
 }
-#endif
+
 #endif
 
-#if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER)
+#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
 
 enum class VendorSignatures {
     Unknown,
@@ -143,6 +170,29 @@ Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_s
 
 #endif  // defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
 
+#ifdef __APPLE__
+
+/** Read the sysctl entry called `name` into a value of type T.
+ *
+ * Returns std::nullopt when sysctlbyname() reports failure; otherwise
+ * returns the bytes the kernel stored, interpreted as a T. */
+template<typename T>
+std::optional<T> getsysctl(const char *name) {
+    // Value-initialize the destination: sysctlbyname() may store fewer
+    // than sizeof(T) bytes, and the remainder must not be indeterminate.
+    T value{};
+    size_t size = sizeof(value);
+    if (sysctlbyname(name, &value, &size, nullptr, 0) != 0) {
+        return std::nullopt;
+    }
+    return value;
+}
+
+// True only when the integer sysctl `name` can be read and holds a
+// non-zero value; a key that cannot be read counts as "not set".
+bool sysctl_is_set(const char *name) {
+    const std::optional<int> flag = getsysctl<int>(name);
+    return flag.has_value() && *flag != 0;
+}
+
+// Reports whether the hw.cputype sysctl reads CPU_TYPE_ARM and the
+// hw.cpusubtype sysctl reads CPU_SUBTYPE_ARM_V7S. The subtype is only
+// queried once the type has matched.
+bool is_armv7s() {
+    const auto cpu_type = getsysctl<cpu_type_t>("hw.cputype");
+    if (cpu_type != CPU_TYPE_ARM) {
+        return false;
+    }
+    const auto cpu_subtype = getsysctl<cpu_subtype_t>("hw.cpusubtype");
+    return cpu_subtype == CPU_SUBTYPE_ARM_V7S;
+}
+
+#endif  // __APPLE__
+
 Target calculate_host_target() {
     Target::OS os = Target::OSUnknown;
 #ifdef __linux__
@@ -164,8 +214,66 @@ Target calculate_host_target() {
 #if __riscv
     Target::Arch arch = Target::RISCV;
 #else
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     Target::Arch arch = Target::ARM;
+
+#ifdef __APPLE__
+    if (is_armv7s()) {
+        initial_features.push_back(Target::ARMv7s);
+    }
+
+    if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
+        initial_features.push_back(Target::ARMDotProd);
+    }
+
+    if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
+        initial_features.push_back(Target::ARMFp16);
+    }
+#endif
+
+#ifdef __linux__
+    unsigned long hwcaps = getauxval(AT_HWCAP);
+    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+
+    if (hwcaps & HWCAP_ASIMDDP) {
+        initial_features.push_back(Target::ARMDotProd);
+    }
+
+    if (hwcaps & HWCAP_ASIMDHP) {
+        initial_features.push_back(Target::ARMFp16);
+    }
+
+    if (hwcaps & HWCAP_SVE) {
+        initial_features.push_back(Target::SVE);
+    }
+
+    if (hwcaps2 & HWCAP2_SVE2) {
+        initial_features.push_back(Target::SVE2);
+    }
+#endif
+
+#ifdef _MSC_VER
+
+    // Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
+#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)
+
+    // This is the strategy used by Google's cpuinfo library for
+    // detecting fp16 arithmetic support on Windows.
+    if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
+        IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
+        initial_features.push_back(Target::ARMFp16);
+    }
+
+    if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
+        initial_features.push_back(Target::ARMDotProd);
+    }
+
+    if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
+        initial_features.push_back(Target::SVE);
+    }
+
+#endif
+
 #else
 #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
     Target::Arch arch = Target::POWERPC;

diff --git a/src/Type.cpp b/src/Type.cpp
@@ -35,7 +35,7 @@ Halide::Expr Type::max() const {
     } else {
         internal_assert(is_float());
         if (bits() == 16) {
-            return Internal::FloatImm::make(*this, 65504.0);
+            return Internal::FloatImm::make(*this, (double)float16_t::make_infinity());
         } else if (bits() == 32) {
             return Internal::FloatImm::make(*this, std::numeric_limits<float>::infinity());
         } else if (bits() == 64) {
@@ -59,7 +59,7 @@ Halide::Expr Type::min() const {
     } else {
         internal_assert(is_float());
         if (bits() == 16) {
-            return Internal::FloatImm::make(*this, -65504.0);
+            return Internal::FloatImm::make(*this, (double)float16_t::make_negative_infinity());
         } else if (bits() == 32) {
             return Internal::FloatImm::make(*this, -std::numeric_limits<float>::infinity());
         } else if (bits() == 64) {

diff --git a/src/Util.cpp b/src/Util.cpp
@@ -859,7 +859,14 @@ void run_with_large_stack(const std::function<void()> &action) {
 // Portable bit-counting methods
 int popcount64(uint64_t x) {
 #ifdef _MSC_VER
-#if defined(_WIN64)
+#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    // Portable fallback for MSVC ARM targets (the MSVC spelling for
+    // ARM64EC is _M_ARM64EC, with no second underscore). Each iteration
+    // clears the lowest set bit of x, so the loop runs once per set bit.
+    int popcnt = 0;
+    while (x) {
+        x &= x - 1;
+        popcnt++;
+    }
+    return popcnt;
+#elif defined(_WIN64)
     return __popcnt64(x);
 #else
     return __popcnt((uint32_t)(x >> 32)) + __popcnt((uint32_t)(x & 0xffffffff));

diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt
@@ -32,6 +32,8 @@ set(RUNTIME_CPP
     hexagon_dma_pool
     hexagon_host
     ios_io
+    linux_aarch64_cpu_features
+    linux_arm_cpu_features
     linux_clock
     linux_host_cpu_count
     linux_yield
@@ -43,6 +45,8 @@ set(RUNTIME_CPP
     msan
     msan_stubs
     opencl
+    osx_aarch64_cpu_features
+    osx_arm_cpu_features
     osx_clock
     osx_get_symbol
     osx_host_cpu_count
@@ -80,6 +84,7 @@ set(RUNTIME_CPP
     # webgpu
     webgpu_dawn
     webgpu_emscripten
+    windows_aarch64_cpu_features_arm
     windows_clock
     windows_cuda
     windows_d3d12compute_arm