Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into abadams/zen4_natural_…
Browse files Browse the repository at this point in the history
…vector_size
  • Loading branch information
abadams committed Feb 13, 2025
2 parents 8030aab + f770beb commit f395da4
Show file tree
Hide file tree
Showing 14 changed files with 126 additions and 131 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ option(THREADS_PREFER_PTHREAD_FLAG "When enabled, prefer to use the -pthread fla
find_package(Threads REQUIRED)

## LLVM
find_package(Halide_LLVM 18...20 REQUIRED
find_package(Halide_LLVM 18...99 REQUIRED # Use 99 to fake a minimum-only constraint
COMPONENTS WebAssembly X86
OPTIONAL_COMPONENTS AArch64 ARM Hexagon NVPTX PowerPC RISCV)

Expand Down
2 changes: 1 addition & 1 deletion src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ Value *CodeGen_LLVM::register_destructor(llvm::Function *destructor_fn, Value *o
IRBuilderBase::InsertPoint here = builder->saveIP();
BasicBlock *dtors = get_destructor_block();

builder->SetInsertPoint(dtors->getFirstNonPHI());
builder->SetInsertPoint(dtors->getFirstNonPHIIt());

PHINode *error_code = dyn_cast<PHINode>(dtors->begin());
internal_assert(error_code) << "The destructor block is supposed to start with a phi node\n";
Expand Down
2 changes: 2 additions & 0 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
}
}

function->setCallingConv(llvm::CallingConv::PTX_Kernel);

// Make the initial basic block
entry_block = BasicBlock::Create(*context, "entry", function);
builder->SetInsertPoint(entry_block);
Expand Down
1 change: 0 additions & 1 deletion src/IROperator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,6 @@ Expr lossless_cast(Type t,
Expr a = lossless_cast(t, op->a, scope, cache);
Expr b = lossless_cast(t, op->b, scope, cache);
if (a.defined() && b.defined()) {
debug(0) << a << " " << b << "\n";
return Min::make(a, b);
}
} else if (const Max *op = e.as<Max>()) {
Expand Down
45 changes: 22 additions & 23 deletions src/runtime/aarch64_cpu_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,24 @@ extern "C" unsigned long getauxval(unsigned long type);

namespace {

void set_platform_features(CpuFeatures &features) {
void set_platform_features(CpuFeatures *features) {
unsigned long hwcaps = getauxval(AT_HWCAP);
unsigned long hwcaps2 = getauxval(AT_HWCAP2);

if (hwcaps & HWCAP_ASIMDDP) {
features.set_available(halide_target_feature_arm_dot_prod);
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
}

if (hwcaps & HWCAP_ASIMDHP) {
features.set_available(halide_target_feature_arm_fp16);
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
}

if (hwcaps & HWCAP_SVE) {
features.set_available(halide_target_feature_sve);
halide_set_available_cpu_feature(features, halide_target_feature_sve);
}

if (hwcaps2 & HWCAP2_SVE2) {
features.set_available(halide_target_feature_sve2);
halide_set_available_cpu_feature(features, halide_target_feature_sve2);
}
}

Expand All @@ -56,13 +56,13 @@ bool sysctl_is_set(const char *name) {
return sysctlbyname(name, &enabled, &enabled_len, nullptr, 0) == 0 && enabled;
}

void set_platform_features(CpuFeatures &features) {
void set_platform_features(CpuFeatures *features) {
if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
features.set_available(halide_target_feature_arm_dot_prod);
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
}

if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
features.set_available(halide_target_feature_arm_fp16);
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
}
}

Expand All @@ -84,20 +84,20 @@ extern "C" BOOL IsProcessorFeaturePresent(DWORD feature);

namespace {

void set_platform_features(CpuFeatures &features) {
void set_platform_features(CpuFeatures *features) {
// This is the strategy used by Google's cpuinfo library for
// detecting fp16 arithmetic support on Windows.
if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
features.set_available(halide_target_feature_arm_fp16);
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
}

if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
features.set_available(halide_target_feature_arm_dot_prod);
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
}

if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
features.set_available(halide_target_feature_sve);
halide_set_available_cpu_feature(features, halide_target_feature_sve);
}
}

Expand All @@ -107,28 +107,27 @@ void set_platform_features(CpuFeatures &features) {

namespace {

void set_platform_features(CpuFeatures &) {
void set_platform_features(CpuFeatures *) {
}

} // namespace

#endif

WEAK CpuFeatures halide_get_cpu_features() {
CpuFeatures features;
features.set_known(halide_target_feature_arm_dot_prod);
features.set_known(halide_target_feature_arm_fp16);
features.set_known(halide_target_feature_armv7s);
features.set_known(halide_target_feature_no_neon);
features.set_known(halide_target_feature_sve);
features.set_known(halide_target_feature_sve2);
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
halide_set_known_cpu_feature(features, halide_target_feature_arm_dot_prod);
halide_set_known_cpu_feature(features, halide_target_feature_arm_fp16);
halide_set_known_cpu_feature(features, halide_target_feature_armv7s);
halide_set_known_cpu_feature(features, halide_target_feature_no_neon);
halide_set_known_cpu_feature(features, halide_target_feature_sve);
halide_set_known_cpu_feature(features, halide_target_feature_sve2);

// All ARM architectures support "No Neon".
features.set_available(halide_target_feature_no_neon);
halide_set_available_cpu_feature(features, halide_target_feature_no_neon);

set_platform_features(features);

return features;
return halide_error_code_success;
}

} // namespace Internal
Expand Down
35 changes: 17 additions & 18 deletions src/runtime/arm_cpu_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ extern "C" unsigned long getauxval(unsigned long type);

namespace {

void set_platform_features(CpuFeatures &features) {
void set_platform_features(CpuFeatures *features) {
unsigned long hwcaps = getauxval(AT_HWCAP);

if (hwcaps & HWCAP_ASIMDDP) {
features.set_available(halide_target_feature_arm_dot_prod);
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
}

if (hwcaps & HWCAP_ASIMDHP) {
features.set_available(halide_target_feature_arm_fp16);
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
}
}

Expand Down Expand Up @@ -68,17 +68,17 @@ bool is_armv7s() {
return type == CPU_TYPE_ARM && subtype == CPU_SUBTYPE_ARM_V7S;
}

void set_platform_features(CpuFeatures &features) {
void set_platform_features(CpuFeatures *features) {
if (is_armv7s()) {
features.set_available(halide_target_feature_armv7s);
halide_set_available_cpu_feature(features, halide_target_feature_armv7s);
}

if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
features.set_available(halide_target_feature_arm_dot_prod);
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
}

if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
features.set_available(halide_target_feature_arm_fp16);
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
}
}

Expand All @@ -88,28 +88,27 @@ void set_platform_features(CpuFeatures &features) {

namespace {

void set_platform_features(CpuFeatures &) {
void set_platform_features(CpuFeatures *) {
}

} // namespace

#endif

WEAK CpuFeatures halide_get_cpu_features() {
CpuFeatures features;
features.set_known(halide_target_feature_arm_dot_prod);
features.set_known(halide_target_feature_arm_fp16);
features.set_known(halide_target_feature_armv7s);
features.set_known(halide_target_feature_no_neon);
features.set_known(halide_target_feature_sve);
features.set_known(halide_target_feature_sve2);
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
halide_set_known_cpu_feature(features, halide_target_feature_arm_dot_prod);
halide_set_known_cpu_feature(features, halide_target_feature_arm_fp16);
halide_set_known_cpu_feature(features, halide_target_feature_armv7s);
halide_set_known_cpu_feature(features, halide_target_feature_no_neon);
halide_set_known_cpu_feature(features, halide_target_feature_sve);
halide_set_known_cpu_feature(features, halide_target_feature_sve2);

// All ARM architectures support "No Neon".
features.set_available(halide_target_feature_no_neon);
halide_set_available_cpu_feature(features, halide_target_feature_no_neon);

set_platform_features(features);

return features;
return halide_error_code_success;
}

} // namespace Internal
Expand Down
10 changes: 6 additions & 4 deletions src/runtime/can_use_target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,23 @@ WEAK int halide_default_can_use_target_features(int count, const uint64_t *featu

static_assert(sizeof(halide_cpu_features_storage) == sizeof(CpuFeatures), "CpuFeatures Mismatch");
if (!halide_cpu_features_initialized) {
CpuFeatures tmp = halide_get_cpu_features();
CpuFeatures tmp;
int error = halide_get_cpu_features(&tmp);
halide_abort_if_false(nullptr, error == halide_error_code_success);
memcpy(&halide_cpu_features_storage, &tmp, sizeof(tmp));
halide_cpu_features_initialized = true;
}
}

if (count != CpuFeatures::kWordCount) {
if (count != cpu_feature_mask_size) {
// This should not happen unless our runtime is out of sync with the rest of libHalide.
#ifdef DEBUG_RUNTIME
debug(nullptr) << "count " << count << " CpuFeatures::kWordCount " << CpuFeatures::kWordCount << "\n";
debug(nullptr) << "count " << count << " cpu_feature_mask_size " << cpu_feature_mask_size << "\n";
#endif
halide_error(nullptr, "Internal error: wrong structure size passed to halide_can_use_target_features()\n");
}
const CpuFeatures *cpu_features = reinterpret_cast<const CpuFeatures *>(&halide_cpu_features_storage[0]);
for (int i = 0; i < CpuFeatures::kWordCount; ++i) {
for (int i = 0; i < cpu_feature_mask_size; ++i) {
uint64_t m;
if ((m = (features[i] & cpu_features->known[i])) != 0) {
if ((m & cpu_features->available[i]) != m) {
Expand Down
57 changes: 26 additions & 31 deletions src/runtime/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,37 @@ namespace Halide {
namespace Runtime {
namespace Internal {

// Return two masks:
// Size of CPU feature mask large enough to cover all Halide target features
static constexpr int cpu_feature_mask_size = (halide_target_feature_end + 63) / (sizeof(uint64_t) * 8);

// Contains two masks:
// One with all the CPU-specific features that might possibly be available on this architecture ('known'),
// and one with the subset that are actually present ('available').
struct CpuFeatures {
static const int kWordCount = (halide_target_feature_end + 63) / (sizeof(uint64_t) * 8);

ALWAYS_INLINE void set_known(int i) {
known[i >> 6] |= ((uint64_t)1) << (i & 63);
}

ALWAYS_INLINE void set_available(int i) {
available[i >> 6] |= ((uint64_t)1) << (i & 63);
}

ALWAYS_INLINE bool test_known(int i) const {
return (known[i >> 6] & ((uint64_t)1) << (i & 63)) != 0;
}

ALWAYS_INLINE bool test_available(int i) const {
return (available[i >> 6] & ((uint64_t)1) << (i & 63)) != 0;
}

ALWAYS_INLINE
CpuFeatures() {
for (int i = 0; i < kWordCount; ++i) {
known[i] = 0;
available[i] = 0;
}
}

uint64_t known[kWordCount]; // mask of the CPU features we know how to detect
uint64_t available[kWordCount]; // mask of the CPU features that are available
// (always a subset of 'known')
uint64_t known[cpu_feature_mask_size] = {0}; // mask of the CPU features we know how to detect
uint64_t available[cpu_feature_mask_size] = {0}; // mask of the CPU features that are available
// (always a subset of 'known')
};

extern WEAK CpuFeatures halide_get_cpu_features();
// Sets the bit for target feature `i` in features->known.
// Feature bits are packed 64 per uint64_t word: the word index is i >> 6
// and the bit position within that word is i & 63.
ALWAYS_INLINE void halide_set_known_cpu_feature(CpuFeatures *features, int i) {
    const int word = i >> 6;
    const uint64_t bit = ((uint64_t)1) << (i & 63);
    features->known[word] |= bit;
}

// Sets the bit for target feature `i` in features->available.
// Feature bits are packed 64 per uint64_t word: the word index is i >> 6
// and the bit position within that word is i & 63.
ALWAYS_INLINE void halide_set_available_cpu_feature(CpuFeatures *features, int i) {
    const int word = i >> 6;
    const uint64_t bit = ((uint64_t)1) << (i & 63);
    features->available[word] |= bit;
}

// Returns true if the bit for target feature `i` is set in features->known.
// The mask is only read, so the pointer is const-qualified; this lets callers
// that hold a `const CpuFeatures *` query it without a cast. Existing callers
// passing a non-const pointer are unaffected.
ALWAYS_INLINE bool halide_test_known_cpu_feature(const CpuFeatures *features, int i) {
    // Word index is i >> 6, bit position within the word is i & 63.
    return (features->known[i >> 6] & (((uint64_t)1) << (i & 63))) != 0;
}

// Returns true if the bit for target feature `i` is set in features->available.
// The mask is only read, so the pointer is const-qualified; this lets callers
// that hold a `const CpuFeatures *` query it without a cast. Existing callers
// passing a non-const pointer are unaffected.
ALWAYS_INLINE bool halide_test_available_cpu_feature(const CpuFeatures *features, int i) {
    // Word index is i >> 6, bit position within the word is i & 63.
    return (features->available[i >> 6] & (((uint64_t)1) << (i & 63))) != 0;
}

// NOTE: This method is not part of the public API, but we push it into extern "C" to
// avoid name mangling mismatches between platforms. See: https://github.com/halide/Halide/issues/8565
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features);

} // namespace Internal
} // namespace Runtime
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/hexagon_cpu_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ namespace Halide {
namespace Runtime {
namespace Internal {

WEAK CpuFeatures halide_get_cpu_features() {
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
// Hexagon has no CPU-specific Features.
return CpuFeatures();
return halide_error_code_success;
}

} // namespace Internal
Expand Down
18 changes: 10 additions & 8 deletions src/runtime/powerpc_cpu_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,29 @@

#define PPC_FEATURE2_ARCH_2_07 0x80000000

extern "C" unsigned long int getauxval(unsigned long int);
extern "C" {

unsigned long int getauxval(unsigned long int);
}

namespace Halide {
namespace Runtime {
namespace Internal {

WEAK CpuFeatures halide_get_cpu_features() {
CpuFeatures features;
features.set_known(halide_target_feature_vsx);
features.set_known(halide_target_feature_power_arch_2_07);
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
halide_set_known_cpu_feature(features, halide_target_feature_vsx);
halide_set_known_cpu_feature(features, halide_target_feature_power_arch_2_07);

const unsigned long hwcap = getauxval(AT_HWCAP);
const unsigned long hwcap2 = getauxval(AT_HWCAP2);

if (hwcap & PPC_FEATURE_HAS_VSX) {
features.set_available(halide_target_feature_vsx);
halide_set_available_cpu_feature(features, halide_target_feature_vsx);
}
if (hwcap2 & PPC_FEATURE2_ARCH_2_07) {
features.set_available(halide_target_feature_power_arch_2_07);
halide_set_available_cpu_feature(features, halide_target_feature_power_arch_2_07);
}
return features;
return halide_error_code_success;
}

} // namespace Internal
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/riscv_cpu_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ namespace Halide {
namespace Runtime {
namespace Internal {

WEAK CpuFeatures halide_get_cpu_features() {
extern "C" WEAK int halide_get_cpu_features(Halide::Runtime::Internal::CpuFeatures *features) {
// For now, no version-specific features, though RISCV promises to have many.
return CpuFeatures();
return halide_error_code_success;
}

} // namespace Internal
Expand Down
Loading

0 comments on commit f395da4

Please sign in to comment.