diff --git a/O/OpenBLAS/OpenBLAS32@0.3.26/build_tarballs.jl b/O/OpenBLAS/OpenBLAS32@0.3.26/build_tarballs.jl index 4ec78e6a1eb..b13b84d9d20 100644 --- a/O/OpenBLAS/OpenBLAS32@0.3.26/build_tarballs.jl +++ b/O/OpenBLAS/OpenBLAS32@0.3.26/build_tarballs.jl @@ -7,7 +7,7 @@ name = "OpenBLAS32" version = v"0.3.26" sources = openblas_sources(version) -script = openblas_script(openblas32=true) +script = openblas_script(openblas32=true, bfloat16=true) platforms = openblas_platforms() products = openblas_products() dependencies = openblas_dependencies(platforms) diff --git a/O/OpenBLAS/OpenBLAS32@0.3.26/bundled/patches/80-avx512bf-kernels.patch b/O/OpenBLAS/OpenBLAS32@0.3.26/bundled/patches/80-avx512bf-kernels.patch new file mode 100644 index 00000000000..7e99cdf7c53 --- /dev/null +++ b/O/OpenBLAS/OpenBLAS32@0.3.26/bundled/patches/80-avx512bf-kernels.patch @@ -0,0 +1,107 @@ +From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 12 Jan 2024 00:10:56 +0100 +Subject: [PATCH 1/2] Add compiler test and flag for AVX512BF16 capability + +--- + c_check | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/c_check b/c_check +index b5e4a9ad00..3e507be818 100755 +--- a/c_check ++++ b/c_check +@@ -244,6 +244,7 @@ case "$data" in + esac + + no_avx512=0 ++no_avx512bf=0 + if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.c" +@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then + } + + rm -rf "$tmpd" ++ if [ "$no_avx512" -eq 0 ]; then ++ tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') ++ tmpf="$tmpd/a.c" ++ code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' ++ printf "#include \n\nint main(void){ %s; }\n" "$code" >> "$tmpf" ++ if [ "$compiler" = "PGI" ]; then ++ args=" -tp cooperlake -c -o $tmpf.o $tmpf" ++ else ++ args=" -march=cooperlake -c -o $tmpf.o $tmpf" ++ fi ++ no_avx512bf=0 ++ { ++ $compiler_name $flags $args >/dev/null 2>&1 ++ } || { ++ no_avx512bf=1 ++ } ++ ++ rm -rf "$tmpd" ++ fi + fi + + no_rv64gv=0 +@@ -409,6 +429,7 @@ done + [ "$makefile" = "-" ] && { + [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" + [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" ++ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" + [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" + [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + exit 0 +@@ -437,6 +458,7 @@ done + [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" + [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" + [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" ++ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" + [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" + [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" + +From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 12 Jan 2024 00:12:46 +0100 +Subject: [PATCH 2/2] Make AVX512 BFLOAT16 kernels conditional on compiler + capability + +--- + kernel/x86_64/KERNEL.COOPERLAKE | 3 ++- + kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++ + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE +index dba94aea86..22b042029f 100644 +--- a/kernel/x86_64/KERNEL.COOPERLAKE ++++ b/kernel/x86_64/KERNEL.COOPERLAKE +@@ -1,5 +1,5 @@ + include $(KERNELDIR)/KERNEL.SKYLAKEX +- ++ifneq ($(NO_AVX512BF16), 1) + SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c + SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c + SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c +@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) + SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) + SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) + SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif +diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +index 3a832e9174..0ab2b4ddcf 100644 +--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS ++++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +@@ -1,5 +1,6 @@ + include $(KERNELDIR)/KERNEL.COOPERLAKE + ++ifneq ($(NO_AVX512BF16), 1) + SBGEMM_SMALL_M_PERMIT = + SBGEMM_SMALL_K_NN = + SBGEMM_SMALL_K_B0_NN = +@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) + SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) + SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) + SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif diff --git a/O/OpenBLAS/OpenBLAS32@0.3.26/bundled/patches/90-darwin-sve.patch b/O/OpenBLAS/OpenBLAS32@0.3.26/bundled/patches/90-darwin-sve.patch new file mode 100644 index 00000000000..a2166db9379 --- /dev/null +++ b/O/OpenBLAS/OpenBLAS32@0.3.26/bundled/patches/90-darwin-sve.patch @@ -0,0 +1,34 @@ +From 03688a42622cf76e696859ce384e45aa26d927fc Mon Sep 17 00:00:00 2001 +From: Ian McInerney +Date: Tue, 23 Jan 2024 10:29:57 +0000 +Subject: [PATCH] Build with proper aarch64 flags on Neoverse Darwin + +We aren't affected by the problems in AppleClang that prompted this +fallback to an older architecture. +--- + Makefile.arm64 | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/Makefile.arm64 b/Makefile.arm64 +index ed52a9424..a8f3cb0f0 100644 +--- a/Makefile.arm64 ++++ b/Makefile.arm64 +@@ -135,11 +135,11 @@ ifeq ($(CORE), NEOVERSEN2) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) + ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) +-ifneq ($(OSNAME), Darwin) ++#ifneq ($(OSNAME), Darwin) + CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 +-else +-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +-endif ++#else ++#CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 ++#endif + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 + endif +-- +2.43.0 + diff --git a/O/OpenBLAS/OpenBLAS@0.3.26/build_tarballs.jl b/O/OpenBLAS/OpenBLAS@0.3.26/build_tarballs.jl index fe9a79ed43a..f1ba3dc1fa8 100644 --- a/O/OpenBLAS/OpenBLAS@0.3.26/build_tarballs.jl +++ b/O/OpenBLAS/OpenBLAS@0.3.26/build_tarballs.jl @@ -7,7 +7,7 @@ name = "OpenBLAS" version = v"0.3.26" sources = openblas_sources(version) -script = openblas_script(;aarch64_ilp64=true, num_64bit_threads=512) +script = openblas_script(;aarch64_ilp64=true, num_64bit_threads=512, bfloat16=true) platforms = openblas_platforms(;experimental=true) push!(platforms, Platform("x86_64", "linux"; sanitize="memory")) products = openblas_products() diff --git a/O/OpenBLAS/OpenBLAS@0.3.26/bundled/patches/80-avx512bf-kernels.patch b/O/OpenBLAS/OpenBLAS@0.3.26/bundled/patches/80-avx512bf-kernels.patch new file mode 100644 index 00000000000..7e99cdf7c53 --- /dev/null +++ b/O/OpenBLAS/OpenBLAS@0.3.26/bundled/patches/80-avx512bf-kernels.patch @@ -0,0 +1,107 @@ +From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 12 Jan 2024 00:10:56 +0100 +Subject: [PATCH 1/2] Add compiler test and flag for AVX512BF16 capability + +--- + c_check | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/c_check b/c_check +index b5e4a9ad00..3e507be818 100755 +--- a/c_check ++++ b/c_check +@@ -244,6 +244,7 @@ case "$data" in + esac + + no_avx512=0 ++no_avx512bf=0 + if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.c" +@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then + } + + rm -rf "$tmpd" ++ if [ "$no_avx512" -eq 0 ]; then ++ tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') ++ tmpf="$tmpd/a.c" ++ code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' ++ printf "#include \n\nint main(void){ %s; }\n" "$code" >> "$tmpf" ++ if [ "$compiler" = "PGI" ]; then ++ args=" -tp cooperlake -c -o $tmpf.o $tmpf" ++ else ++ args=" -march=cooperlake -c -o $tmpf.o $tmpf" ++ fi ++ no_avx512bf=0 ++ { ++ $compiler_name $flags $args >/dev/null 2>&1 ++ } || { ++ no_avx512bf=1 ++ } ++ ++ rm -rf "$tmpd" ++ fi + fi + + no_rv64gv=0 +@@ -409,6 +429,7 @@ done + [ "$makefile" = "-" ] && { + [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" + [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" ++ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" + [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" + [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + exit 0 +@@ -437,6 +458,7 @@ done + [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" + [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" + [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" ++ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" + [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" + [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" + +From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 12 Jan 2024 00:12:46 +0100 +Subject: [PATCH 2/2] Make AVX512 BFLOAT16 kernels conditional on compiler + capability + +--- + kernel/x86_64/KERNEL.COOPERLAKE | 3 ++- + kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++ + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE +index dba94aea86..22b042029f 100644 +--- a/kernel/x86_64/KERNEL.COOPERLAKE ++++ b/kernel/x86_64/KERNEL.COOPERLAKE +@@ -1,5 +1,5 @@ + include $(KERNELDIR)/KERNEL.SKYLAKEX +- ++ifneq ($(NO_AVX512BF16), 1) + SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c + SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c + SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c +@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) + SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) + SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) + SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif +diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +index 3a832e9174..0ab2b4ddcf 100644 +--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS ++++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +@@ -1,5 +1,6 @@ + include $(KERNELDIR)/KERNEL.COOPERLAKE + ++ifneq ($(NO_AVX512BF16), 1) + SBGEMM_SMALL_M_PERMIT = + SBGEMM_SMALL_K_NN = + SBGEMM_SMALL_K_B0_NN = +@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) + SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) + SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) + SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif diff --git a/O/OpenBLAS/OpenBLAS@0.3.26/bundled/patches/90-darwin-sve.patch b/O/OpenBLAS/OpenBLAS@0.3.26/bundled/patches/90-darwin-sve.patch new file mode 100644 index 00000000000..a2166db9379 --- /dev/null +++ b/O/OpenBLAS/OpenBLAS@0.3.26/bundled/patches/90-darwin-sve.patch @@ -0,0 +1,34 @@ +From 03688a42622cf76e696859ce384e45aa26d927fc Mon Sep 17 00:00:00 2001 +From: Ian McInerney +Date: Tue, 23 Jan 2024 10:29:57 +0000 +Subject: [PATCH] Build with proper aarch64 flags on Neoverse Darwin + +We aren't affected by the problems in AppleClang that prompted this +fallback to an older architecture. +--- + Makefile.arm64 | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/Makefile.arm64 b/Makefile.arm64 +index ed52a9424..a8f3cb0f0 100644 +--- a/Makefile.arm64 ++++ b/Makefile.arm64 +@@ -135,11 +135,11 @@ ifeq ($(CORE), NEOVERSEN2) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) + ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) +-ifneq ($(OSNAME), Darwin) ++#ifneq ($(OSNAME), Darwin) + CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 +-else +-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +-endif ++#else ++#CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 ++#endif + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 + endif +-- +2.43.0 + diff --git a/O/OpenBLAS/common.jl b/O/OpenBLAS/common.jl index c4ce1222e41..4e5099ad733 100644 --- a/O/OpenBLAS/common.jl +++ b/O/OpenBLAS/common.jl @@ -73,13 +73,14 @@ end # Do not override the default `num_64bit_threads` here, instead pass a custom from specific OpenBLAS versions # that should opt into a higher thread count. -function openblas_script(;num_64bit_threads::Integer=32, openblas32::Bool=false, aarch64_ilp64::Bool=false, consistent_fpcsr::Bool=false, kwargs...) +function openblas_script(;num_64bit_threads::Integer=32, openblas32::Bool=false, aarch64_ilp64::Bool=false, consistent_fpcsr::Bool=false, bfloat16::Bool=false, kwargs...) # Allow some basic configuration script = """ NUM_64BIT_THREADS=$(num_64bit_threads) OPENBLAS32=$(openblas32) AARCH64_ILP64=$(aarch64_ilp64) CONSISTENT_FPCSR=$(consistent_fpcsr) + BFLOAT16=$(bfloat16) version_patch=$(version.patch) """ # Bash recipe for building across all platforms @@ -110,6 +111,11 @@ function openblas_script(;num_64bit_threads::Integer=32, openblas32::Bool=false, flags+=(CONSISTENT_FPCSR=1) fi + # Build BFLOAT16 kernels + if [[ "${BFLOAT16}" == "true" ]]; then + flags+=(BUILD_BFLOAT16=1) + fi + # We are cross-compiling flags+=(CROSS=1 PREFIX=/ "CROSS_SUFFIX=${target}-")