enable poly_compress_avx for Linux kernel compilation

Also enable the AVX compiler flags. Signed-off-by: Stephan Mueller <[email protected]>
smuellerDD · Feb 8, 2024 · c022fda · c022fda
1 parent c11e5a6
commit c022fda
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 15 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -23,6 +23,8 @@ Changes 0.9.0-prerelease
 
 * enhancement: use accelerated XOR for KMAC/cSHAKE AEAD
 
+* fix: enable poly_compress_avx for Linux kernel compilation
+
 Changes 0.8.0:
 
 * enhancement: add applications

diff --git a/TODO b/TODO
@@ -7,6 +7,6 @@
   compiled for target host ARMv7 - at the moment compiling on ARMv6 fails
   because the Neon code is compiled (which should not be compiled)
 
-- poly_compress_avx does not work when compiled for Linux kernel
+- poly_compress_avx does not work when compiled for Linux kernel - it works with current compilers (see change 0b0bf07b5f7c0d7d623087212883c427476eefef, but older compilers still fail this code
 
 - check RISCV-64 Keccak implementation
diff --git a/kem/src/avx2/kyber_poly_avx2.c b/kem/src/avx2/kyber_poly_avx2.c
@@ -34,9 +34,6 @@
 #error "AVX2 support for Kyber mode 4 only"
 #endif
 
-//TODO remove from kernel code
-void poly_compress(uint8_t r[LC_KYBER_POLYCOMPRESSEDBYTES],
-		   const poly *restrict a);
 /**
  * @brief poly_compress
  *
@@ -51,17 +48,13 @@ void poly_compress(uint8_t r[LC_KYBER_POLYCOMPRESSEDBYTES],
 void poly_compress_avx(uint8_t r[LC_KYBER_POLYCOMPRESSEDBYTES],
 		       const poly *restrict a)
 {
-#ifdef LINUX_KERNEL
-	poly_compress(r, a);
-#else /* LINUX_KERNEL */
 	unsigned int i;
 	__m256i f0, f1;
 	__m128i t0, t1;
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
 	/* Due to const, the variables cannot be defined before */
-	LC_FPU_ENABLE;
 	const __m256i v = _mm256_load_si256(&kyber_qdata.vec[_16XV / 16]);
 	const __m256i shift1 = _mm256_set1_epi16(1 << 10);
 	const __m256i mask = _mm256_set1_epi16(31);
@@ -74,6 +67,7 @@ void poly_compress_avx(uint8_t r[LC_KYBER_POLYCOMPRESSEDBYTES],
 				-1, 4, 3, 2, 1, 0);
 #pragma GCC diagnostic pop
 
+	LC_FPU_ENABLE;
 	for (i = 0; i < LC_KYBER_N / 32; i++) {
 		f0 = _mm256_load_si256(&a->vec[2 * i + 0]);
 		f1 = _mm256_load_si256(&a->vec[2 * i + 1]);
@@ -98,7 +92,6 @@ void poly_compress_avx(uint8_t r[LC_KYBER_POLYCOMPRESSEDBYTES],
 		memcpy(&r[20 * i + 16], &t1, 4);
 	}
 	LC_FPU_DISABLE;
-#endif /* LINUX_KERNEL */
 }
 
 /**
@@ -122,7 +115,6 @@ void poly_decompress_avx(poly *restrict r,
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
 	/* Due to const, the variables cannot be defined before */
-	LC_FPU_ENABLE;
 	const __m256i q = _mm256_load_si256(&kyber_qdata.vec[_16XQ / 16]);
 	const __m256i shufbidx =
 		_mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5,
@@ -135,6 +127,7 @@ void poly_decompress_avx(poly *restrict r,
 				 512, 64, 8, 256, 32, 1024);
 #pragma GCC diagnostic pop
 
+	LC_FPU_ENABLE;
 	for (i = 0; i < LC_KYBER_N / 16; i++) {
 		t = _mm_loadl_epi64((__m128i_u *)&a[10 * i + 0]);
 		memcpy(&ti, &a[10 * i + 8], 2);
@@ -168,7 +161,6 @@ void poly_frommsg_avx(poly *restrict r,
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
 	/* Due to const, the variables cannot be defined before */
-	LC_FPU_ENABLE;
 	const __m256i shift =
 		_mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3));
 	const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(
@@ -205,6 +197,7 @@ void poly_frommsg_avx(poly *restrict r,
 	_mm256_store_si256(&r->vec[8 + 2 * i + 1], g3)
 
 	f = _mm256_loadu_si256((__m256i_u *)msg);
+	LC_FPU_ENABLE;
 	FROMMSG64(0);
 	FROMMSG64(1);
 	FROMMSG64(2);
@@ -232,11 +225,11 @@ void poly_tomsg_avx(uint8_t msg[LC_KYBER_INDCPA_MSGBYTES],
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
 	/* Due to const, the variables cannot be defined before */
-	LC_FPU_ENABLE;
 	const __m256i hq = _mm256_set1_epi16((LC_KYBER_Q - 1) / 2);
 	const __m256i hhq = _mm256_set1_epi16((LC_KYBER_Q - 1) / 4);
 #pragma GCC diagnostic pop
 
+	LC_FPU_ENABLE;
 	for (i = 0; i < LC_KYBER_N / 32; i++) {
 		f0 = _mm256_load_si256(&a->vec[2 * i + 0]);
 		f1 = _mm256_load_si256(&a->vec[2 * i + 1]);

diff --git a/linux_kernel/Kbuild b/linux_kernel/Kbuild
@@ -238,9 +238,12 @@ ccflags-y			:= -I$(PWD) -DLINUX_KERNEL -DLC_MEM_ON_HEAP    \
 				   -DMAJVERSION=0 -DMINVERSION=9 -DPATCHLEVEL=0
 
 ifdef CONFIG_X86_64
-ccflags-y			+= -DLC_HOST_X86_64
+ccflags-y			+= -DLC_HOST_X86_64			       \
+				    -mavx2 -mbmi2 -mpopcnt		       \
+				    -Wno-unused-command-line-argument
 else ifdef CONFIG_ARM64
-ccflags-y			+= -DLC_HOST_AARCH64
+ccflags-y			+= -DLC_HOST_AARCH64			       \
+				   -march=armv8-a+simd -Wno-unused-result
 else ifneq ($(and $(CONFIG_RISCV),$(CONFIG_64BIT)),)
 ccflags-y			+= -DLC_HOST_RISCV64
 endif
@@ -896,7 +899,7 @@ ifdef CONFIG_ARM64
 asflags-$(CONFIG_LEANCRYPTO_AES)					       \
 				+= -march=armv8-a+crypto
 leancrypto-$(CONFIG_LEANCRYPTO_AES)					       \
-				+= ../sym/src/asm/ARMv8/aes_armv8_ce.o\
+				+= ../sym/src/asm/ARMv8/aes_armv8_ce.o	       \
 				   ../sym/src/aes_block_armce_v8.o
 else
 leancrypto-$(CONFIG_LEANCRYPTO_AES)					       \