diff --git a/Makefile b/Makefile index b9b7126..3867baa 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ include tests/helloworld/helloworld.mk include tests/intmulntt/intmulntt.mk include tests/karatsuba/karatsuba.mk include tests/keccak/keccak.mk +include tests/kyber-armv7m/kyber-armv7m.mk include tests/montgomery/montgomery.mk include tests/ntt-192/ntt-192.mk include tests/ntt-256/ntt-256.mk diff --git a/asm/manual/kyber-armv7m/add_kyber.s b/asm/manual/kyber-armv7m/add_kyber.s new file mode 120000 index 0000000..cbd60d3 --- /dev/null +++ b/asm/manual/kyber-armv7m/add_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/add_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/add_kyber_opt_m7.s b/asm/manual/kyber-armv7m/add_kyber_opt_m7.s new file mode 120000 index 0000000..2735917 --- /dev/null +++ b/asm/manual/kyber-armv7m/add_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/add_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/barrett_reduce_kyber.s b/asm/manual/kyber-armv7m/barrett_reduce_kyber.s new file mode 120000 index 0000000..8bbd376 --- /dev/null +++ b/asm/manual/kyber-armv7m/barrett_reduce_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/barrett_reduce_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/barrett_reduce_kyber_opt_m7.s b/asm/manual/kyber-armv7m/barrett_reduce_kyber_opt_m7.s new file mode 120000 index 0000000..b111e46 --- /dev/null +++ b/asm/manual/kyber-armv7m/barrett_reduce_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/barrett_reduce_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_16_32_kyber.s b/asm/manual/kyber-armv7m/basemul_16_32_kyber.s new file mode 120000 index 0000000..dff2703 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_16_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/basemul_16_32_kyber.s \ No newline at end of file diff --git 
a/asm/manual/kyber-armv7m/basemul_16_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/basemul_16_32_kyber_opt_m7.s new file mode 120000 index 0000000..9c35354 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_16_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/basemul_16_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_acc_32_16_kyber.s b/asm/manual/kyber-armv7m/basemul_acc_32_16_kyber.s new file mode 120000 index 0000000..ddddf40 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_acc_32_16_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/basemul_acc_32_16_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_acc_32_16_kyber_opt_m7.s b/asm/manual/kyber-armv7m/basemul_acc_32_16_kyber_opt_m7.s new file mode 120000 index 0000000..5236dbb --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_acc_32_16_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/basemul_acc_32_16_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_acc_32_32_kyber.s b/asm/manual/kyber-armv7m/basemul_acc_32_32_kyber.s new file mode 120000 index 0000000..d749960 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_acc_32_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/basemul_acc_32_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_acc_32_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/basemul_acc_32_32_kyber_opt_m7.s new file mode 120000 index 0000000..69ffef2 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_acc_32_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_acc_kyber.s b/asm/manual/kyber-armv7m/basemul_acc_kyber.s new file mode 120000 index 0000000..2775b26 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_acc_kyber.s @@ -0,0 +1 @@ 
+../../../slothy/examples/naive/armv7m/basemul_acc_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_acc_kyber_opt_m7.s b/asm/manual/kyber-armv7m/basemul_acc_kyber_opt_m7.s new file mode 120000 index 0000000..f9dc911 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_acc_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/basemul_acc_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_kyber.s b/asm/manual/kyber-armv7m/basemul_kyber.s new file mode 120000 index 0000000..5295785 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/basemul_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/basemul_kyber_opt_m7.s b/asm/manual/kyber-armv7m/basemul_kyber_opt_m7.s new file mode 120000 index 0000000..daa6d11 --- /dev/null +++ b/asm/manual/kyber-armv7m/basemul_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/basemul_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_16_32_kyber.s b/asm/manual/kyber-armv7m/frombytes_mul_16_32_kyber.s new file mode 120000 index 0000000..02fdcf1 --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_16_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/frombytes_mul_16_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_16_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/frombytes_mul_16_32_kyber_opt_m7.s new file mode 120000 index 0000000..adce3c7 --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_16_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/frombytes_mul_16_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_acc_32_16_kyber.s b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_16_kyber.s new file mode 120000 index 0000000..0b459ed --- /dev/null +++ 
b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_16_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s new file mode 120000 index 0000000..c30285a --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_acc_32_32_kyber.s b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_32_kyber.s new file mode 120000 index 0000000..fe0c202 --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/frombytes_mul_acc_32_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_acc_32_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_32_kyber_opt_m7.s new file mode 120000 index 0000000..62309ec --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_acc_32_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/frombytes_mul_acc_32_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_acc_kyber.s b/asm/manual/kyber-armv7m/frombytes_mul_acc_kyber.s new file mode 120000 index 0000000..33d56b3 --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_acc_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/frombytes_mul_acc_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_acc_kyber_opt_m7.s b/asm/manual/kyber-armv7m/frombytes_mul_acc_kyber_opt_m7.s new file mode 120000 index 0000000..2124eba --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_acc_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s \ No newline at end of 
file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_kyber.s b/asm/manual/kyber-armv7m/frombytes_mul_kyber.s new file mode 120000 index 0000000..408800f --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/frombytes_mul_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/frombytes_mul_kyber_opt_m7.s b/asm/manual/kyber-armv7m/frombytes_mul_kyber_opt_m7.s new file mode 120000 index 0000000..a004a89 --- /dev/null +++ b/asm/manual/kyber-armv7m/frombytes_mul_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/frombytes_mul_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/fromplant_kyber.s b/asm/manual/kyber-armv7m/fromplant_kyber.s new file mode 120000 index 0000000..906ac60 --- /dev/null +++ b/asm/manual/kyber-armv7m/fromplant_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/fromplant_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/fromplant_kyber_opt_m7.s b/asm/manual/kyber-armv7m/fromplant_kyber_opt_m7.s new file mode 120000 index 0000000..bbe5e6e --- /dev/null +++ b/asm/manual/kyber-armv7m/fromplant_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/fromplant_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/intt_kyber.s b/asm/manual/kyber-armv7m/intt_kyber.s new file mode 120000 index 0000000..7f6b5ee --- /dev/null +++ b/asm/manual/kyber-armv7m/intt_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/intt_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/intt_kyber_opt_m7.s b/asm/manual/kyber-armv7m/intt_kyber_opt_m7.s new file mode 120000 index 0000000..929c5fd --- /dev/null +++ b/asm/manual/kyber-armv7m/intt_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/intt_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_acc_kyber.s b/asm/manual/kyber-armv7m/matacc_acc_kyber.s 
new file mode 120000 index 0000000..5690250 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_acc_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_acc_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_acc_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_acc_kyber_opt_m7.s new file mode 120000 index 0000000..2318505 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_acc_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_acc_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_cache_16_32_kyber.s b/asm/manual/kyber-armv7m/matacc_asm_cache_16_32_kyber.s new file mode 120000 index 0000000..855ca53 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_cache_16_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_asm_cache_16_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_cache_16_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_asm_cache_16_32_kyber_opt_m7.s new file mode 120000 index 0000000..5e42d78 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_cache_16_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_asm_cache_16_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_cache_32_16_kyber.s b/asm/manual/kyber-armv7m/matacc_asm_cache_32_16_kyber.s new file mode 120000 index 0000000..82f203f --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_cache_32_16_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_asm_cache_32_16_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_cache_32_16_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_asm_cache_32_16_kyber_opt_m7.s new file mode 120000 index 0000000..756fee7 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_cache_32_16_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_asm_cache_32_16_kyber_opt_m7.s \ No 
newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_cache_32_32_kyber.s b/asm/manual/kyber-armv7m/matacc_asm_cache_32_32_kyber.s new file mode 120000 index 0000000..66fcde8 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_cache_32_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_asm_cache_32_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_cache_32_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_asm_cache_32_32_kyber_opt_m7.s new file mode 120000 index 0000000..e16b20d --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_cache_32_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_asm_cache_32_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_opt_16_32_kyber.s b/asm/manual/kyber-armv7m/matacc_asm_opt_16_32_kyber.s new file mode 120000 index 0000000..54802af --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_opt_16_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_asm_opt_16_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_opt_16_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_asm_opt_16_32_kyber_opt_m7.s new file mode 120000 index 0000000..0ee5a75 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_opt_16_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_asm_opt_16_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_opt_32_16_kyber.s b/asm/manual/kyber-armv7m/matacc_asm_opt_32_16_kyber.s new file mode 120000 index 0000000..071d492 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_opt_32_16_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_asm_opt_32_16_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_opt_32_16_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_asm_opt_32_16_kyber_opt_m7.s new file mode 120000 index 
0000000..9b8f092 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_opt_32_16_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_asm_opt_32_16_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_opt_32_32_kyber.s b/asm/manual/kyber-armv7m/matacc_asm_opt_32_32_kyber.s new file mode 120000 index 0000000..09b2f78 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_opt_32_32_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_asm_opt_32_32_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_asm_opt_32_32_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_asm_opt_32_32_kyber_opt_m7.s new file mode 120000 index 0000000..b063e73 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_asm_opt_32_32_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_asm_opt_32_32_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_kyber.s b/asm/manual/kyber-armv7m/matacc_kyber.s new file mode 120000 index 0000000..5bb7e99 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/matacc_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/matacc_kyber_opt_m7.s b/asm/manual/kyber-armv7m/matacc_kyber_opt_m7.s new file mode 120000 index 0000000..26de8c5 --- /dev/null +++ b/asm/manual/kyber-armv7m/matacc_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/matacc_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/ntt_kyber.s b/asm/manual/kyber-armv7m/ntt_kyber.s new file mode 120000 index 0000000..f6bcc0e --- /dev/null +++ b/asm/manual/kyber-armv7m/ntt_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/ntt_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/ntt_kyber_opt_m7.s b/asm/manual/kyber-armv7m/ntt_kyber_opt_m7.s new file mode 120000 index 0000000..e6a5c5a --- /dev/null +++ 
b/asm/manual/kyber-armv7m/ntt_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/ntt_kyber_opt_m7.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/sub_kyber.s b/asm/manual/kyber-armv7m/sub_kyber.s new file mode 120000 index 0000000..e353062 --- /dev/null +++ b/asm/manual/kyber-armv7m/sub_kyber.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/armv7m/sub_kyber.s \ No newline at end of file diff --git a/asm/manual/kyber-armv7m/sub_kyber_opt_m7.s b/asm/manual/kyber-armv7m/sub_kyber_opt_m7.s new file mode 120000 index 0000000..2c180b5 --- /dev/null +++ b/asm/manual/kyber-armv7m/sub_kyber_opt_m7.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/armv7m/sub_kyber_opt_m7.s \ No newline at end of file diff --git a/tests/dilithium-armv7m/main.c b/tests/dilithium-armv7m/main.c index edaef98..9de7f1f 100644 --- a/tests/dilithium-armv7m/main.c +++ b/tests/dilithium-armv7m/main.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited + * Copyright (c) 2025 SLOTHY authors * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/tests/kyber-armv7m/fips202.c b/tests/kyber-armv7m/fips202.c new file mode 100644 index 0000000..a368e5b --- /dev/null +++ b/tests/kyber-armv7m/fips202.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +/* Based on the public domain implementation in + * crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html + * by Ronny Van Keer + * and the public domain "TweetFips202" implementation + * from https://twitter.com/tweetfips202 + * by Gilles Van Assche, Daniel J. 
Bernstein, and Peter Schwabe */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "fips202.h"
+#include "keccakf1600.h"
+
+#define NROUNDS 24
+#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
+
+#ifdef PROFILE_HASHING
+#include "hal.h"
+extern unsigned long long hash_cycles;
+#endif
+
+/* The permutation is only declared here (its definition is not in this file);
+ * every KeccakF1600_StatePermute call below is redirected to it. */
+void KeccakF1600_StatePermute_adomnicai_m4_opt_m7(uint64_t * state);
+#define KeccakF1600_StatePermute KeccakF1600_StatePermute_adomnicai_m4_opt_m7
+
+
+
+/*************************************************
+ * Name:        keccak_absorb
+ *
+ * Description: Absorb step of Keccak; non-incremental.
+ *              The input is XORed into the state as it is passed in: this
+ *              function does not clear the state itself, so callers must
+ *              zero s beforehand (or have deliberately pre-loaded it).
+ *              The final, padded block is left unpermuted.
+ *
+ * Arguments:   - uint64_t *s: pointer to the Keccak state to absorb into
+ *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
+ *              - const uint8_t *m: pointer to input to be absorbed into s
+ *              - size_t mlen: length of input in bytes
+ *              - uint8_t p: domain-separation byte for different Keccak-derived functions
+ **************************************************/
+static void keccak_absorb(uint64_t *s,
+                          uint32_t r,
+                          const uint8_t *m, size_t mlen,
+                          uint8_t p)
+{
+    /* Absorb every complete r-byte block. */
+    while (mlen >= r) {
+        KeccakF1600_StateXORBytes(s, m, 0, r);
+        KeccakF1600_StatePermute(s);
+        mlen -= r;
+        m += r;
+    }
+
+    /* XOR in the trailing partial block, if there is one. */
+    if (mlen > 0) {
+        KeccakF1600_StateXORBytes(s, m, 0, mlen);
+    }
+
+    /* Padding: p goes directly after the message and 0x80 into the last byte
+     * of the block. When both land on the same byte they are merged first. */
+    if (mlen == r - 1) {
+        p |= 128;
+        KeccakF1600_StateXORBytes(s, &p, mlen, 1);
+    } else {
+        KeccakF1600_StateXORBytes(s, &p, mlen, 1);
+        p = 128;
+        KeccakF1600_StateXORBytes(s, &p, r - 1, 1);
+    }
+}
+
+
+/*************************************************
+ * Name:        keccak_squeezeblocks
+ *
+ * Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each.
+ *              Modifies the state. Can be called multiple times to keep squeezing,
+ *              i.e., is incremental.
+ *
+ * Arguments:   - uint8_t *h: pointer to output blocks
+ *              - size_t nblocks: number of blocks to be squeezed (written to h)
+ *              - uint64_t *s: pointer to in/output Keccak state
+ *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
+ **************************************************/
+static void keccak_squeezeblocks(uint8_t *h, size_t nblocks,
+                                 uint64_t *s,
+                                 uint32_t r)
+{
+    /* One permutation per block, then copy the first r state bytes out. */
+    for (; nblocks > 0; nblocks--) {
+        KeccakF1600_StatePermute(s);
+        KeccakF1600_StateExtractBytes(s, h, 0, r);
+        h += r;
+    }
+}
+
+/*************************************************
+ * Name:        keccak_inc_init
+ *
+ * Description: Initializes the incremental Keccak state to zero.
+ *
+ * Arguments:   - uint64_t *s_inc: pointer to input/output incremental state
+ *                First 25 values represent Keccak state.
+ *                26th value represents either the number of absorbed bytes
+ *                that have not been permuted, or not-yet-squeezed bytes.
+ **************************************************/
+static void keccak_inc_init(uint64_t *s_inc) {
+    /* Clear all 26 words: the 25 state lanes plus the byte counter in s_inc[25]. */
+    for (size_t lane = 0; lane <= 25; ++lane) {
+        s_inc[lane] = 0;
+    }
+}
+/*************************************************
+ * Name:        keccak_inc_absorb
+ *
+ * Description: Incremental keccak absorb
+ *              Preceded by keccak_inc_init, succeeded by keccak_inc_finalize
+ *
+ * Arguments:   - uint64_t *s_inc: pointer to input/output incremental state
+ *                First 25 values represent Keccak state.
+ *                26th value represents either the number of absorbed bytes
+ *                that have not been permuted, or not-yet-squeezed bytes.
+ *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
+ *              - const uint8_t *m: pointer to input to be absorbed into s_inc
+ *              - size_t mlen: length of input in bytes
+ **************************************************/
+static void keccak_inc_absorb(uint64_t *s_inc, uint32_t r, const uint8_t *m,
+                              size_t mlen) {
+    /* s_inc[25] counts the bytes already XORed into the current block. */
+    while (mlen + s_inc[25] >= r) {
+        /* Number of bytes that complete the current block. */
+        const uint64_t fill = r - s_inc[25];
+
+        KeccakF1600_StateXORBytes(s_inc, m, s_inc[25], fill);
+        mlen -= (size_t)fill;
+        m += fill;
+        s_inc[25] = 0;
+
+        KeccakF1600_StatePermute(s_inc);
+    }
+
+    /* Whatever is left does not fill a block: XOR it in and remember how much. */
+    KeccakF1600_StateXORBytes(s_inc, m, s_inc[25], mlen);
+    s_inc[25] += mlen;
+}
+
+/*************************************************
+ * Name:        keccak_inc_finalize
+ *
+ * Description: Finalizes Keccak absorb phase, prepares for squeezing
+ *
+ * Arguments:   - uint64_t *s_inc: pointer to input/output incremental state
+ *                First 25 values represent Keccak state.
+ *                26th value represents either the number of absorbed bytes
+ *                that have not been permuted, or not-yet-squeezed bytes.
+ *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
+ *              - uint8_t p: domain-separation byte for different
+ *                                 Keccak-derived functions
+ **************************************************/
+static void keccak_inc_finalize(uint64_t *s_inc, uint32_t r, uint8_t p) {
+    /* keccak_inc_absorb leaves s_inc[25] < r, so the current block always
+       has room for the byte p. */
+    if (s_inc[25] != r - 1) {
+        /* p and the closing 0x80 land on two different bytes. */
+        KeccakF1600_StateXORBytes(s_inc, &p, s_inc[25], 1);
+        p = 128;
+        KeccakF1600_StateXORBytes(s_inc, &p, r - 1, 1);
+    } else {
+        /* Both land on the last byte of the block: merge them first. */
+        p |= 128;
+        KeccakF1600_StateXORBytes(s_inc, &p, s_inc[25], 1);
+    }
+    s_inc[25] = 0;
+}
+
+/*************************************************
+ * Name:        keccak_inc_squeeze
+ *
+ * Description: Incremental Keccak squeeze; can be called on byte-level
+ *
+ * Arguments:   - uint8_t *h: pointer to output bytes
+ *              - size_t outlen: number of bytes to be squeezed
+ *              - uint64_t *s_inc: pointer to input/output incremental state
+ *                First 25 values represent Keccak state.
+ *                26th value represents either the number of absorbed bytes
+ *                that have not been permuted, or not-yet-squeezed bytes.
+ *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
+ **************************************************/
+static void keccak_inc_squeeze(uint8_t *h, size_t outlen,
+                               uint64_t *s_inc, uint32_t r) {
+    /* First hand out bytes still pending from the current block. */
+    size_t chunk = (outlen < s_inc[25]) ? outlen : (size_t)s_inc[25];
+
+    KeccakF1600_StateExtractBytes(s_inc, h, r - s_inc[25], chunk);
+    h += chunk;
+    outlen -= chunk;
+    s_inc[25] -= chunk;
+
+    /* Then squeeze the remaining necessary blocks */
+    while (outlen > 0) {
+        KeccakF1600_StatePermute(s_inc);
+
+        chunk = (outlen < r) ? outlen : r;
+        KeccakF1600_StateExtractBytes(s_inc, h, 0, chunk);
+        h += chunk;
+        outlen -= chunk;
+        /* Record how many bytes of this block have not been output yet. */
+        s_inc[25] = r - chunk;
+    }
+}
+
+/* SHAKE128, incremental API: zero the context. */
+void shake128_inc_init(shake128incctx *state) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_init(state->ctx);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* SHAKE128, incremental API: absorb inlen bytes of input. */
+void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_absorb(state->ctx, SHAKE128_RATE, input, inlen);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* SHAKE128, incremental API: pad with domain byte 0x1F and end absorbing. */
+void shake128_inc_finalize(shake128incctx *state) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_finalize(state->ctx, SHAKE128_RATE, 0x1F);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* SHAKE128, incremental API: squeeze outlen bytes into output. */
+void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_squeeze(output, outlen, state->ctx, SHAKE128_RATE);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* Copy the whole context from src to dest. */
+void shake128_inc_ctx_clone(shake128incctx* dest, const shake128incctx *src) {
+    memcpy(dest, src, sizeof *dest);
+}
+
+/* No-op: the body only marks state as unused. */
+void shake128_inc_ctx_release(shake128incctx *state) {
+    (void) state;
+}
+
+/* SHAKE256, incremental API: zero the context. */
+void shake256_inc_init(shake256incctx *state) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_init(state->ctx);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* SHAKE256, incremental API: absorb inlen bytes of input. */
+void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_absorb(state->ctx, SHAKE256_RATE, input, inlen);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* SHAKE256, incremental API: pad with domain byte 0x1F and end absorbing. */
+void shake256_inc_finalize(shake256incctx *state) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_finalize(state->ctx, SHAKE256_RATE, 0x1F);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* SHAKE256, incremental API: squeeze outlen bytes into output. */
+void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state) {
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_inc_squeeze(output, outlen, state->ctx, SHAKE256_RATE);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/* Copy the whole context from src to dest. */
+void shake256_inc_ctx_clone(shake256incctx* dest, const shake256incctx *src) {
+    memcpy(dest, src, sizeof *dest);
+}
+
+/* No-op: the body only marks state as unused. */
+void shake256_inc_ctx_release(shake256incctx *state) {
+    (void) state;
+}
+
+/********** cSHAKE128 ***********/
+
+/* Zero the state, absorb and permute the 8-byte customization block built
+ * from cstm, then absorb the input with domain byte 0x04. */
+void cshake128_simple_absorb(shake128ctx *state, uint16_t cstm, const uint8_t *in, size_t inlen)
+{
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    uint8_t sep[8];
+
+    /* Start from an all-zero Keccak state. */
+    for (size_t lane = 0; lane < 25; lane++) {
+        state->ctx[lane] = 0;
+    }
+
+    /* Absorb customization (domain-separation) string */
+    sep[0] = 0x01;
+    sep[1] = 0xa8;
+    sep[2] = 0x01;
+    sep[3] = 0x00;
+    sep[4] = 0x01;
+    sep[5] = 16; // fixed bitlen of cstm
+    sep[6] = cstm & 0xff;
+    sep[7] = cstm >> 8;
+
+    KeccakF1600_StateXORBytes(state->ctx, sep, 0, 8);
+    KeccakF1600_StatePermute(state->ctx);
+
+    /* Absorb input */
+    keccak_absorb(state->ctx, SHAKE128_RATE, in, inlen, 0x04);
+
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+
+/* Squeeze nblocks full SHAKE128_RATE-byte blocks from a cSHAKE128 state. */
+void cshake128_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state)
+{
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE128_RATE);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+
+/* One-shot cSHAKE128 with a 16-bit customization value: customization
+ * block, input, finalize with 0x04, then squeeze outlen bytes. */
+void cshake128_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *in, size_t inlen)
+{
+    shake128incctx state;
+    uint8_t sep[8];
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+
+    keccak_inc_init(state.ctx);
+
+    /* Absorb customization (domain-separation) string */
+    sep[0] = 0x01;
+    sep[1] = 0xa8;
+    sep[2] = 0x01;
+    sep[3] = 0x00;
+    sep[4] = 0x01;
+    sep[5] = 16; // fixed bitlen of cstm
+    sep[6] = cstm & 0xff;
+    sep[7] = cstm >> 8;
+
+    KeccakF1600_StateXORBytes(state.ctx, sep, 0, 8);
+    KeccakF1600_StatePermute(state.ctx);
+
+    /* Absorb input */
+    keccak_inc_absorb(state.ctx, SHAKE128_RATE, in, inlen);
+    keccak_inc_finalize(state.ctx, SHAKE128_RATE, 0x04);
+
+    /* Squeeze output */
+    keccak_inc_squeeze(output, outlen, state.ctx, SHAKE128_RATE);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+
+
+/*************************************************
+ * Name:        shake128_absorb
+ *
+ * Description: Absorb step of the SHAKE128 XOF.
+ *              non-incremental, starts by zeroing the state.
+ *
+ * Arguments:   - shake128ctx *state: pointer to (uninitialized) output Keccak state
+ *              - const uint8_t *input: pointer to input to be absorbed into state
+ *              - size_t inlen: length of input in bytes
+ **************************************************/
+void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen)
+{
+#ifdef PROFILE_HASHING
+    const uint64_t t_begin = hal_get_time();
+#endif
+    /* keccak_absorb only XORs into the state, so clear it first. */
+    for (size_t lane = 0; lane < 25; lane++) {
+        state->ctx[lane] = 0;
+    }
+
+    keccak_absorb(state->ctx, SHAKE128_RATE, input, inlen, 0x1F);
+#ifdef PROFILE_HASHING
+    const uint64_t t_end = hal_get_time();
+    hash_cycles += t_end - t_begin;
+#endif
+}
+
+/*************************************************
+ * Name:        shake128_squeezeblocks
+ *
+ * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each.
+ *              Modifies the state. Can be called multiple times to keep squeezing,
+ *              i.e., is incremental.
+ *
+ * Arguments:   - uint8_t *output: pointer to output blocks
+ *              - size_t nblocks: number of blocks to be squeezed (written to output)
+ *              - shake128ctx *state: pointer to in/output Keccak state
+ **************************************************/
+/* Fixed 168-byte block that shake128_squeezeblocks below copies out. */
+static const unsigned char shake128_bytes[168] = {
+    0x30, 0x29, 0x11, 0x08, 0x3E, 0x94, 0xD8, 0xC0,
+    0xD7, 0xB3, 0xD6, 0xDA, 0x94, 0xDF, 0x8A, 0x4F,
+    0x7F, 0x95, 0x52, 0xB1, 0xBC, 0xE4, 0xD2, 0x1D,
+    0xE7, 0xBD, 0x60, 0xD4, 0x38, 0xD0, 0x7D, 0x8E,
+    0xF6, 0xFA, 0x2D, 0xCD, 0xE1, 0x28, 0xA8, 0x91,
+    0x00, 0xA2, 0x8C, 0x3A, 0x5E, 0xFA, 0xEE, 0xAD,
+    0xDF, 0x8A, 0x19, 0xD7, 0xC5, 0xFA, 0x28, 0x30,
+    0x81, 0xE4, 0x27, 0xB5, 0x66, 0x9A, 0xBF, 0x10,
+    0x93, 0x78, 0xC2, 0xF3, 0xA7, 0xE5, 0xA3, 0xBF,
+    0x1B, 0xA9, 0x62, 0x3C, 0xEE, 0x90, 0x09, 0x06,
+    0x80, 0xE7, 0x69, 0xA1, 0xFC, 0x9F, 0xCF, 0xF7,
+    0xF0, 0x83, 0xBC, 0x36, 0x86, 0xC6, 0xED, 0x11,
+    0xA8, 0xA4, 0xD4, 0xC9, 0x1A, 0xB1, 0x54, 0x24,
+    0xE0, 0xBF, 0x2D, 0xA3, 0x84, 0xF4, 0x03, 0xA9,
+    0x0B, 0x9B, 0xEA, 0xA3, 0x09, 0x31, 0x6B, 0xA9,
+    0x2D, 0x09, 0x8A, 0x68, 0x57, 0x89, 0x9A, 0xDB,
+    0x67, 0x95, 0x39, 0xF0, 0xE7, 0xFF, 0x47, 0x39,
+    0x80, 0xB8, 0xAE, 0x88, 0x35, 0x61, 0xCA, 0x14,
+    0x95, 0xBE, 0x45, 0x0F, 0x10, 0x34, 0x0A, 0x9B,
+    0x08, 0x93, 0x35, 0xAB, 0x4A, 0xE4, 0x43, 0x7F,
+    0x9E, 0x36, 0x63, 0x02, 0xF6, 0x56, 0x73, 0xCB
+};
+
+/*
+ * NOTE: this definition is a benchmarking stub, not a real squeeze.
+ * It ignores state and copies shake128_bytes into output, so that
+ * Keccak itself is not part of what gets benchmarked. Only nblocks == 1
+ * is supported; any other value hangs here on purpose.
+ */
+void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state)
+{
+    (void)state;
+    // we only need this inside of the mat acc functions, but we don't want to benchmark keccak itself,
+    // just copy out fixed values
+
+    // this should not happen in our tests; get stuck in that case
+    if (nblocks != 1) {
+        /* A loop with no side effects and a non-constant condition may be
+         * assumed to terminate (C11 6.8.5p6) and be optimized away; reading
+         * a volatile on every iteration keeps the hang in place. */
+        volatile int stuck = 1;
+        while (stuck) {
+        }
+    }
+
+    memcpy(output, shake128_bytes, sizeof shake128_bytes);
+}
+
+/* No-op: the body only marks state as unused. */
+void shake128_ctx_release(shake128ctx *state) {
+    (void) state;
+}
+
+/* Copy the whole context from src to dest. */
+void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src) {
+    memcpy(dest, src, sizeof(shake128ctx));
+}
+
+/* Absorb step of SHAKE256; non-incremental, zeroes the state first and
+ * pads with domain byte 0x1F. */
+void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen)
+{
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+    /* keccak_absorb only XORs into the state, so clear it first. */
+    for (size_t i = 0; i < 25; i++) {
+        state->ctx[i] = 0;
+    }
+
+    keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F);
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+
+/* Squeeze nblocks full SHAKE256_RATE-byte blocks into output. */
+void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state)
+{
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+    keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE);
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+/*************************************************
+ * Name:        shake256
+ *
+ * Description: SHAKE256 XOF with non-incremental API
+ *
+ * Arguments:   - uint8_t *output:      pointer to output
+ *              - size_t outlen:        requested output length in bytes
+ *              - const uint8_t *input: pointer to input
+ *              - size_t inlen:         length of input in bytes
+ **************************************************/
+void shake256(uint8_t *output, size_t outlen,
+              const uint8_t *input, size_t inlen)
+{
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+    shake256incctx state;
+
+    keccak_inc_init(state.ctx);
+
+    /* Absorb input */
+    keccak_inc_absorb(state.ctx, SHAKE256_RATE, input, inlen);
+    keccak_inc_finalize(state.ctx, SHAKE256_RATE, 0x1F);
+
+    /* Squeeze output */
+    keccak_inc_squeeze(output, outlen, state.ctx, SHAKE256_RATE);
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+/* No-op: the body only marks state as unused. */
+void shake256_ctx_release(shake256ctx *state) {
+    (void) state;
+}
+
+/* Copy the whole context from src to dest. */
+void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src) {
+    memcpy(dest, src, sizeof(shake256ctx));
+}
+
+
+/*************************************************
+ * Name:        sha3_256
+ *
+ * Description: SHA3-256 with non-incremental API
+ *
+ * Arguments:   - uint8_t *output:      pointer to output
+ *              - const uint8_t *input: pointer to input
+ *              - size_t inlen:   length of input in bytes
+ 
**************************************************/ +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) +{ +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + sha3_256incctx state; + keccak_inc_init(state.ctx); + + /* Absorb input */ + keccak_inc_absorb(state.ctx, SHA3_256_RATE, input, inlen); + keccak_inc_finalize(state.ctx, SHA3_256_RATE, 0x06); + + /* Squeeze output */ + keccak_inc_squeeze(output, 32, state.ctx, SHA3_256_RATE); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} +void sha3_256_inc_init(sha3_256incctx *state) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_inc_init(state->ctx); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_inc_absorb(state->ctx, SHA3_256_RATE, input, inlen); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + uint8_t t[SHA3_256_RATE]; + keccak_inc_finalize(state->ctx, SHA3_256_RATE, 0x06); + + keccak_squeezeblocks(t, 1, state->ctx, SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src) { + memcpy(dest, src, sizeof(sha3_256incctx)); +} + +void sha3_256_inc_ctx_release(sha3_256incctx *state) { + (void) state; +} + +void sha3_384_inc_init(sha3_384incctx *state) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_inc_init(state->ctx); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void 
sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_inc_absorb(state->ctx, SHA3_384_RATE, input, inlen); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + uint8_t t[SHA3_384_RATE]; + keccak_inc_finalize(state->ctx, SHA3_384_RATE, 0x06); + + keccak_squeezeblocks(t, 1, state->ctx, SHA3_384_RATE); + + for (size_t i = 0; i < 48; i++) { + output[i] = t[i]; + } +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src) { + memcpy(dest, src, sizeof(sha3_384incctx)); +} + +void sha3_384_inc_ctx_release(sha3_384incctx *state) { + (void) state; +} + +/************************************************* + * Name: sha3_384 + * + * Description: SHA3-384 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + sha3_384incctx state; + keccak_inc_init(state.ctx); + + /* Absorb input */ + keccak_inc_absorb(state.ctx, SHA3_384_RATE, input, inlen); + keccak_inc_finalize(state.ctx, SHA3_384_RATE, 0x06); + + /* Squeeze output */ + keccak_inc_squeeze(output, 48, state.ctx, SHA3_384_RATE); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} +/************************************************* + * Name: sha3_512 + * + * Description: SHA3-512 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to 
input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) +{ +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + sha3_512incctx state; + keccak_inc_init(state.ctx); + + /* Absorb input */ + keccak_inc_absorb(state.ctx, SHA3_512_RATE, input, inlen); + keccak_inc_finalize(state.ctx, SHA3_512_RATE, 0x06); + + /* Squeeze output */ + keccak_inc_squeeze(output, 64, state.ctx, SHA3_512_RATE); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} +void sha3_512_inc_init(sha3_512incctx *state) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_inc_init(state->ctx); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_inc_absorb(state->ctx, SHA3_512_RATE, input, inlen); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state) { +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + uint8_t t[SHA3_512_RATE]; + keccak_inc_finalize(state->ctx, SHA3_512_RATE, 0x06); + + keccak_squeezeblocks(t, 1, state->ctx, SHA3_512_RATE); + + for (size_t i = 0; i < 64; i++) { + output[i] = t[i]; + } +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + +void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src) { + memcpy(dest, src, sizeof(sha3_512incctx)); +} + +void sha3_512_inc_ctx_release(sha3_512incctx *state) { + (void) state; +} + +/********** cSHAKE256 ***********/ + +void cshake256_simple_absorb(shake256ctx *state, uint16_t cstm, const uint8_t *in, size_t inlen) +{ +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); 
+#endif + uint8_t sep[8]; + size_t i; + + for (i = 0; i < 25; i++) + state->ctx[i] = 0; + + /* Absorb customization (domain-separation) string */ + sep[0] = 0x01; + sep[1] = 0x88; + sep[2] = 0x01; + sep[3] = 0x00; + sep[4] = 0x01; + sep[5] = 16; // fixed bitlen of cstm + sep[6] = cstm & 0xff; + sep[7] = cstm >> 8; + + KeccakF1600_StateXORBytes(state->ctx, sep, 0, 8); + KeccakF1600_StatePermute(state->ctx); + + /* Absorb input */ + keccak_absorb(state->ctx, SHAKE256_RATE, in, inlen, 0x04); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + + +void cshake256_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state) +{ +#ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); +#endif + keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} + + +void cshake256_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *in, size_t inlen) +{ + shake256incctx state; + uint8_t sep[8]; + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif + + + keccak_inc_init(state.ctx); + + /* Absorb customization (domain-separation) string */ + sep[0] = 0x01; + sep[1] = 0x88; + sep[2] = 0x01; + sep[3] = 0x00; + sep[4] = 0x01; + sep[5] = 16; // fixed bitlen of cstm + sep[6] = cstm & 0xff; + sep[7] = cstm >> 8; + + KeccakF1600_StateXORBytes(state.ctx, sep, 0, 8); + KeccakF1600_StatePermute(state.ctx); + + /* Absorb input */ + keccak_inc_absorb(state.ctx, SHAKE256_RATE, in, inlen); + keccak_inc_finalize(state.ctx, SHAKE256_RATE, 0x04); + + /* Squeeze output */ + keccak_inc_squeeze(output, outlen, state.ctx, SHAKE256_RATE); +#ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1-t0); +#endif +} diff --git a/tests/kyber-armv7m/fips202.h b/tests/kyber-armv7m/fips202.h new file mode 100644 index 0000000..3db5f5b --- /dev/null +++ b/tests/kyber-armv7m/fips202.h @@ -0,0 +1,175 @@ +// 
SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +#ifndef FIPS202_H +#define FIPS202_H + +#include +#include + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 +#define SHA3_256_RATE 136 +#define SHA3_384_RATE 104 +#define SHA3_512_RATE 72 + + +// Context for incremental API +typedef struct { + uint64_t ctx[26]; +} shake128incctx; + +// Context for non-incremental API +typedef struct { + uint64_t ctx[25]; +} shake128ctx; + +// Context for incremental API +typedef struct { + uint64_t ctx[26]; +} shake256incctx; + +// Context for non-incremental API +typedef struct { + uint64_t ctx[25]; +} shake256ctx; + +// Context for incremental API +typedef struct { + uint64_t ctx[26]; +} sha3_256incctx; + +// Context for incremental API +typedef struct { + uint64_t ctx[26]; +} sha3_384incctx; + +// Context for incremental API +typedef struct { + uint64_t ctx[26]; +} sha3_512incctx; + +/* Initialize the state and absorb the provided input. + * + * This function does not support being called multiple times + * with the same state. + */ +void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen); +/* Squeeze output out of the sponge. + * + * Supports being called multiple times + */ +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state); +/* Free the state */ +void shake128_ctx_release(shake128ctx *state); +/* Copy the state. */ +void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src); + +void cshake128_simple_absorb(shake128ctx *state, uint16_t cstm, const uint8_t *input, size_t inlen); +void cshake128_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state); +void cshake128_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *input, size_t inlen); + +/* Initialize incremental hashing API */ +void shake128_inc_init(shake128incctx *state); +/* Absorb more information into the XOF. + * + * Can be called multiple times. 
+ */ +void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen); +/* Finalize the XOF for squeezing */ +void shake128_inc_finalize(shake128incctx *state); +/* Squeeze output out of the sponge. + * + * Supports being called multiple times + */ +void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state); +/* Copy the context of the SHAKE128 XOF */ +void shake128_inc_ctx_clone(shake128incctx* dest, const shake128incctx *src); +/* Free the context of the SHAKE128 XOF */ +void shake128_inc_ctx_release(shake128incctx *state); + +/* Initialize the state and absorb the provided input. + * + * This function does not support being called multiple times + * with the same state. + */ +void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen); +/* Squeeze output out of the sponge. + * + * Supports being called multiple times + */ +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state); +/* Free the context held by this XOF */ +void shake256_ctx_release(shake256ctx *state); +/* Copy the context held by this XOF */ +void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src); + +void cshake256_simple_absorb(shake256ctx *state, uint16_t cstm, const uint8_t *input, size_t inlen); +void cshake256_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state); +void cshake256_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *input, size_t inlen); + +/* Initialize incremental hashing API */ +void shake256_inc_init(shake256incctx *state); +void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen); +/* Prepares for squeeze phase */ +void shake256_inc_finalize(shake256incctx *state); + + + +/* Squeeze output out of the sponge. 
+ * + * Supports being called multiple times + */ +void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state); +/* Copy the state */ +void shake256_inc_ctx_clone(shake256incctx* dest, const shake256incctx *src); +/* Free the state */ +void shake256_inc_ctx_release(shake256incctx *state); + +/* One-stop SHAKE128 call */ +void shake128(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen); + +/* One-stop SHAKE256 call */ +void shake256(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen); + +/* Initialize the incremental hashing state */ +void sha3_256_inc_init(sha3_256incctx *state); +/* Absorb blocks into SHA3 */ +void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen); +/* Obtain the output of the function and free `state` */ +void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state); +/* Copy the context */ +void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src); +/* Release the state, don't use if `_finalize` has been used */ +void sha3_256_inc_ctx_release(sha3_256incctx *state); + +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen); + +/* Initialize the incremental hashing state */ +void sha3_384_inc_init(sha3_384incctx *state); +/* Absorb blocks into SHA3 */ +void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen); +/* Obtain the output of the function and free `state` */ +void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state); +/* Copy the context */ +void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src); +/* Release the state, don't use if `_finalize` has been used */ +void sha3_384_inc_ctx_release(sha3_384incctx *state); + +/* One-stop SHA3-384 shop */ +void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen); + +/* Initialize the incremental hashing state */ +void sha3_512_inc_init(sha3_512incctx *state); +/* Absorb blocks into SHA3 */ +void 
sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen); +/* Obtain the output of the function and free `state` */ +void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state); +/* Copy the context */ +void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src); +/* Release the state, don't use if `_finalize` has been used */ +void sha3_512_inc_ctx_release(sha3_512incctx *state); + +/* One-stop SHA3-512 shop */ +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen); +#endif diff --git a/tests/kyber-armv7m/frombytes-asm.h b/tests/kyber-armv7m/frombytes-asm.h new file mode 100644 index 0000000..1f861c6 --- /dev/null +++ b/tests/kyber-armv7m/frombytes-asm.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +#ifndef FROMBYTES_ASM_H +#define FROMBYTES_ASM_H + +#include + +static const int32_t zetas[64] = {21932846, 3562152210, 752167598, 3417653460, 2112004045, 932791035, 2951903026, 1419184148, 1817845876, 3434425636, 4233039261, 300609006, 975366560, 2781600929, 3889854731, 3935010590, 2197155094, 2130066389, 3598276897, 2308109491, 2382939200, 1228239371, 1884934581, 3466679822, 1211467195, 2977706375, 3144137970, 3080919767, 945692709, 3015121229, 345764865, 826997308, 2043625172, 2964804700, 2628071007, 4154339049, 483812778, 3288636719, 2696449880, 2122325384, 1371447954, 411563403, 3577634219, 976656727, 2708061387, 723783916, 3181552825, 3346694253, 3617629408, 1408862808, 519937465, 1323711759, 1474661346, 2773859924, 3580214553, 1143088323, 2221668274, 1563682897, 2417773720, 1327582262, 2722253228, 3786641338, 1141798155, 2779020594}; + + +void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void frombytes_mul_asm_16_32_opt_m7(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); + +void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t 
zetas[64]); +void frombytes_mul_asm_acc_32_32_opt_m7(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); + +void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp); +void frombytes_mul_asm_acc_32_16_opt_m7(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp); + + + +void frombytes_mul_asm_16_32_wrap(int32_t *r_tmp, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_16_32(r_tmp, b, c, zetas); +} +void frombytes_mul_asm_16_32_opt_m7_wrap(int32_t *r_tmp, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_16_32_opt_m7(r_tmp, b, c, zetas); +} + +void frombytes_mul_asm_acc_32_32_wrap(int32_t *r_tmp, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_acc_32_32(r_tmp, b, c, zetas); +} +void frombytes_mul_asm_acc_32_32_opt_m7_wrap(int32_t *r_tmp, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_acc_32_32_opt_m7(r_tmp, b, c, zetas); +} + +void frombytes_mul_asm_acc_32_16_wrap(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t *r_tmp){ + frombytes_mul_asm_acc_32_16(r, b, c, zetas, r_tmp); +} +void frombytes_mul_asm_acc_32_16_opt_m7_wrap(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t *r_tmp){ + frombytes_mul_asm_acc_32_16_opt_m7(r, b, c, zetas, r_tmp); +} + + + +#endif \ No newline at end of file diff --git a/tests/kyber-armv7m/keccakf1600-misc.s b/tests/kyber-armv7m/keccakf1600-misc.s new file mode 100644 index 0000000..fb3f8eb --- /dev/null +++ b/tests/kyber-armv7m/keccakf1600-misc.s @@ -0,0 +1,338 @@ +@ +@ Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +@ Joan Daemen, Michaƫl Peeters, Gilles Van Assche and Ronny Van Keer, hereby +@ denoted as "the implementer". +@ Additional optimizations by Alexandre Adomnicai. 
+@ +@ For more information, feedback or questions, please refer to our websites: +@ http://keccak.noekeon.org/ +@ http://keyak.noekeon.org/ +@ http://ketje.noekeon.org/ +@ +@ To the extent possible under law, the implementer has waived all copyright +@ and related or neighboring rights to the source code in this file. +@ http://creativecommons.org/publicdomain/zero/1.0/ +@ + +@ WARNING: These functions work only on little endian CPU with@ ARMv7m architecture (ARM Cortex-M3, ...). + + + .thumb + .syntax unified +.text + + @ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 +.macro toBitInterleaving x0,x1,s0,s1,t,over + + and \t,\x0,#0x55555555 + orr \t,\t,\t, LSR #1 + and \t,\t,#0x33333333 + orr \t,\t,\t, LSR #2 + and \t,\t,#0x0F0F0F0F + orr \t,\t,\t, LSR #4 + and \t,\t,#0x00FF00FF + bfi \t,\t,#8, #8 + .if \over != 0 + lsr \s0,\t, #8 + .else + eor \s0,\s0,\t, LSR #8 + .endif + + and \t,\x1,#0x55555555 + orr \t,\t,\t, LSR #1 + and \t,\t,#0x33333333 + orr \t,\t,\t, LSR #2 + and \t,\t,#0x0F0F0F0F + orr \t,\t,\t, LSR #4 + and \t,\t,#0x00FF00FF + orr \t,\t,\t, LSR #8 + eor \s0,\s0,\t, LSL #16 + + and \t,\x0,#0xAAAAAAAA + orr \t,\t,\t, LSL #1 + and \t,\t,#0xCCCCCCCC + orr \t,\t,\t, LSL #2 + and \t,\t,#0xF0F0F0F0 + orr \t,\t,\t, LSL #4 + and \t,\t,#0xFF00FF00 + orr \t,\t,\t, LSL #8 + .if \over != 0 + lsr \s1,\t, #16 + .else + eor \s1,\s1,\t, LSR #16 + .endif + + and \t,\x1,#0xAAAAAAAA + orr \t,\t,\t, LSL #1 + and \t,\t,#0xCCCCCCCC + orr \t,\t,\t, LSL #2 + and \t,\t,#0xF0F0F0F0 + orr \t,\t,\t, LSL #4 + and \t,\t,#0xFF00FF00 + orr \t,\t,\t, LSL #8 + bfc \t, #0, #16 + eors \s1,\s1,\t + .endm + + @ Credit: Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 +.macro fromBitInterleaving x0, x1, t + + movs \t, \x0 @ t = x0@ + bfi \x0, \x1, #16, #16 @ x0 = (x0 & 0x0000FFFF) | (x1 << 16)@ + bfc \x1, #0, #16 @ x1 = (t >> 16) | (x1 & 0xFFFF0000)@ + orr \x1, \x1, \t, LSR #16 + + eor \t, \x0, \x0, LSR #8 @ t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL@ x0 = x0 ^ t ^ (t << 8)@ + and \t, #0x0000FF00 + eors \x0, \x0, \t + eor \x0, \x0, \t, LSL #8 + + eor \t, \x0, \x0, LSR #4 @ t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL@ x0 = x0 ^ t ^ (t << 4)@ + and \t, #0x00F000F0 + eors \x0, \x0, \t + eor \x0, \x0, \t, LSL #4 + + eor \t, \x0, \x0, LSR #2 @ t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL@ x0 = x0 ^ t ^ (t << 2)@ + and \t, #0x0C0C0C0C + eors \x0, \x0, \t + eor \x0, \x0, \t, LSL #2 + + eor \t, \x0, \x0, LSR #1 @ t = (x0 ^ (x0 >> 1)) & 0x22222222UL@ x0 = x0 ^ t ^ (t << 1)@ + and \t, #0x22222222 + eors \x0, \x0, \t + eor \x0, \x0, \t, LSL #1 + + eor \t, \x1, \x1, LSR #8 @ t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL@ x1 = x1 ^ t ^ (t << 8)@ + and \t, #0x0000FF00 + eors \x1, \x1, \t + eor \x1, \x1, \t, LSL #8 + + eor \t, \x1, \x1, LSR #4 @ t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL@ x1 = x1 ^ t ^ (t << 4)@ + and \t, #0x00F000F0 + eors \x1, \x1, \t + eor \x1, \x1, \t, LSL #4 + + eor \t, \x1, \x1, LSR #2 @ t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL@ x1 = x1 ^ t ^ (t << 2)@ + and \t, #0x0C0C0C0C + eors \x1, \x1, \t + eor \x1, \x1, \t, LSL #2 + + eor \t, \x1, \x1, LSR #1 @ t = (x1 ^ (x1 >> 1)) & 0x22222222UL@ x1 = x1 ^ t ^ (t << 1)@ + and \t, #0x22222222 + eors \x1, \x1, \t + eor \x1, \x1, \t, LSL #1 + .endm + + + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_Initialize( void ) +@ +.align 8 +.global KeccakF1600_Initialize +KeccakF1600_Initialize: + bx lr + + + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_StateXORBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +@ +.align 8 +.global 
KeccakF1600_StateXORBytes +KeccakF1600_StateXORBytes: + cbz r3, KeccakF1600_StateXORBytes_Exit1 + push {r4 - r8, lr} @ then + bic r4, r2, #7 @ offset &= ~7 + adds r0, r0, r4 @ add whole lane offset to state pointer + ands r2, r2, #7 @ offset &= 7 (part not lane aligned) + beq KeccakF1600_StateXORBytes_CheckLanes @ .if offset != 0 + movs r4, r3 @ then, do remaining bytes in first lane + rsb r5, r2, #8 @ max size in lane = 8 - offset + cmp r4, r5 + ble KeccakF1600_StateXORBytes_BytesAlign + movs r4, r5 +KeccakF1600_StateXORBytes_BytesAlign: + sub r8, r3, r4 @ size left + movs r3, r4 + bl __KeccakF1600_StateXORBytesInLane + mov r3, r8 +KeccakF1600_StateXORBytes_CheckLanes: + lsrs r2, r3, #3 @ .if length >= 8 + beq KeccakF1600_StateXORBytes_Bytes + mov r8, r3 + bl __KeccakF1600_StateXORLanes + and r3, r8, #7 +KeccakF1600_StateXORBytes_Bytes: + cbz r3, KeccakF1600_StateXORBytes_Exit + movs r2, #0 + bl __KeccakF1600_StateXORBytesInLane +KeccakF1600_StateXORBytes_Exit: + pop {r4 - r8, pc} +KeccakF1600_StateXORBytes_Exit1: + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateXORLanes +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 laneCount +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to input +@ +@ Changed: r2-r7 +@ +.align 8 +__KeccakF1600_StateXORLanes: +__KeccakF1600_StateXORLanes_LoopAligned: + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldrd r6, r7, [r0] + toBitInterleaving r4, r5, r6, r7, r3, 0 + strd r6, r7, [r0], #8 + subs r2, r2, #1 + bne __KeccakF1600_StateXORLanes_LoopAligned + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateXORBytesInLane +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 offset in lane +@ r3 length +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to input +@ +@ Changed: r2-r7 +@ +.align 8 +__KeccakF1600_StateXORBytesInLane: + movs r4, #0 + movs r5, #0 + 
push { r4 - r5 } + add r2, r2, sp +__KeccakF1600_StateXORBytesInLane_Loop: + ldrb r5, [r1], #1 + strb r5, [r2], #1 + subs r3, r3, #1 + bne __KeccakF1600_StateXORBytesInLane_Loop + pop { r4 - r5 } + ldrd r6, r7, [r0] + toBitInterleaving r4, r5, r6, r7, r3, 0 + strd r6, r7, [r0], #8 + bx lr + + + + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_StateExtractBytes(void *state, unsigned char *data, unsigned int offset, unsigned int length) +@ +.align 8 +.global KeccakF1600_StateExtractBytes +KeccakF1600_StateExtractBytes: + cbz r3, KeccakF1600_StateExtractBytes_Exit1 @ .if length != 0 + push {r4 - r8, lr} @ then + bic r4, r2, #7 @ offset &= ~7 + adds r0, r0, r4 @ add whole lane offset to state pointer + ands r2, r2, #7 @ offset &= 7 (part not lane aligned) + beq KeccakF1600_StateExtractBytes_CheckLanes @ .if offset != 0 + movs r4, r3 @ then, do remaining bytes in first lane + rsb r5, r2, #8 @ max size in lane = 8 - offset + cmp r4, r5 + ble KeccakF1600_StateExtractBytes_BytesAlign + movs r4, r5 +KeccakF1600_StateExtractBytes_BytesAlign: + sub r8, r3, r4 @ size left + movs r3, r4 + bl __KeccakF1600_StateExtractBytesInLane + mov r3, r8 +KeccakF1600_StateExtractBytes_CheckLanes: + lsrs r2, r3, #3 @ .if length >= 8 + beq KeccakF1600_StateExtractBytes_Bytes + mov r8, r3 + bl __KeccakF1600_StateExtractLanes + and r3, r8, #7 +KeccakF1600_StateExtractBytes_Bytes: + cbz r3, KeccakF1600_StateExtractBytes_Exit + movs r2, #0 + bl __KeccakF1600_StateExtractBytesInLane +KeccakF1600_StateExtractBytes_Exit: + pop {r4 - r8, pc} +KeccakF1600_StateExtractBytes_Exit1: + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateExtractLanes +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 laneCount +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to input +@ +@ Changed: r2-r5 +@ +.align 8 +__KeccakF1600_StateExtractLanes: 
+__KeccakF1600_StateExtractLanes_LoopAligned: + ldrd r4, r5, [r0], #8 + fromBitInterleaving r4, r5, r3 + str r4, [r1], #4 + subs r2, r2, #1 + str r5, [r1], #4 + bne __KeccakF1600_StateExtractLanes_LoopAligned + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateExtractBytesInLane +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 offset in lane +@ r3 length +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to input +@ +@ Changed: r2-r6 +@ +.align 8 +__KeccakF1600_StateExtractBytesInLane: + ldrd r4, r5, [r0], #8 + fromBitInterleaving r4, r5, r6 + push {r4, r5} + add r2, sp, r2 +__KeccakF1600_StateExtractBytesInLane_Loop: + ldrb r4, [r2], #1 + subs r3, r3, #1 + strb r4, [r1], #1 + bne __KeccakF1600_StateExtractBytesInLane_Loop + add sp, #8 + bx lr + + diff --git a/tests/kyber-armv7m/keccakf1600.h b/tests/kyber-armv7m/keccakf1600.h new file mode 100644 index 0000000..e017bf1 --- /dev/null +++ b/tests/kyber-armv7m/keccakf1600.h @@ -0,0 +1,10 @@ +#ifndef KECCAKF1600_H +#define KECCAKF1600_H + +#include + +void KeccakF1600_StateExtractBytes(uint64_t *state, unsigned char *data, unsigned int offset, unsigned int length); +void KeccakF1600_StateXORBytes(uint64_t *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakF1600_StatePermute(uint64_t * state); + +#endif \ No newline at end of file diff --git a/tests/kyber-armv7m/kyber-armv7m.mk b/tests/kyber-armv7m/kyber-armv7m.mk new file mode 100644 index 0000000..2b34b25 --- /dev/null +++ b/tests/kyber-armv7m/kyber-armv7m.mk @@ -0,0 +1,91 @@ +# Test name - needs to match the directory name +TESTS += kyber-armv7m + +# All further variables must be prefixed with the capitalized test name + +# Platforms this test should run on (matching the directory name in envs/) +KYBER_ARMV7M_PLATFORMS += m7-an500 +KYBER_ARMV7M_PLATFORMS += nucleo-f767zi +KYBER_ARMV7M_PLATFORMS += stm32f4discovery + +# C sources 
required for this test +KYBER_ARMV7M_SOURCES += main.c +KYBER_ARMV7M_SOURCES += fips202.c + +# Keccak source +KYBER_ARMV7M_ASMS += ../../asm/manual/keccak/keccakf1600_adomnicai_m4_opt_m7.s +KYBER_ARMV7M_ASMS += keccakf1600-misc.s + +# Assembly sources required for this test +KYBER_ARMV7M_ASM_DIR = ../../asm/manual/kyber-armv7m +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/ntt_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/ntt_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/intt_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/intt_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/add_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/add_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/sub_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/sub_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/barrett_reduce_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/barrett_reduce_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/fromplant_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/fromplant_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_16_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_16_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_acc_32_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_acc_32_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_acc_32_16_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_acc_32_16_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_16_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_16_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_acc_32_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_acc_32_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_acc_32_16_kyber.s +KYBER_ARMV7M_ASMS += 
$(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_acc_32_16_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_acc_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_acc_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/basemul_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_acc_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_acc_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/frombytes_mul_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_acc_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_acc_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_cache_16_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_cache_16_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_cache_32_16_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_cache_32_16_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_cache_32_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_cache_32_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_opt_16_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_opt_16_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_opt_32_16_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_opt_32_16_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_opt_32_32_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_asm_opt_32_32_kyber_opt_m7.s + +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_kyber.s +KYBER_ARMV7M_ASMS += $(KYBER_ARMV7M_ASM_DIR)/matacc_kyber_opt_m7.s \ No newline at end of file diff --git a/tests/kyber-armv7m/main.c b/tests/kyber-armv7m/main.c new file mode 100644 index 
0000000..efcc595 --- /dev/null +++ b/tests/kyber-armv7m/main.c @@ -0,0 +1,1366 @@ +/* + * Copyright (c) 2025 SLOTHY authors + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * + * Author: Hanno Becker + */ + +#define ENABLE_PMU_STATS /* Do not enable when benching for cycle count */ + +#if defined(ENABLE_PMU_STATS) +#define REPEAT 100 +#define REPEAT_MEDIAN 100 +#else +#define REPEAT 1024 +#endif + +/* + * Some external references to auto-generated assembly. 
+ */ + +#include +#include + +#include +#include +#include +#include "misc.h" +#include "params.h" +#include "poly.h" +#include "frombytes-asm.h" +#include "ntt-asm.h" + +void asm_barrett_reduce(int16_t *); +void asm_barrett_reduce_opt_m7(int16_t *); + +void asm_fromplant(int16_t *); +void asm_fromplant_opt_m7(int16_t *); + +void pointwise_add(int16_t *, const int16_t *, const int16_t *); +void pointwise_add_opt_m7(int16_t *, const int16_t *, const int16_t *); + +void pointwise_sub(int16_t *, const int16_t *, const int16_t *); +void pointwise_sub_opt_m7(int16_t *, const int16_t *, const int16_t *); + +void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); +void basemul_asm_opt_16_32_opt_m7(int32_t *, const int16_t *, const int16_t *, const int16_t *); + +void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); +void basemul_asm_acc_opt_32_32_opt_m7(int32_t *, const int16_t *, const int16_t *, const int16_t *); + +void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *); +void basemul_asm_acc_opt_32_16_opt_m7(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *); + +void matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state); +void matacc_asm_opt_m7(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state); + +void matacc_asm_acc(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state); +void matacc_asm_acc_opt_m7(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state); + +void matacc_asm_cache_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state, int16_t *aprimeptr); +void matacc_asm_cache_16_32_opt_m7(int32_t *r_tmp, 
const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state, int16_t *aprimeptr); + +void matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state, int16_t *aprimeptr); +void matacc_asm_cache_32_32_opt_m7(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state, int16_t *aprimeptr); + +void matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state, int16_t *aprimeptr, const int32_t *r_tmp); +void matacc_asm_cache_32_16_opt_m7(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], const int32_t zetas[64], uint64_t *state, int16_t *aprimeptr, const int32_t *r_tmp); + +void matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], uint64_t *state, const int16_t *aprimeptr); +void matacc_asm_opt_16_32_opt_m7(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], uint64_t *state, const int16_t *aprimeptr); + +void matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], uint64_t *state, const int16_t *aprimeptr); +void matacc_asm_opt_32_32_opt_m7(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2], uint64_t *state, const int16_t *aprimeptr); + +void matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], uint64_t *state, const int16_t *aprimeptr, const int32_t *r_tmp); +void matacc_asm_opt_32_16_opt_m7(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2], uint64_t *state, const int16_t *aprimeptr, const int32_t *r_tmp); + +// TODO: instruction counting does not work for more than 4 arguments; use a wrapper for now -- need to fix this later +void basemul_asm_acc_opt_32_16_wrap(int16_t * a, const int16_t *b, const int16_t *c, const int16_t *d){ 
+ int32_t yyy[256]= {0}; + basemul_asm_acc_opt_32_16(a,b,c,d, yyy); +} +void basemul_asm_acc_opt_32_16_opt_m7_wrap(int16_t * a, const int16_t *b, const int16_t *c, const int16_t *d){ + int32_t yyy[256]= {0}; + basemul_asm_acc_opt_32_16_opt_m7(a,b,c,d, yyy); +} +void matacc_asm_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + matacc_asm(r,b,c,buf,zetas,state); +} +void matacc_asm_opt_m7_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + matacc_asm_opt_m7(r,b,c,buf,zetas,state); +} +void matacc_asm_acc_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + matacc_asm_acc(r,b,c,buf,zetas,state); +} +void matacc_asm_acc_opt_m7_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + matacc_asm_acc_opt_m7(r,b,c,buf,zetas,state); +} +void matacc_asm_cache_16_32_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_cache_16_32(r_tmp,b,c,buf,zetas,state,aprime); +} +void matacc_asm_cache_16_32_opt_m7_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_cache_16_32_opt_m7(r_tmp,b,c,buf,zetas,state,aprime); +} +void matacc_asm_cache_32_32_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_cache_32_32(r_tmp,b,c,buf,zetas,state,aprime); +} +void matacc_asm_cache_32_32_opt_m7_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_cache_32_32_opt_m7(r_tmp,b,c,buf,zetas,state,aprime); +} + +void matacc_asm_cache_32_16_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ 
+ uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + int32_t r_tmp[256] = {0}; + matacc_asm_cache_32_16(r,b,c,buf,zetas,state,aprime,r_tmp); +} +void matacc_asm_cache_32_16_opt_m7_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + int32_t r_tmp[256] = {0}; + matacc_asm_cache_32_16_opt_m7(r,b,c,buf,zetas,state,aprime,r_tmp); +} + +void matacc_asm_opt_16_32_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_opt_16_32(r_tmp,b,c,buf,state,aprime); +} +void matacc_asm_opt_16_32_opt_m7_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_opt_16_32_opt_m7(r_tmp,b,c,buf,state,aprime); +} + +void matacc_asm_opt_32_32_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_opt_32_32(r_tmp,b,c,buf,state,aprime); +} +void matacc_asm_opt_32_32_opt_m7_wrap(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + matacc_asm_opt_32_32_opt_m7(r_tmp,b,c,buf,state,aprime); +} + +void matacc_asm_opt_32_16_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + int32_t r_tmp[256] = {0}; + matacc_asm_opt_32_16(r,b,c,buf,state,aprime,r_tmp); +} +void matacc_asm_opt_32_16_opt_m7_wrap(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[168+2]){ + uint64_t state[26] = {0}; + int16_t aprime[256] = {0}; + int32_t r_tmp[256] = {0}; + matacc_asm_opt_32_16_opt_m7(r,b,c,buf,state,aprime,r_tmp); +} + + +#define NTT_LAYERS 7 +#define NTT_SIZE 256 + +typedef struct { + char name[100]; + uint64_t cycles; +} benchmark_result; + +benchmark_result results[100]; +int 
benchmark_cnt = 0; + +static void add_benchmark_results(char *name, uint64_t cycles){ + if(benchmark_cnt == 100) return; + + results[benchmark_cnt].cycles = cycles; + strncpy(results[benchmark_cnt].name, name, 100); + benchmark_cnt++; +} + +static void dump_benchmarks_tex(void){ + for(int i=0;i> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_3(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src1[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src2[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, src1, src2); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, src1, src2); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_basemul_32(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int32_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src1[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src2[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src3[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, src1, src2, src3); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, src1, src2, src3); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " 
repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_basemul_32_16(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src1[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src2[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src3[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int32_t src4[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, src1, src2, src3, src4); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, src1, src2, src3, src4); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_frombytes_mul(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src1[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + uint8_t src2[KYBER_POLYBYTES] __attribute__((aligned(16))) = {0}; \ + (func)(dst, src1, src2); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, src1, src2); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 
1])); \ + return (0); \ + } + +#define MAKE_BENCH_frombytes_mul_32(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int32_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src1[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + uint8_t src2[KYBER_POLYBYTES] __attribute__((aligned(16))) = {0}; \ + (func)(dst, src1, src2); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, src1, src2); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_frombytes_mul_32_16(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t src1[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + uint8_t src2[KYBER_POLYBYTES] __attribute__((aligned(16))) = {0}; \ + int32_t src3[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, src1, src2, src3); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, src1, src2, src3); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_matacc(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] 
__attribute__((aligned(16))) = {0}; \ + int16_t b[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t c[4] __attribute__((aligned(16))) = {0}; \ + uint8_t buf[168+2] __attribute__((aligned(16))) = {0}; \ + uint64_t state[26] __attribute__((aligned(16))) = {0}; \ + (func)(dst, b, c, buf, zetas, state); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, b, c, buf, zetas, state); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_matacc_cache_32(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int32_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t b[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t c[4] __attribute__((aligned(16))) = {0}; \ + uint8_t buf[168+2] __attribute__((aligned(16))) = {0}; \ + uint64_t state[26] __attribute__((aligned(16))) = {0}; \ + int16_t aprime[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, b, c, buf, zetas, state, aprime); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, b, c, buf, zetas, state, aprime); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_matacc_cache_32_16(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + 
uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t b[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t c[4] __attribute__((aligned(16))) = {0}; \ + uint8_t buf[168+2] __attribute__((aligned(16))) = {0}; \ + uint64_t state[26] __attribute__((aligned(16))) = {0}; \ + int16_t aprime[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int32_t rtmp[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, b, c, buf, zetas, state, aprime, rtmp); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, b, c, buf, zetas, state, aprime, rtmp); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_matacc_opt_32(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int32_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t b[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t c[4] __attribute__((aligned(16))) = {0}; \ + uint8_t buf[168+2] __attribute__((aligned(16))) = {0}; \ + uint64_t state[26] __attribute__((aligned(16))) = {0}; \ + int16_t aprime[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, b, c, buf, state, aprime); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, b, c, buf, state, aprime); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN 
>> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + +#define MAKE_BENCH_matacc_opt_32_16(var, func) \ + int bench_##var() \ + { \ + uint64_t t1, t2; \ + uint64_t cycles[REPEAT_MEDIAN]; \ + int16_t dst[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t b[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int16_t c[4] __attribute__((aligned(16))) = {0}; \ + uint8_t buf[168+2] __attribute__((aligned(16))) = {0}; \ + uint64_t state[26] __attribute__((aligned(16))) = {0}; \ + int16_t aprime[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + int32_t rtmp[NTT_SIZE] __attribute__((aligned(16))) = {0}; \ + (func)(dst, b, c, buf, state, aprime, rtmp); \ + for (size_t cnt_median = 0; cnt_median < REPEAT_MEDIAN; cnt_median++) \ + { \ + t1 = hal_get_time(); \ + for (size_t cnt = 0; cnt < REPEAT; cnt++) \ + (func)(dst, b, c, buf, state, aprime, rtmp); \ + t2 = hal_get_time(); \ + cycles[cnt_median] = (t2 - t1) / REPEAT; \ + } \ + qsort(cycles, REPEAT_MEDIAN, sizeof(uint64_t), cmp_uint64_t); \ + debug_printf(#var " repeat %d, %d", \ + REPEAT *REPEAT_MEDIAN, (cycles[REPEAT_MEDIAN >> 1])); \ + add_benchmark_results(#var, (cycles[REPEAT_MEDIAN >> 1])); \ + return (0); \ + } + + +MAKE_BENCH_1(kyber_ntt_pqm4,ntt_fast_wrap) +MAKE_BENCH_1(kyber_ntt_pqm4_opt_m7,ntt_fast_opt_m7_wrap) + +MAKE_BENCH_1(kyber_invntt_pqm4,invntt_fast_wrap) +MAKE_BENCH_1(kyber_invntt_pqm4_opt_m7,invntt_fast_opt_m7_wrap) + +MAKE_BENCH_1(kyber_barrett_reduce_pqm4,asm_barrett_reduce) +MAKE_BENCH_1(kyber_barrett_reduce_pqm4_opt_m7,asm_barrett_reduce_opt_m7) + +MAKE_BENCH_1(kyber_fromplant_pqm4,asm_fromplant) +MAKE_BENCH_1(kyber_fromplant_pqm4_opt_m7,asm_fromplant_opt_m7) + +MAKE_BENCH_3(kyber_pointwise_add_pqm4,pointwise_add) +MAKE_BENCH_3(kyber_pointwise_add_pqm4_opt_m7,pointwise_add_opt_m7) + +MAKE_BENCH_3(kyber_pointwise_sub_pqm4,pointwise_sub) +MAKE_BENCH_3(kyber_pointwise_sub_pqm4_opt_m7,pointwise_sub_opt_m7) + +MAKE_BENCH_basemul_32(kyber_basemul_16_32, 
basemul_asm_opt_16_32); +MAKE_BENCH_basemul_32(kyber_basemul_16_32_opt_m7, basemul_asm_opt_16_32_opt_m7); + +MAKE_BENCH_basemul_32(kyber_basemul_acc_32_32, basemul_asm_acc_opt_32_32); +MAKE_BENCH_basemul_32(kyber_basemul_acc_32_32_opt_m7, basemul_asm_acc_opt_32_32_opt_m7); + +MAKE_BENCH_basemul_32_16(kyber_basemul_acc_32_16, basemul_asm_acc_opt_32_16); +MAKE_BENCH_basemul_32_16(kyber_basemul_acc_32_16_opt_m7, basemul_asm_acc_opt_32_16_opt_m7); + +MAKE_BENCH_frombytes_mul_32(kyber_frombytes_mul_16_32, frombytes_mul_asm_16_32_wrap); +MAKE_BENCH_frombytes_mul_32(kyber_frombytes_mul_16_32_opt_m7, frombytes_mul_asm_16_32_opt_m7_wrap); + +MAKE_BENCH_frombytes_mul_32(kyber_frombytes_mul_acc_32_32, frombytes_mul_asm_acc_32_32_wrap); +MAKE_BENCH_frombytes_mul_32(kyber_frombytes_mul_acc_32_32_opt_m7, frombytes_mul_asm_acc_32_32_opt_m7_wrap); + +MAKE_BENCH_frombytes_mul_32_16(kyber_frombytes_mul_acc_32_16, frombytes_mul_asm_acc_32_16_wrap); +MAKE_BENCH_frombytes_mul_32_16(kyber_frombytes_mul_acc_32_16_opt_m7, frombytes_mul_asm_acc_32_16_opt_m7_wrap); + +MAKE_BENCH_3(kyber_basemul_asm_pqm4,basemul_asm_wrap) +MAKE_BENCH_3(kyber_basemul_asm_pqm4_opt_m7,basemul_asm_opt_m7_wrap) + +MAKE_BENCH_3(kyber_basemul_asm_acc_pqm4,basemul_asm_acc_wrap) +MAKE_BENCH_3(kyber_basemul_asm_acc_pqm4_opt_m7,basemul_asm_acc_opt_m7_wrap) + +MAKE_BENCH_frombytes_mul(kyber_frombytes_mul_pqm4,frombytes_mul_asm_wrap) +MAKE_BENCH_frombytes_mul(kyber_frombytes_mul_pqm4_opt_m7,frombytes_mul_asm_opt_m7_wrap) + +MAKE_BENCH_frombytes_mul(kyber_frombytes_mul_acc_pqm4,frombytes_mul_asm_acc_wrap) +MAKE_BENCH_frombytes_mul(kyber_frombytes_mul_acc_pqm4_opt_m7,frombytes_mul_asm_acc_opt_m7_wrap) + +MAKE_BENCH_matacc(kyber_matacc_asm_pqm4,matacc_asm) +MAKE_BENCH_matacc(kyber_matacc_asm_pqm4_opt_m7,matacc_asm_opt_m7) + +MAKE_BENCH_matacc(kyber_matacc_asm_acc_pqm4,matacc_asm_acc) +MAKE_BENCH_matacc(kyber_matacc_asm_acc_pqm4_opt_m7,matacc_asm_acc_opt_m7) + 
+MAKE_BENCH_matacc_cache_32(kyber_matacc_asm_cache_16_32_pqm4,matacc_asm_cache_16_32) +MAKE_BENCH_matacc_cache_32(kyber_matacc_asm_cache_16_32_pqm4_opt_m7,matacc_asm_cache_16_32_opt_m7) + +MAKE_BENCH_matacc_cache_32(kyber_matacc_asm_cache_32_32_pqm4,matacc_asm_cache_32_32) +MAKE_BENCH_matacc_cache_32(kyber_matacc_asm_cache_32_32_pqm4_opt_m7,matacc_asm_cache_32_32_opt_m7) + +MAKE_BENCH_matacc_cache_32_16(kyber_matacc_asm_cache_32_16_pqm4,matacc_asm_cache_32_16) +MAKE_BENCH_matacc_cache_32_16(kyber_matacc_asm_cache_32_16_pqm4_opt_m7,matacc_asm_cache_32_16_opt_m7) + +MAKE_BENCH_matacc_opt_32(kyber_matacc_asm_opt_16_32_pqm4,matacc_asm_opt_16_32) +MAKE_BENCH_matacc_opt_32(kyber_matacc_asm_opt_16_32_pqm4_opt_m7,matacc_asm_opt_16_32_opt_m7) + +MAKE_BENCH_matacc_opt_32(kyber_matacc_asm_opt_32_32_pqm4,matacc_asm_opt_32_32) +MAKE_BENCH_matacc_opt_32(kyber_matacc_asm_opt_32_32_pqm4_opt_m7,matacc_asm_opt_32_32_opt_m7) + +MAKE_BENCH_matacc_opt_32_16(kyber_matacc_asm_opt_32_16_pqm4,matacc_asm_opt_32_16) +MAKE_BENCH_matacc_opt_32_16(kyber_matacc_asm_opt_32_16_pqm4_opt_m7,matacc_asm_opt_32_16_opt_m7) + + +int main(void) +{ + int ret = 0; + debug_test_start( "\nKyber All Test!\n" ); + + /* Test cases */ + if( test_ntt_pqm4() != 0 ){return( 1 );} + if( test_ntt_pqm4_opt() != 0 ){return( 1 );} + + if( test_invntt_pqm4() != 0 ){return( 1 );} + if( test_invntt_pqm4_opt() != 0 ){return( 1 );} + + if( test_barrett_reduce_pqm4() != 0 ){return( 1 );} + if( test_barrett_reduce_pqm4_opt() != 0 ){return( 1 );} + + if( test_fromplant_pqm4() != 0 ){return( 1 );} + if( test_fromplant_pqm4_opt() != 0 ){return( 1 );} + + if( test_pointwise_add_pqm4() != 0 ){return( 1 );} + if( test_pointwise_add_pqm4_opt() != 0 ){return( 1 );} + + if( test_pointwise_sub_pqm4() != 0 ){return( 1 );} + if( test_pointwise_sub_pqm4_opt() != 0 ){return( 1 );} + + if( test_basemul_16_32() != 0 ){return( 1 );} + if( test_basemul_16_32_opt() != 0 ){return( 1 );} + + if( test_basemul_acc_32_32() != 0 ){return( 1 );} + if( 
test_basemul_acc_32_32_opt() != 0 ){return( 1 );} + + if( test_basemul_acc_32_16() != 0 ){return( 1 );} + if( test_basemul_acc_32_16_opt() != 0 ){return( 1 );} + + if( test_frombytes_mul_16_32() != 0 ){return( 1 );} + if( test_frombytes_mul_16_32_opt() != 0 ){return( 1 );} + + if( test_frombytes_mul_acc_32_32() != 0 ){return( 1 );} + if( test_frombytes_mul_acc_32_32_opt() != 0 ){return( 1 );} + + if( test_frombytes_mul_acc_32_16() != 0 ){return( 1 );} + if( test_frombytes_mul_acc_32_16_opt() != 0 ){return( 1 );} + + if( test_basemul_pqm4() != 0 ){return( 1 );} + if( test_basemul_pqm4_opt() != 0 ){return( 1 );} + + if( test_basemul_acc_pqm4() != 0 ){return( 1 );} + if( test_basemul_acc_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_pqm4_opt() != 0 ){return( 1 );} + + if( test_frombytes_mul_asm_pqm4() != 0 ){return( 1 );} + if( test_frombytes_mul_asm_pqm4_opt() != 0 ){return( 1 );} + + if( test_frombytes_mul_asm_acc_pqm4() != 0 ){return( 1 );} + if( test_frombytes_mul_asm_acc_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_acc_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_acc_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_cache_16_32_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_cache_16_32_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_cache_32_32_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_cache_32_32_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_cache_32_16_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_cache_32_16_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_opt_16_32_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_opt_16_32_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_opt_32_32_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_opt_32_32_pqm4_opt() != 0 ){return( 1 );} + + if( test_matacc_asm_opt_32_16_pqm4() != 0 ){return( 1 );} + if( test_matacc_asm_opt_32_16_pqm4_opt() != 0 ){return( 1 );} + + bench_kyber_ntt_pqm4(); + 
bench_kyber_ntt_pqm4_opt_m7(); + + bench_kyber_invntt_pqm4(); + bench_kyber_invntt_pqm4_opt_m7(); + + bench_kyber_barrett_reduce_pqm4(); + bench_kyber_barrett_reduce_pqm4_opt_m7(); + + bench_kyber_fromplant_pqm4(); + bench_kyber_fromplant_pqm4_opt_m7(); + + bench_kyber_pointwise_add_pqm4(); + bench_kyber_pointwise_add_pqm4_opt_m7(); + + bench_kyber_pointwise_sub_pqm4(); + bench_kyber_pointwise_sub_pqm4_opt_m7(); + + bench_kyber_basemul_16_32(); + bench_kyber_basemul_16_32_opt_m7(); + + bench_kyber_basemul_acc_32_32(); + bench_kyber_basemul_acc_32_32_opt_m7(); + + bench_kyber_basemul_acc_32_16(); + bench_kyber_basemul_acc_32_16_opt_m7(); + + bench_kyber_frombytes_mul_16_32(); + bench_kyber_frombytes_mul_16_32_opt_m7(); + + bench_kyber_frombytes_mul_acc_32_32(); + bench_kyber_frombytes_mul_acc_32_32_opt_m7(); + + bench_kyber_frombytes_mul_acc_32_16(); + bench_kyber_frombytes_mul_acc_32_16_opt_m7(); + + bench_kyber_basemul_asm_pqm4(); + bench_kyber_basemul_asm_pqm4_opt_m7(); + + bench_kyber_basemul_asm_acc_pqm4(); + bench_kyber_basemul_asm_acc_pqm4_opt_m7(); + + bench_kyber_frombytes_mul_pqm4(); + bench_kyber_frombytes_mul_pqm4_opt_m7(); + + bench_kyber_frombytes_mul_acc_pqm4(); + bench_kyber_frombytes_mul_acc_pqm4_opt_m7(); + + bench_kyber_matacc_asm_pqm4(); + bench_kyber_matacc_asm_pqm4_opt_m7(); + + bench_kyber_matacc_asm_acc_pqm4(); + bench_kyber_matacc_asm_acc_pqm4_opt_m7(); + + bench_kyber_matacc_asm_cache_16_32_pqm4(); + bench_kyber_matacc_asm_cache_16_32_pqm4_opt_m7(); + + bench_kyber_matacc_asm_cache_32_32_pqm4(); + bench_kyber_matacc_asm_cache_32_32_pqm4_opt_m7(); + + bench_kyber_matacc_asm_cache_32_16_pqm4(); + bench_kyber_matacc_asm_cache_32_16_pqm4_opt_m7(); + + bench_kyber_matacc_asm_opt_16_32_pqm4(); + bench_kyber_matacc_asm_opt_16_32_pqm4_opt_m7(); + + bench_kyber_matacc_asm_opt_32_32_pqm4(); + bench_kyber_matacc_asm_opt_32_32_pqm4_opt_m7(); + + bench_kyber_matacc_asm_opt_32_16_pqm4(); + bench_kyber_matacc_asm_opt_32_16_pqm4_opt_m7(); + + + /* Test 
cases */ + debug_printf( "Done!\n" ); + + debug_printf("======================" ); + dump_benchmarks_tex(); + debug_printf("======================\n" ); + debug_printf( "ALL GOOD!\n" ); + return( ret ); +} diff --git a/tests/kyber-armv7m/ntt-asm.h b/tests/kyber-armv7m/ntt-asm.h new file mode 100644 index 0000000..5346ee5 --- /dev/null +++ b/tests/kyber-armv7m/ntt-asm.h @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +#ifndef NTT_ASM_H +#define NTT_ASM_H + +#include +// asm +void ntt_fast(int16_t *, const int32_t *); +void ntt_fast_opt_m7(int16_t *, const int32_t *); + +void invntt_fast(int16_t *, const int32_t *); +void invntt_fast_opt_m7(int16_t *, const int32_t *); + + +static const int32_t zetas_asm[128] = { + 2230699446, 3328631909, 4243360600, 3408622288, 812805467, 2447447570, 1094061961, 1370157786, 2475831253, 249002310, 1028263423, 3594406395, 4205945745, 734105255, 2252632292, 381889553, 372858381, 427045412, 21932846, 3562152210, 752167598, 3417653460, 3157039644, 4196914574, 2265533966, 2112004045, 932791035, 2951903026, 1419184148, 1727534158, 1544330386, 2972545705, 1817845876, 3434425636, 4233039261, 300609006, 1904287092, 2937711185, 2651294021, 975366560, 2781600929, 3889854731, 3935010590, 3929849920, 838608815, 2550660963, 2197155094, 2130066389, 3598276897, 2308109491, 72249375, 3242190693, 815385801, 2382939200, 1228239371, 1884934581, 3466679822, 2889974991, 3696329620, 42575525, 1211467195, 2977706375, 3144137970, 3080919767, 1719793153, 1703020977, 2470670584, 945692709, 3015121229, 345764865, 826997308, 1839778722, 2991898216, 1851390229, 2043625172, 2964804700, 2628071007, 4154339049, 2701610550, 1041165097, 583155668, 483812778, 3288636719, 2696449880, 2122325384, 690239563, 1855260731, 3700200122, 1371447954, 411563403, 3577634219, 976656727, 3718262466, 1979116802, 3098982111, 2708061387, 723783916, 3181552825, 3346694253, 3087370604, 3415073125, 3376368103, 3617629408, 1408862808, 519937465, 1323711759, 3714391964, 
1910737929, 836028480, 1474661346, 2773859924, 3580214553, 1143088323, 2546790461, 3191874164, 4012420634, 2221668274, 1563682897, 2417773720, 1327582262, 1059227441, 1583035408, 1174052340, 2722253228, 3786641338, 1141798155, 2779020594, 0}; + +static const int32_t zetas_inv_CT_asm[256] = { + // LAYER 7+6+5+4 + 1290168, 1290168, 2064267851, 1290168, 51606697, 2064267851, 966335388, 1290168, 3200905336, 51606697, 3482161830, 2064267851, 1847519727, 966335388, 886345009, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 1290168, 2064267851, 1290168, 51606697, 2064267851, 966335388, + // LAYER 3+2+1 - 1 - twist + 2435836064, 290287667, 2944162022, 3021572066, 1802363867, 603798347, 3375077936, 2677097369, + // LAYER 3+2+1 - 2 - butterfly + 2042335005, 3235739856, 1748176836, 3120914957, 282546663, 2711931889, 1103093133, + // LAYER 3+2+1 - 2 - twist + 1659155285, 1785591691, 1941701947, 2704190884, 358666539, 793452955, 1461759672, 1673347127, + // LAYER 3+2+1 - 3 - butterfly + 3200905336, 2042335005, 3560862042, 3235739856, 580575333, 1748176836, 1207596693, + // LAYER 3+2+1 - 3 - twist + 3887274396, 2126195886, 872153167, 3443456808, 526388302, 299318839, 3875662889, 3382818940, + // LAYER 3+2+1 - 4 - butterfly + 3266703874, 2575174144, 1404992306, 1824296713, 4252391772, 2591946320, 598637677, + // LAYER 3+2+1 - 4 - twist + 1997179146, 2904166832, 2577754479, 202556283, 30964018, 3807284017, 1238560711, 1967505295, + // LAYER 3+2+1 - 5 - butterfly + 51606697, 3200905336, 1847519727, 2042335005, 89021552, 3560862042, 700560902, + // LAYER 3+2+1 - 5 - twist + 1633351937, 2191994424, 909568022, 1780431021, 2022982494, 2497764099, 3609888404, 1126316146, + // LAYER 3+2+1 - 6 - butterfly + 89021552, 576704831, 3604727734, 1195985186, 594767175, 2315850495, 2439706566, + // LAYER 3+2+1 - 6 - twist + 3633111417, 2908037335, 3590535893, 357376372, 1887514916, 1410152976, 2486152593, 571544162, + // LAYER 3+2+1 - 7 - butterfly + 3482161830, 3266703874, 4045964987, 
2575174144, 4222717922, 1404992306, 365117377, + // LAYER 3+2+1 - 7 - twist + 4003389463, 2444867236, 1221788534, 3305408896, 1626901100, 3367336931, 651534541, 1549491056, + // LAYER 3+2+1 - 8 - butterfly + 1819136044, 2390680205, 2567433139, 1643673276, 1322421592, 1357256112, 2750636911, + // LAYER 3+2+1 - 8 - twist + 993428903, 3680847611, 1082450454, 1205016358, 348345200, 956014049, 1048906102, 3880823559, + // LAYER 3+2+1 - 9 - butterfly + 2064267851, 51606697, 966335388, 3200905336, 3482161830, 1847519727, 886345009, + // LAYER 3+2+1 - 9 - twist + 3342823751, 4258842609, 568963827, 2849979801, 1283716570, 2330042337, 4104022520, 3007380225, + // LAYER 3+2+1 - 10 - butterfly + 3560862042, 580575333, 1207596693, 3458938817, 918599194, 2384229368, 879894172, + // LAYER 3+2+1 - 10 - twist + 2217797772, 503165289, 2812564947, 2946742357, 833448145, 1905577260, 3273154711, 3208646340, + // LAYER 3+2+1 - 11 - butterfly + 1847519727, 89021552, 700560902, 576704831, 1593356747, 3604727734, 2455188575, + // LAYER 3+2+1 - 11 - twist + 3162200314, 2808694444, 1933960943, 678628056, 49026362, 1375318456, 1961054458, 3473130659, + // LAYER 3+2+1 - 12 - butterfly + 4045964987, 4222717922, 365117377, 3479581496, 1744306334, 1052776604, 3456358482, + // LAYER 3+2+1 - 12 - twist + 438656919, 1681088131, 366407544, 2819015784, 1771399850, 1091481626, 2136517226, 709592074, + // LAYER 3+2+1 - 13 - butterfly + 966335388, 3482161830, 886345009, 3266703874, 1819136044, 4045964987, 2924809511, + // LAYER 3+2+1 - 13 - twist + 25803349, 3888564563, 1032133926, 923759864, 2630651342, 2590656153, 2146838565, 547030981, + // LAYER 3+2+1 - 14 - butterfly + 700560902, 1593356747, 2455188575, 3711811629, 2443577068, 3253802200, 1303069081, + // LAYER 3+2+1 - 14 - twist + 254162980, 3513125848, 1576584571, 3086080437, 2933840683, 3184133160, 1389510297, 2811274779, + // LAYER 3+2+1 - 15 - butterfly + 886345009, 1819136044, 2924809511, 2390680205, 1137927653, 2567433139, 3913077744, + // 
LAYER 3+2+1 - 15 - twist + 2288756980, 459299597, 1355965945, 1192114684, 2699030215, 439947086, 587026170, 418014240, + // LAYER 3+2+1 - 16 - butterfly + 2924809511, 1137927653, 3913077744, 2029433331, 3867921885, 98052723, 3922108916, 639923034, + // LAYER 3+2+1 - 16 - twist + 2806114109, 4122084864, 575414664, 1674637294, 1541750051, 2560982302, 1540459884, 0}; + +static void ntt_fast_wrap(int16_t *p){ + ntt_fast(p, zetas_asm); +} + +static void ntt_fast_opt_m7_wrap(int16_t *p){ + ntt_fast_opt_m7(p, zetas_asm); +} + +static void invntt_fast_wrap(int16_t *p){ + invntt_fast(p, zetas_inv_CT_asm); +} + +static void invntt_fast_opt_m7_wrap(int16_t *p){ + invntt_fast_opt_m7(p, zetas_inv_CT_asm); +} + + +void basemul_asm(int16_t *, const int16_t *, const int16_t *, const int32_t *); +void basemul_asm_opt_m7(int16_t *, const int16_t *, const int16_t *, const int32_t *); + +void basemul_asm_acc(int16_t *, const int16_t *, const int16_t *, const int32_t *); +void basemul_asm_acc_opt_m7(int16_t *, const int16_t *, const int16_t *, const int32_t *); + +static void basemul_asm_wrap(int16_t *a, const int16_t *b, const int16_t *c){ + basemul_asm(a,b,c,zetas); +} +static void basemul_asm_opt_m7_wrap(int16_t *a, const int16_t *b, const int16_t *c){ + basemul_asm_opt_m7(a,b,c,zetas); +} +static void basemul_asm_acc_wrap(int16_t *a, const int16_t *b, const int16_t *c){ + basemul_asm_acc(a,b,c,zetas); +} +static void basemul_asm_acc_opt_m7_wrap(int16_t *a, const int16_t *b, const int16_t *c){ + basemul_asm_acc_opt_m7(a,b,c,zetas); +} + + +void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void frombytes_mul_asm_opt_m7(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); + +void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void frombytes_mul_asm_acc_opt_m7(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); + +void 
frombytes_mul_asm_wrap(int16_t *r, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm(r,b,c,zetas); +} +void frombytes_mul_asm_opt_m7_wrap(int16_t *r, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_opt_m7(r,b,c,zetas); +} + +void frombytes_mul_asm_acc_wrap(int16_t *r, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_acc(r,b,c,zetas); +} +void frombytes_mul_asm_acc_opt_m7_wrap(int16_t *r, const int16_t *b, const unsigned char *c){ + frombytes_mul_asm_acc_opt_m7(r,b,c,zetas); +} + + +#endif \ No newline at end of file diff --git a/tests/kyber-armv7m/params.h b/tests/kyber-armv7m/params.h new file mode 100644 index 0000000..a9dd131 --- /dev/null +++ b/tests/kyber-armv7m/params.h @@ -0,0 +1,31 @@ +#ifndef PARAMS_H +#define PARAMS_H + +#define KYBER_K 3 + +/* Don't change parameters below this line */ + +#define KYBER_N 256 +#define KYBER_Q 3329 + +#define KYBER_ETA 2 + +#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ +#define KYBER_SSBYTES 32 /* size in bytes of shared key */ + +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) + +#define KYBER_POLYCOMPRESSEDBYTES 128 +#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) + +#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) +#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) + +#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES + +#endif \ No newline at end of file