diff --git a/.github/workflows/hol_light.yml b/.github/workflows/hol_light.yml index eb01bc5e9..474cb4fd5 100644 --- a/.github/workflows/hol_light.yml +++ b/.github/workflows/hol_light.yml @@ -165,6 +165,8 @@ jobs: needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml"] - name: mldsa_intt needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml"] + - name: keccak_f1600_x4_avx2 + needs: ["keccak_utils.ml", "keccak_spec.ml", "keccak_f1600_x4_avx2_constants.ml", "keccak_constants.ml"] name: x86_64 HOL Light proof for ${{ matrix.proof.name }}.S runs-on: pqcp-x64 if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork diff --git a/README.md b/README.md index 3658bf723..1a76aec0e 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ HOL-Light functional correctness proofs can be found in [proofs/hol_light](proof - AArch64 poly_caddq [poly_caddq_asm.S](mldsa/src/native/aarch64/src/poly_caddq_asm.S) - x86_64 NTT [ntt.S](mldsa/src/native/x86_64/src/ntt.S) +- x86_64 4-fold Keccak-F1600 using AVX2 [keccak_f1600_x4_avx2.S](mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2.S) These proofs utilize the verification infrastructure in [s2n-bignum](https://github.com/awslabs/s2n-bignum). diff --git a/dev/fips202/aarch64/src/keccakf1600_round_constants.c b/dev/fips202/aarch64/src/keccakf1600_round_constants.c index 9e3533529..95df8f70c 100644 --- a/dev/fips202/aarch64/src/keccakf1600_round_constants.c +++ b/dev/fips202/aarch64/src/keccakf1600_round_constants.c @@ -1,9 +1,14 @@ /* - * Copyright (c) The mlkem-native project authors * Copyright (c) The mldsa-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. 
+ */ + #include "../../../../common.h" #if (defined(MLD_FIPS202_AARCH64_NEED_X1_SCALAR) || \ diff --git a/mldsa/src/fips202/native/x86_64/xkcp.h b/dev/fips202/x86_64/keccak_f1600_x4_avx2.h similarity index 56% rename from mldsa/src/fips202/native/x86_64/xkcp.h rename to dev/fips202/x86_64/keccak_f1600_x4_avx2.h index 58cb4bb0b..377b87b0b 100644 --- a/mldsa/src/fips202/native/x86_64/xkcp.h +++ b/dev/fips202/x86_64/keccak_f1600_x4_avx2.h @@ -4,18 +4,19 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -#ifndef MLD_FIPS202_NATIVE_X86_64_XKCP_H -#define MLD_FIPS202_NATIVE_X86_64_XKCP_H +#ifndef MLD_DEV_FIPS202_X86_64_KECCAK_F1600_X4_AVX2_H +#define MLD_DEV_FIPS202_X86_64_KECCAK_F1600_X4_AVX2_H #include "../../../common.h" -#define MLD_FIPS202_X86_64_XKCP +#define MLD_FIPS202_X86_64_NEED_X4_AVX2 + +/* Part of backend API */ +#define MLD_USE_FIPS202_X4_NATIVE #if !defined(__ASSEMBLER__) #include "../api.h" -#include "src/KeccakP_1600_times4_SIMD256.h" - -#define MLD_USE_FIPS202_X4_NATIVE +#include "src/fips202_native_x86_64.h" MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) { @@ -23,9 +24,11 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_keccakf1600x4_permute24(state); + + mld_keccak_f1600_x4_avx2(state, mld_keccakf1600_round_constants, + mld_keccak_rho8, mld_keccak_rho56); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ -#endif /* !MLD_FIPS202_NATIVE_X86_64_XKCP_H */ +#endif /* !MLD_DEV_FIPS202_X86_64_KECCAK_F1600_X4_AVX2_H */ diff --git a/dev/fips202/x86_64/src/fips202_native_x86_64.h b/dev/fips202/x86_64/src/fips202_native_x86_64.h new file mode 100644 index 000000000..d00ad0d7e --- /dev/null +++ b/dev/fips202/x86_64/src/fips202_native_x86_64.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef 
MLD_DEV_FIPS202_X86_64_SRC_FIPS202_NATIVE_X86_64_H +#define MLD_DEV_FIPS202_X86_64_SRC_FIPS202_NATIVE_X86_64_H + +#include "../../../../cbmc.h" +#include "../../../../common.h" + +/* TODO: Reconsider whether this check is needed -- x86_64 is always + * little-endian, so the backend selection already implies this. */ +#ifndef MLD_SYS_LITTLE_ENDIAN +#error Expecting a little-endian platform +#endif + +#define mld_keccakf1600_round_constants \ + MLD_NAMESPACE(keccakf1600_round_constants) +extern const uint64_t mld_keccakf1600_round_constants[]; + +#define mld_keccak_rho8 MLD_NAMESPACE(keccak_rho8) +extern const uint64_t mld_keccak_rho8[]; + +#define mld_keccak_rho56 MLD_NAMESPACE(keccak_rho56) +extern const uint64_t mld_keccak_rho56[]; + +#define mld_keccak_f1600_x4_avx2 MLD_NAMESPACE(keccak_f1600_x4_avx2) +void mld_keccak_f1600_x4_avx2(uint64_t states[100], const uint64_t rc[24], + const uint64_t rho8[4], const uint64_t rho56[4]) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2.ml */ +__contract__( + requires(memory_no_alias(states, sizeof(uint64_t) * 25 * 4)) + requires(rc == mld_keccakf1600_round_constants) + requires(rho8 == mld_keccak_rho8) + requires(rho56 == mld_keccak_rho56) + assigns(memory_slice(states, sizeof(uint64_t) * 25 * 4)) +); + +#endif /* !MLD_DEV_FIPS202_X86_64_SRC_FIPS202_NATIVE_X86_64_H */ diff --git a/dev/fips202/x86_64/src/keccak_f1600_x4_avx2.S b/dev/fips202/x86_64/src/keccak_f1600_x4_avx2.S new file mode 100644 index 000000000..9c0a85e65 --- /dev/null +++ b/dev/fips202/x86_64/src/keccak_f1600_x4_avx2.S @@ -0,0 +1,622 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../../common.h" + +#if defined(MLD_FIPS202_X86_64_NEED_X4_AVX2) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_avx2) 
+MLD_ASM_FN_SYMBOL(keccak_f1600_x4_avx2) + + // **** Bitstates Allocation Map **** // + // 0x0(%rsp) A0 (state0[0), state1[0], state2[0], state3[0]] Input (%rdi) offsets: 0x00, 0xC8, 0x190, 0x258 + // 0x20(%rsp) A1 (state0[1), state1[1], state2[1], state3[1]] Input (%rdi) offsets: 0x08, 0xD0, 0x198, 0x260 + // 0x40(%rsp) A2 (state0[2), state1[2], state2[2], state3[2]] Input (%rdi) offsets: 0x10, 0xD8, 0x1A0, 0x268 + // 0x60(%rsp) A3 (state0[3), state1[3], state2[3], state3[3]] Input (%rdi) offsets: 0x18, 0xE0, 0x1A8, 0x270 + // 0x80(%rsp) A4 (state0[4), state1[4], state2[4], state3[4]] Input (%rdi) offsets: 0x20, 0xE8, 0x1B0, 0x278 + // 0xa0(%rsp) A5 (state0[5), state1[5], state2[5], state3[5]] Input (%rdi) offsets: 0x28, 0xF0, 0x1B8, 0x280 + // 0xc0(%rsp) A6 (state0[6), state1[6], state2[6], state3[6]] Input (%rdi) offsets: 0x30, 0xF8, 0x1C0, 0x288 + // %ymm10 A7 (state0[7), state1[7], state2[7], state3[7]] Input (%rdi) offsets: 0x38, 0x100, 0x1C8, 0x290 + // %ymm14 A8 (state0[8), state1[8], state2[8], state3[8]] Input (%rdi) offsets: 0x40, 0x108, 0x1D0, 0x298 + // 0xe0(%rsp) A9 (state0[9), state1[9], state2[9], state3[9]] Input (%rdi) offsets: 0x48, 0x110, 0x1D8, 0x2A0 + // 0x100(%rsp) A10 (state0[10), state1[10], state2[10], state3[10]] Input (%rdi) offsets: 0x50, 0x118, 0x1E0, 0x2A8 + // %ymm8 A11 (state0[11), state1[11], state2[11], state3[11]] Input (%rdi) offsets: 0x58, 0x120, 0x1E8, 0x2B0 + // %ymm15 A12 (state0[12), state1[12], state2[12], state3[12]] Input (%rdi) offsets: 0x60, 0x128, 0x1F0, 0x2B8 + // 0x120(%rsp) A13 (state0[13), state1[13], state2[13], state3[13]] Input (%rdi) offsets: 0x68, 0x130, 0x1F8, 0x2C0 + // 0x140(%rsp) A14 (state0[14), state1[14], state2[14], state3[14]] Input (%rdi) offsets: 0x70, 0x138, 0x200, 0x2C8 + // %ymm9 A15 (state0[15), state1[15], state2[15], state3[15]] Input (%rdi) offsets: 0x78, 0x140, 0x208, 0x2D0 + // 0x160(%rsp) A16 (state0[16), state1[16], state2[16], state3[16]] Input (%rdi) offsets: 0x80, 0x148, 0x210, 0x2D8 
+ // 0x180(%rsp) A17 (state0[17), state1[17], state2[17], state3[17]] Input (%rdi) offsets: 0x88, 0x150, 0x218, 0x2E0 + // %ymm13 A18 (state0[18), state1[18], state2[18], state3[18]] Input (%rdi) offsets: 0x90, 0x158, 0x220, 0x2E8 + // 0x1a0(%rsp) A19 (state0[19), state1[19], state2[19], state3[19]] Input (%rdi) offsets: 0x98, 0x160, 0x228, 0x2F0 + // 0x1c0(%rsp) A20 (state0[20), state1[20], state2[20], state3[20]] Input (%rdi) offsets: 0xA0, 0x168, 0x230, 0x2F8 + // %ymm3 A21 (state0[21), state1[21], state2[21], state3[21]] Input (%rdi) offsets: 0xA8, 0x170, 0x238, 0x300 + // %ymm7 A22 (state0[22), state1[22], state2[22], state3[22]] Input (%rdi) offsets: 0xB0, 0x178, 0x240, 0x308 + // 0x1e0(%rsp) A23 (state0[23), state1[23], state2[23], state3[23]] Input (%rdi) offsets: 0xB8, 0x180, 0x248, 0x310 + // %ymm2 A24 (state0[24), state1[24], state2[24], state3[24]] Input (%rdi) offsets: 0xC0, 0x188, 0x250, 0x318 + + subq $0x300, %rsp + + // Load 32 bytes from each of the 4 states (A(0-3)) + vmovdqu (%rdi), %ymm0 // Load state0(0, 1, 2, 3) (32 bytes from Input (%rdi) offset: 0x00) + vmovdqu 0xc8(%rdi), %ymm3 // Load state1(0, 1, 2, 3) (32 bytes from Input (%rdi) offset: 0xC8) + vmovdqu 0x190(%rdi), %ymm1 // Load state2(0, 1, 2, 3) (32 bytes from Input (%rdi) offset: 0x190) + vmovdqu 0x258(%rdi), %ymm4 // Load state3(0, 1, 2, 3) (32 bytes from Input (%rdi) offset: 0x258) + + // Interleave low and high qwords from %ymm0(state0(0,1,2,3)) and %ymm3(state1[0,1,2,3]) + vpunpcklqdq %ymm3, %ymm0, %ymm2 // %ymm2 = (state0[0) | state1[0] | state0[2] | state1[2]] + vpunpckhqdq %ymm3, %ymm0, %ymm0 // %ymm0 = (state0[1) | state1[1] | state0[3] | state1[3]] + + // Interleave low and high qwords from %ymm1(state2(0,1,2,3)) and %ymm4(state3[0,1,2,3]) + vpunpcklqdq %ymm4, %ymm1, %ymm3 // %ymm3 = (state2[0) | state3[0] | state2[2] | state3[2]] + + // Permute 128-bit lanes to complete the interleave for A0 and A(2) + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 // A0 = %ymm7 = (state0[0) | 
state1[0] | state2[0] | state3[0]] + vpunpckhqdq %ymm4, %ymm1, %ymm1 // %ymm1 = (state2[1) | state3[1] | state2[3] | state3[3]] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 // A2 = %ymm3 = (state0[2) | state1[2] | state2[2] | state3[2]] + vmovdqu 0x278(%rdi), %ymm4 // Pre-load state3(4, 5, 6, 7) for next group + vmovdqu %ymm3, 0x40(%rsp) // store A(2) -> on stack + + // Permute 128-bit lanes to complete the interleave for A3 and A(1) + vperm2i128 $0x31, %ymm1, %ymm0, %ymm3 // A3 = %ymm3 = (state0[3) | state1[3] | state2[3] | state3[3]] + vmovdqu %ymm7, 0x0(%rsp) // store A(0) -> on stack + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 // A1 = %ymm7 = (state0[1) | state1[1] | state2[1] | state3[1]] + + vmovdqu 0x20(%rdi), %ymm0 + vmovdqu 0x1b0(%rdi), %ymm1 + vmovdqu %ymm3, 0x60(%rsp) // store A(3) + vmovdqu 0xe8(%rdi), %ymm3 + vmovdqu %ymm7, 0x20(%rsp) // store A(1) + + // Load, Interleave, and Store 32 bytes from each of the 4 states (A(4-7)) + vpunpcklqdq %ymm3, %ymm0, %ymm2 + vpunpckhqdq %ymm3, %ymm0, %ymm0 + vpunpcklqdq %ymm4, %ymm1, %ymm3 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 + vpunpckhqdq %ymm4, %ymm1, %ymm1 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 + vmovdqu 0x298(%rdi), %ymm4 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm14 + vmovdqu %ymm7, 0x80(%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 + vmovdqu 0x40(%rdi), %ymm0 + vmovdqu 0x1d0(%rdi), %ymm1 + vmovdqu %ymm3, 0xc0(%rsp) + vmovdqu 0x108(%rdi), %ymm3 + vmovdqu %ymm14, %ymm10 + vmovdqu %ymm7, 0xa0(%rsp) + + // Load, Interleave, and Store 32 bytes from each of the 4 states (A(8-11)) + vpunpcklqdq %ymm3, %ymm0, %ymm2 + vpunpckhqdq %ymm3, %ymm0, %ymm0 + vpunpcklqdq %ymm4, %ymm1, %ymm3 + vpunpckhqdq %ymm4, %ymm1, %ymm1 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm11 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 + vmovdqu %ymm3, 0x100(%rsp) + vperm2i128 $0x31, %ymm1, %ymm0, %ymm8 + vmovdqu 0x128(%rdi), %ymm3 + vmovdqu 0x60(%rdi), %ymm0 + vmovdqu 0x1f0(%rdi), %ymm1 + vmovdqu %ymm7, 0xe0(%rsp) + vmovdqu 
%ymm11, %ymm14 + vmovdqu 0x2b8(%rdi), %ymm4 + vmovdqu 0x2f8(%rdi), %ymm5 + + // Load, Interleave, and Store 32 bytes from each of the 4 states (A(12-15)) + vpunpcklqdq %ymm3, %ymm0, %ymm2 + vpunpckhqdq %ymm3, %ymm0, %ymm0 + vpunpcklqdq %ymm4, %ymm1, %ymm3 + vpunpckhqdq %ymm4, %ymm1, %ymm1 + vmovdqu 0x2d8(%rdi), %ymm4 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm15 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm9 + vmovdqu %ymm3, 0x140(%rsp) + vmovdqu 0x80(%rdi), %ymm0 + vmovdqu 0x148(%rdi), %ymm3 + vmovdqu 0x210(%rdi), %ymm1 + vmovdqu %ymm7, 0x120(%rsp) + + // Load, Interleave, and Store 32 bytes from each of the 4 states (A(16-19)) + vpunpcklqdq %ymm3, %ymm0, %ymm2 + vpunpckhqdq %ymm3, %ymm0, %ymm0 + vpunpcklqdq %ymm4, %ymm1, %ymm3 + vpunpckhqdq %ymm4, %ymm1, %ymm1 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm3 + vmovdqu %ymm7, 0x160(%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 + vmovdqu 0xa0(%rdi), %ymm0 + vmovdqu 0x230(%rdi), %ymm1 + vmovdqu %ymm3, 0x1a0(%rsp) + vmovdqu 0x168(%rdi), %ymm3 + + // Load, Interleave, and Store 32 bytes from each of the 4 states (A(20-23)) + vpunpcklqdq %ymm5, %ymm1, %ymm4 + vpunpckhqdq %ymm5, %ymm1, %ymm1 + vmovdqu %ymm7, 0x180(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 + vpunpckhqdq %ymm3, %ymm0, %ymm0 + vperm2i128 $0x20, %ymm4, %ymm2, %ymm12 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm3 + vperm2i128 $0x31, %ymm4, %ymm2, %ymm7 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm4 + + // Load, Interleave, and Store 8 bytes from each of the 4 states (A24) + // A24 is the last element (only 8 bytes per state) + vmovq 0x250(%rdi), %xmm0 // Load state2(24) into lower 64 bits of %xmm0 + vmovq 0xc0(%rdi), %xmm1 // Load state0(24) into lower 64 bits of %xmm1 + vmovdqu %ymm12, 0x1c0(%rsp) + vmovdqu %ymm4, 0x1e0(%rsp) + vpinsrq $0x1, 0x318(%rdi), %xmm0, %xmm0 // Insert state3(24) into upper 64 bits of %xmm0 = [state2[24] | 
state3[24]] + vpinsrq $0x1, 0x188(%rdi), %xmm1, %xmm1 // Insert state1(24) into upper 64 bits of %xmm1 = [state0[24] | state1[24]] + vinserti128 $0x1, %xmm0, %ymm1, %ymm2 // Interleave into %ymm2 = A24 = (state0[24) | state1[24] | state2[24] | state3[24]] + + // Initialize the loop counter + mov $0, %r10 + + Lkeccak_f1600_x4_avx2: + + // ===================================================================== + // Theta Step + // ===================================================================== + // Compute the column parities C(x) = A(x,0) xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4] + // Then D(x) = C-1(x) xor ROL(C1(x), 1) + // Then A'(x,y) = A[x,y] xor D[x] + + // Theta step + vmovdqu 0xa0(%rsp), %ymm4 + vpxor 0x1c0(%rsp), %ymm9, %ymm0 // A(0,3) xor A[0,4] (A[15] xor A[20]) + vmovdqu %ymm9, 0x200(%rsp) + vmovdqu %ymm10, %ymm9 + vmovdqu 0xc0(%rsp), %ymm11 + vmovdqu 0x160(%rsp), %ymm12 + vmovdqu %ymm3, 0x240(%rsp) + vpxor 0x100(%rsp), %ymm4, %ymm1 // A(0,1) xor A[0,2] (A[5] xor A[10]) + vmovdqu 0x40(%rsp), %ymm10 + vmovdqu %ymm4, 0x220(%rsp) + vpxor %ymm3, %ymm12, %ymm12 // A(1,3) xor A[1,4] (A[16] xor A[21]) + vmovdqu 0x20(%rsp), %ymm6 + vmovdqu 0x140(%rsp), %ymm4 + vmovdqu %ymm14, 0x2a0(%rsp) + vpxor %ymm1, %ymm0, %ymm0 // (A(0,3) xor A[0,4]) xor (A[0,1] xor A[0,2]) ((A[15] xor A[20]) xor (A[5] xor A[10])) + vpxor %ymm8, %ymm11, %ymm1 // A(1,1) xor A[1,2] (A[6] xor A[11]) + vpxor 0x180(%rsp), %ymm7, %ymm11 // A(2,4) xor A[2,3] (A[22] xor A[17]) + vmovdqu %ymm10, 0x280(%rsp) + vpxor %ymm1, %ymm12, %ymm12 // (A(1,3) xor A[1,4]) xor (A[1,1] xor A[1,2]) ((A[16] xor A[21]) xor (A[6] xor A[11])) + vpxor %ymm15, %ymm9, %ymm1 // A(2,1) xor A[2,2] (A[7] xor A[12]) + vmovdqu 0xe0(%rsp), %ymm3 + vmovdqu %ymm8, 0x260(%rsp) + vpxor %ymm1, %ymm11, %ymm11 // (A(2,4) xor A[2,3]) xor (A[2,1] xor A[2,2]) ((A[22] xor A[17]) xor (A[7] xor A[12])) + vpxor 0x120(%rsp), %ymm14, %ymm1 // A(3,1) xor A[3,2] (A[8] xor A[13]) + vpxor %ymm6, %ymm12, %ymm12 // C1 = A(1,0) xor A[1,1] xor 
A[1,2] xor A[1,3] xor A[1,4] (A[1] xor A[6] xor A[11] xor A[16] xor A[21]) + vmovdqu 0x60(%rsp), %ymm8 + vpxor %ymm10, %ymm11, %ymm11 // C2 = A(2,0) xor A[2,1] xor A[2,2] xor A[2,3] xor A[2,4] (A[2] xor A[7] xor A[12] xor A[17] xor A[22]) + vpxor 0x1e0(%rsp), %ymm13, %ymm10 // A(3,3) xor A[3,4] (A[18] xor A[23]) + vpxor %ymm4, %ymm3, %ymm3 // A(4,1) xor A[4,2] (A[9] xor A[14]) + vmovdqu %ymm4, 0x2c0(%rsp) + vpsrlq $0x3f, %ymm12, %ymm4 + vpsrlq $0x3f, %ymm11, %ymm5 + vpxor 0x0(%rsp), %ymm0, %ymm0 // C(0) = A[0,0] xor A[0,1] xor A[0,2] xor A[0,3] xor A[0,4] (A[0] xor A[5] xor A[10] xor A[15] xor A[20]) + vpxor %ymm1, %ymm10, %ymm10 // (A(3,3) xor A[3,4]) xor (A[3,1] xor A[3,2]) ((A[18] xor A[23]) xor (A[8] xor A[13])) + vmovdqu 0x80(%rsp), %ymm1 + vpxor %ymm8, %ymm10, %ymm10 // C3 = A(3,0) xor A[3,1] xor A[3,2] xor A[3,3] xor A[3,4] (A[3] xor A[8] xor A[13] xor A[18] xor A[23]) + vmovdqu %ymm1, %ymm14 + vpxor 0x1a0(%rsp), %ymm2, %ymm1 // A(4,4) xor A[4,3] (A[24] xor A[19]) + vmovdqu %ymm14, 0x2e0(%rsp) + vpxor %ymm3, %ymm1, %ymm1 // (A(4,4) xor A[4,3]) xor (A[4,1] xor A[4,2]) ((A[24] xor A[19]) xor (A[9] xor A[14])) + vpsllq $0x1, %ymm12, %ymm3 + vpor %ymm4, %ymm3, %ymm3 // ROL(C1, 1) + vpsllq $0x1, %ymm11, %ymm4 + vpxor %ymm14, %ymm1, %ymm1 // C4 = A(4,0) xor A[4,1] xor A[4,2] xor A[4,3] xor A[4,4] (A[4] xor A[9] xor A[14] xor A[19] xor A[24]) + + // C0 = %ymm0 + // C1 = %ymm12 + // C2 = %ymm11 + // C3 = %ymm10 + // C4 = %ymm1 + + vpor %ymm5, %ymm4, %ymm4 // ROL(C2, 1) + vpsrlq $0x3f, %ymm10, %ymm14 + vpxor %ymm1, %ymm3, %ymm3 // D0 = C(4) xor ROL(C(1), 1) + vpsllq $0x1, %ymm10, %ymm5 + vpxor %ymm0, %ymm4, %ymm4 // D1 = C(0) xor ROL(C(2), 1) + vpor %ymm14, %ymm5, %ymm5 // ROL(C3, 1) + vpxor %ymm6, %ymm4, %ymm6 // A'(1,0) (A'[1]) = A[1,0] (A[1]) xor D[1] + vpxor %ymm12, %ymm5, %ymm5 // D2 = C(1) xor ROL(C(3), 1) + vpsrlq $0x3f, %ymm1, %ymm12 + vpsllq $0x1, %ymm1, %ymm1 + vpxor %ymm7, %ymm5, %ymm7 // A'(2,4) (A'[22]) = A[2,4] (A[22]) xor D[2] + vpxor %ymm9, %ymm5, 
%ymm9 // A'(2,1) (A'[7]) = A[2,1] (A[7]) xor D[2] + vpor %ymm12, %ymm1, %ymm1 // ROL(C4, 1) + vpxor 0x0(%rsp), %ymm3, %ymm12 // A'(0,0) (A'[0]) = A[0,0] (A[0]) xor D[0] + vpxor %ymm11, %ymm1, %ymm1 // D3 = C(2) xor ROL(C(4), 1) + vpsrlq $0x3f, %ymm0, %ymm11 + vpsllq $0x1, %ymm0, %ymm0 + vpxor %ymm13, %ymm1, %ymm13 // A'(3,3) (A'[18]) = A[3,3] (A[18]) xor D[3] + vpxor %ymm8, %ymm1, %ymm8 // A'(3,0) (A'[3]) = A[3,0] (A[3]) xor D[3] + vpor %ymm11, %ymm0, %ymm0 // ROL(C0, 1) + vpxor %ymm10, %ymm0, %ymm0 // D4 = C(3) xor ROL(C(0), 1) + + // D0 = %ymm3 + // D1 = %ymm4 + // D2 = %ymm5 + // D3 = %ymm1 + // D4 = %ymm0 + + vpxor 0xc0(%rsp), %ymm4, %ymm10 // A'(1,1) (A'[6]) = A[1,1] (A[6]) xor D[1] + vpxor %ymm2, %ymm0, %ymm2 // A'(4,4) (A'[24]) = A[4,4] (A[24]) xor D[4] + + // Rho, Pi, and Chi Steps (interleaved for performance) + // B(x,y) = ROL(A'[...], rotation_constant) placed at position determined by Pi + // A''(x,y) = B[x,y] XOR ((NOT B[x+1,y]) AND B[x+2,y]) + + vpsrlq $0x14, %ymm10, %ymm11 + vpsllq $0x2c, %ymm10, %ymm10 + vpor %ymm11, %ymm10, %ymm10 // B(1,0) (B[1]) = ROL(A'[1,1] (A'[6]), 44) + + vpxor %ymm15, %ymm5, %ymm11 // A'12 = A'(2,2) = A[2,2] (A[12]) xor D[2] + vpbroadcastq (%rsi), %ymm15 // Load Round Constant (RC) + vpsrlq $0x15, %ymm11, %ymm14 + vpsllq $0x2b, %ymm11, %ymm11 + vpor %ymm14, %ymm11, %ymm11 // B(2,0) (B[2]) = ROL(A'[2,2] (A'[12]), 43) + + vpandn %ymm11, %ymm10, %ymm14 + vpxor %ymm15, %ymm14, %ymm14 + vpxor %ymm12, %ymm14, %ymm15 // (A''0) A''(0,0) = B[0,0] xor ((not B[1,0]) and B[2,0]) xor RC + + vpsrlq $0x2b, %ymm13, %ymm14 + vpsllq $0x15, %ymm13, %ymm13 + vmovdqu %ymm15, 0x0(%rsp) // store A''(0,0) (A''[0]) -> on stack + vpor %ymm14, %ymm13, %ymm13 // B(3,0) (B[3]) = ROL(A'[3,3] (A'[18]), 21) + + vpandn %ymm13, %ymm11, %ymm14 + vpxor %ymm10, %ymm14, %ymm15 // (A''1) A''(1,0) = B[1,0] xor ((not B[2,0]) and B[3,0]) + + vpsrlq $0x32, %ymm2, %ymm14 + vpsllq $0xe, %ymm2, %ymm2 + vmovdqu %ymm15, 0x20(%rsp) // store A''(1,0) (A''[1]) -> on stack + 
vpor %ymm14, %ymm2, %ymm2 // B(4,0) (B[4]) = ROL(A'[4,4] (A'[24]), 14) + + // **** B0-B(4) Register Allocation Map **** + // B0 (B(0,0)) %ymm12 (A'[0,0] (A'[0]) unchanged, no rotation) + // B1 (B(1,0)) %ymm10 ROL(A'[1,1] (A'[6]), 44) + // B2 (B(2,0)) %ymm11 ROL(A'[2,2] (A'[12]), 43) + // B3 (B(3,0)) %ymm13 ROL(A'[3,3] (A'[18]), 21) + // B4 (B(4,0)) %ymm2 ROL(A'[4,4] (A'[24]), 14) + + vpandn %ymm2, %ymm13, %ymm14 + vpxor %ymm11, %ymm14, %ymm11 // (A''2) A''(2,0) = B[2,0] xor ((not B[3,0]) and B[4,0]) + vmovdqu %ymm11, 0x40(%rsp) // store A''(2,0) (A''[2]) -> on stack + vpandn %ymm12, %ymm2, %ymm11 + vpandn %ymm10, %ymm12, %ymm12 + vpxor %ymm13, %ymm11, %ymm11 // (A''3) A''(3,0) = B[3,0] xor ((not B[4,0]) and B[0,0]) + vmovdqu %ymm11, 0x60(%rsp) // store A''(3,0) (A''[3]) -> on stack + vpxor %ymm2, %ymm12, %ymm11 // (A''4) A''(4,0) = B[4,0] xor ((not B[0,0]) and B[1,0]) + + vpsrlq $0x24, %ymm8, %ymm2 + vpsllq $0x1c, %ymm8, %ymm8 + vmovdqu %ymm11, 0x80(%rsp) // store A''(4,0) (A''[4]) -> on stack + vpor %ymm2, %ymm8, %ymm8 // B(0,1) (B[5]) = ROL(A'[3,0] (A'[3]), 28) + + vpxor 0xe0(%rsp), %ymm0, %ymm2 // A'(9) = A'[4,1] = A[4,1] (A[9]) xor D[4] + vpsrlq $0x2c, %ymm2, %ymm10 + vpsllq $0x14, %ymm2, %ymm2 + vpor %ymm10, %ymm2, %ymm2 // B(1,1) (B[6]) = ROL(A'[4,1] (A'[9]), 20) + + vpxor 0x100(%rsp), %ymm3, %ymm10 // A'(10) = A'[0,2] = A[0,2] (A[10]) xor D[0] + vpsrlq $0x3d, %ymm10, %ymm11 + vpsllq $0x3, %ymm10, %ymm10 + vpor %ymm11, %ymm10, %ymm10 // B(2,1) (B[7]) = ROL(A'[0,2] (A'[10]), 3) + + vpandn %ymm10, %ymm2, %ymm11 + vpxor %ymm8, %ymm11, %ymm11 // (A''5) A''(0,1) = B[0,1] xor ((not B[1,1]) and B[2,1]) + vmovdqu %ymm11, 0xa0(%rsp) // store A''(0,1) (A''[5]) -> on stack + + vpxor 0x160(%rsp), %ymm4, %ymm11 // A'(16) = A'[1,3] = A[1,3] (A[16]) xor D[1] + vpsrlq $0x13, %ymm11, %ymm12 + vpsllq $0x2d, %ymm11, %ymm11 + vpor %ymm12, %ymm11, %ymm11 // B(3,1) (B[8]) = ROL(A'[1,3] (A'[16]), 45) + + vpandn %ymm11, %ymm10, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 // (A''6) A''(1,1) 
= B[1,1] xor ((not B[2,1]) and B[3,1]) + vmovdqu %ymm12, 0xc0(%rsp) // store A''(1,1) (A''[6]) -> on stack + + vpsrlq $0x3, %ymm7, %ymm12 + vpsllq $0x3d, %ymm7, %ymm7 + vpor %ymm12, %ymm7, %ymm7 // B(4,1) (B[9]) = ROL(A'[2,4] (A'[22]), 61) + + // **** B5-B(9) Register Allocation Map **** + // B5 (B(0,1)) %ymm8 ROL(A'[3,0] (A'[3]), 28) + // B6 (B(1,1)) %ymm2 ROL(A'[4,1] (A'[9]), 20) + // B7 (B(2,1)) %ymm10 ROL(A'[0,2] (A'[10]), 3) + // B8 (B(3,1)) %ymm11 ROL(A'[1,3] (A'[16]), 45) + // B9 (B(4,1)) %ymm7 ROL(A'[2,4] (A'[22]), 61) + + vpandn %ymm7, %ymm11, %ymm12 + vpxor %ymm10, %ymm12, %ymm10 // (A''7) A''(2,1) = B[2,1] xor ((not B[3,1]) and B[4,1]) + + vpandn %ymm8, %ymm7, %ymm12 + vpandn %ymm2, %ymm8, %ymm8 + + vpsrlq $0x3f, %ymm6, %ymm2 + vpsllq $0x1, %ymm6, %ymm6 + vpxor %ymm11, %ymm12, %ymm14 // (A''8) A''(3,1) = B[3,1] xor ((not B[4,1]) and B[0,1]) + vpor %ymm2, %ymm6, %ymm6 // B(0,2) (B[10]) = ROL(A'[1,0] (A'[1]), 1) + + vpsrlq $0x3a, %ymm9, %ymm2 + vpxor %ymm7, %ymm8, %ymm12 // (A''9) A''(4,1) = B[4,1] xor ((not B[0,1]) and B[1,1]) + vpsllq $0x6, %ymm9, %ymm9 + vmovdqu %ymm12, 0xe0(%rsp) // store A''(4,1) (A''[9]) -> on stack + + vpxor 0x1a0(%rsp), %ymm0, %ymm7 // A'(19) = A'[4,3] = A[4,3] (A[19]) xor D[4] + vpor %ymm2, %ymm9, %ymm9 // B(1,2) (B[11]) = ROL(A'[2,1] (A'[7]), 6) + + vpxor 0x120(%rsp), %ymm1, %ymm2 // A'(13) = A'[3,2] = A[3,2] (A[13]) xor D[3] + vpshufb (%rdx), %ymm7, %ymm7 // B(3,2) (B[13]) = ROL(A'[4,3] (A'[19]), 8) + vpsrlq $0x27, %ymm2, %ymm11 + vpsllq $0x19, %ymm2, %ymm2 + vpor %ymm2, %ymm11, %ymm11 // B(2,2) (B[12]) = ROL(A'[3,2] (A'[13]), 25) + + vpandn %ymm11, %ymm9, %ymm2 + vpandn %ymm7, %ymm11, %ymm8 + vpxor %ymm6, %ymm2, %ymm12 // (A''10) A''(0,2) = B[0,2] xor ((not B[1,2]) and B[2,2]) + + vpxor 0x1c0(%rsp), %ymm3, %ymm2 // A'(20) = A'[0,4] = A[0,4] (A[20]) xor D[0] + vpxor %ymm9, %ymm8, %ymm8 // (A''11) A''(1,2) = B[1,2] xor ((not B[2,2]) and B[3,2]) + vmovdqu %ymm12, 0x100(%rsp) // store A''(0,2) (A''[10]) -> on stack + vpsrlq $0x2e, %ymm2, %ymm12 + vpsllq $0x12,
%ymm2, %ymm2 + vpor %ymm2, %ymm12, %ymm2 // B(4,2) (B[14]) = ROL(A'[0,4] (A'[20]), 18) + + // **** B10-B(14) Register Allocation Map **** + // B10 (B(0,2)) %ymm6 ROL(A'[1,0] (A'[1]), 1) + // B11 (B(1,2)) %ymm9 ROL(A'[2,1] (A'[7]), 6) + // B12 (B(2,2)) %ymm11 ROL(A'[3,2] (A'[13]), 25) + // B13 (B(3,2)) %ymm7 ROL(A'[4,3] (A'[19]), 8) + // B14 (B(4,2)) %ymm2 ROL(A'[0,4] (A'[20]), 18) + + vpandn %ymm2, %ymm7, %ymm12 + vpxor %ymm11, %ymm12, %ymm15 // (A''12) A''(2,2) = B[2,2] xor ((not B[3,2]) and B[4,2]) + + vpandn %ymm6, %ymm2, %ymm11 + vpandn %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm11, %ymm12 // (A''13) A''(3,2) = B[3,2] xor ((not B[4,2]) and B[0,2]) + vmovdqu %ymm12, 0x120(%rsp) // store A''(3,2) (A''[13]) -> on stack + + vpxor %ymm2, %ymm6, %ymm12 // (A''14) A''(4,2) = B[4,2] xor ((not B[0,2]) and B[1,2]) + + vpxor 0x2e0(%rsp), %ymm0, %ymm6 // A'(4) = A'[4,0] = A[4,0] (A[4]) xor D[4] + vpxor 0x2c0(%rsp), %ymm0, %ymm0 // A'(14) = A'[4,2] = A[4,2] (A[14]) xor D[4] + vmovdqu %ymm12, 0x140(%rsp) // store A''(4,2) (A''[14]) -> on stack + vpsrlq $0x25, %ymm6, %ymm2 + vpsllq $0x1b, %ymm6, %ymm6 + vpor %ymm6, %ymm2, %ymm2 // B(0,3) (B[15]) = ROL(A'[4,0] (A'[4]), 27) + + vpxor 0x220(%rsp), %ymm3, %ymm6 // A'(5) = A'[0,1] = A[0,1] (A[5]) xor D[0] + vpxor 0x200(%rsp), %ymm3, %ymm3 // A'(15) = A'[0,3] = A[0,3] (A[15]) xor D[0] + vpsrlq $0x1c, %ymm6, %ymm7 + vpsllq $0x24, %ymm6, %ymm6 + vpor %ymm6, %ymm7, %ymm7 // B(1,3) (B[16]) = ROL(A'[0,1] (A'[5]), 36) + + vpxor 0x260(%rsp), %ymm4, %ymm6 // A'(11) = A'[1,2] = A[1,2] (A[11]) xor D[1] + vpxor 0x240(%rsp), %ymm4, %ymm4 // A'(21) = A'[1,4] = A[1,4] (A[21]) xor D[1] + vpsrlq $0x36, %ymm6, %ymm12 + vpsllq $0xa, %ymm6, %ymm6 + vpor %ymm6, %ymm12, %ymm12 // B(2,3) (B[17]) = ROL(A'[1,2] (A'[11]), 10) + + vpxor 0x180(%rsp), %ymm5, %ymm6 // A'(17) = A'[2,3] = A[2,3] (A[17]) xor D[2] + vpxor 0x280(%rsp), %ymm5, %ymm5 // A'(2) = A'[2,0] = A[2,0] (A[2]) xor D[2] + + vpandn %ymm12, %ymm7, %ymm9 + vpsrlq $0x31, %ymm6, %ymm11 + vpsllq $0xf, 
%ymm6, %ymm6 + vpxor %ymm2, %ymm9, %ymm9 // (A''15) A''(0,3) = B[0,3] xor ((not B[1,3]) and B[2,3]) + vpor %ymm6, %ymm11, %ymm11 // B(3,3) (B[18]) = ROL(A'[2,3] (A'[17]), 15) + + vpandn %ymm11, %ymm12, %ymm6 + vpxor %ymm7, %ymm6, %ymm6 // (A''16) A''(1,3) = B[1,3] xor ((not B[2,3]) and B[3,3]) + vmovdqu %ymm6, 0x160(%rsp) // store A''(1,3) (A''[16]) -> on stack + + vpxor 0x1e0(%rsp), %ymm1, %ymm6 // A'(23) = A'[3,4] = A[3,4] (A[23]) xor D[3] + vpxor 0x2a0(%rsp), %ymm1, %ymm1 // A'(8) = A'[3,1] = A[3,1] (A[8]) xor D[3] + vpshufb (%rcx), %ymm6, %ymm6 // B(4,3) (B[19]) = ROL(A'[3,4] (A'[23]), 56) + + // **** B15-B(19) Register Allocation Map **** + // B15 (B(0,3)) %ymm2 ROL(A'[4,0] (A'[4]), 27) + // B16 (B(1,3)) %ymm7 ROL(A'[0,1] (A'[5]), 36) + // B17 (B(2,3)) %ymm12 ROL(A'[1,2] (A'[11]), 10) + // B18 (B(3,3)) %ymm11 ROL(A'[2,3] (A'[17]), 15) + // B19 (B(4,3)) %ymm6 ROL(A'[3,4] (A'[23]), 56) + + vpandn %ymm6, %ymm11, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 // (A''17) A''(2,3) = B[2,3] xor ((not B[3,3]) and B[4,3]) + vmovdqu %ymm13, 0x180(%rsp) // store A''(2,3) (A''[17]) -> on stack + + vpandn %ymm2, %ymm6, %ymm13 + vpandn %ymm7, %ymm2, %ymm2 + vpxor %ymm6, %ymm2, %ymm2 // (A''19) A''(4,3) = B[4,3] xor ((not B[0,3]) and B[1,3]) + + vpsrlq $0x3e, %ymm4, %ymm6 + vpxor %ymm11, %ymm13, %ymm13 // (A''18) A''(3,3) = B[3,3] xor ((not B[4,3]) and B[0,3]) + vmovdqu %ymm2, 0x1a0(%rsp) // store A''(4,3) (A''[19]) -> on stack + vpsrlq $0x2, %ymm5, %ymm2 + vpsllq $0x3e, %ymm5, %ymm5 + vpor %ymm5, %ymm2, %ymm2 // B(0,4) (B[20]) = ROL(A'[2,0] (A'[2]), 62) + + vpsrlq $0x9, %ymm1, %ymm5 + vpsllq $0x37, %ymm1, %ymm1 + + vpsllq $0x2, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm1 // B(1,4) (B[21]) = ROL(A'[3,1] (A'[8]), 55) + + vpsrlq $0x19, %ymm0, %ymm5 + vpor %ymm4, %ymm6, %ymm4 // B(4,4) (B[24]) = ROL(A'[1,4] (A'[21]), 2) + vpsllq $0x27, %ymm0, %ymm0 + vpor %ymm0, %ymm5, %ymm5 // B(2,4) (B[22]) = ROL(A'[4,2] (A'[14]), 39) + + vpandn %ymm5, %ymm1, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 // (A''20)
A''(0,4) = B[0,4] xor ((not B[1,4]) and B[2,4]) + vmovdqu %ymm0, 0x1c0(%rsp) // store A''(0,4) (A''[20]) -> on stack + + vpsrlq $0x17, %ymm3, %ymm0 + vpsllq $0x29, %ymm3, %ymm3 + vpor %ymm3, %ymm0, %ymm0 // B(3,4) (B[23]) = ROL(A'[0,3] (A'[15]), 41) + + // **** B20-B(24) Register Allocation Map **** + // B20 (B(0,4)) %ymm2 ROL(A'[2,0] (A'[2]), 62) + // B21 (B(1,4)) %ymm1 ROL(A'[3,1] (A'[8]), 55) + // B22 (B(2,4)) %ymm5 ROL(A'[4,2] (A'[14]), 39) + // B23 (B(3,4)) %ymm0 ROL(A'[0,3] (A'[15]), 41) + // B24 (B(4,4)) %ymm4 ROL(A'[1,4] (A'[21]), 2) + + vpandn %ymm4, %ymm0, %ymm7 + vpandn %ymm0, %ymm5, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 // (A''22) A''(2,4) = B[2,4] xor ((not B[3,4]) and B[4,4]) + + vpandn %ymm2, %ymm4, %ymm5 + vpandn %ymm1, %ymm2, %ymm2 + vpxor %ymm0, %ymm5, %ymm5 // (A''23) A''(3,4) = B[3,4] xor ((not B[4,4]) and B[0,4]) + + vpxor %ymm1, %ymm3, %ymm3 // (A''21) A''(1,4) = B[1,4] xor ((not B[2,4]) and B[3,4]) + + vpxor %ymm4, %ymm2, %ymm2 // (A''24) A''(4,4) = B[4,4] xor ((not B[0,4]) and B[1,4]) + vmovdqu %ymm5, 0x1e0(%rsp) // store A''(3,4) (A''[23]) -> on stack + + add $8, %rsi + add $1, %r10 + cmp $0x18, %r10 + jne Lkeccak_f1600_x4_avx2 + + // Load, De-interleave, and Store 32 bytes to each of the 4 states (A(0-3)) + vmovdqu 0x0(%rsp), %ymm4 // Load A(0) from stack + vmovdqu 0x40(%rsp), %ymm5 // Load A(2) from stack + vmovdqu 0x20(%rsp), %ymm0 // Load A(1) from stack + vmovdqu 0x60(%rsp), %ymm1 // Load A(3) from stack + vmovdqu 0x1c0(%rsp), %ymm12 + vmovdqu %ymm2, 0x1c0(%rsp) + + // De-interleave %ymm4(A0) and %ymm0(A(1)) + vpunpcklqdq %ymm0, %ymm4, %ymm2 // %ymm2 = (state0[0) | state0[1] | state2[0] | state2[1]] + vpunpckhqdq %ymm0, %ymm4, %ymm0 // %ymm0 = (state1[0) | state1[1] | state3[0] | state3[1]] + // De-interleave %ymm5(A2) and %ymm1(A(3)) + vpunpcklqdq %ymm1, %ymm5, %ymm4 // %ymm4 = (state0[2) | state0[3] | state2[2] | state2[3]] + vpunpckhqdq %ymm1, %ymm5, %ymm1 // %ymm1 = (state1[2) | state1[3] | state3[2] | state3[3]] + + // Permute 128-bit
lanes to complete the de-interleave + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 // %ymm6 = (state0[0) | state0[1] | state0[2] | state0[3]] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 // %ymm2 = (state2[0) | state2[1] | state2[2] | state2[3]] + vmovdqu 0x80(%rsp), %ymm4 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm5 // %ymm5 = (state1[0) | state1[1] | state1[2] | state1[3]] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 // %ymm0 = (state3[0) | state3[1] | state3[2] | state3[3]] + + // Store de-interleaved results back to output + vmovdqu %ymm6, (%rdi) // Store state0(0, 1, 2, 3) (32 bytes to Output (%rdi) offset: 0x00) + vmovdqu %ymm5, 0xc8(%rdi) // Store state1(0, 1, 2, 3) (32 bytes to Output (%rdi) offset: 0xC8) + vmovdqu %ymm2, 0x190(%rdi) // Store state2(0, 1, 2, 3) (32 bytes to Output (%rdi) offset: 0x190) + vmovdqu %ymm0, 0x258(%rdi) // Store state3(0, 1, 2, 3) (32 bytes to Output (%rdi) offset: 0x258) + + // Load, De-interleave, and Store 32 bytes to each of the 4 states (A(4-7)) + vmovdqu 0xa0(%rsp), %ymm0 + vpunpcklqdq %ymm0, %ymm4, %ymm2 + vpunpckhqdq %ymm0, %ymm4, %ymm1 + vmovdqu 0xc0(%rsp), %ymm0 + vpunpcklqdq %ymm10, %ymm0, %ymm4 + vpunpckhqdq %ymm10, %ymm0, %ymm0 + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 + vmovdqu 0xe0(%rsp), %ymm4 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 + vmovdqu 0x100(%rsp), %ymm0 + vmovdqu %ymm2, 0x1b0(%rdi) + vmovdqu %ymm1, 0x278(%rdi) + + // Load, De-interleave, and Store 32 bytes to each of the 4 states (A(8-11)) + vpunpcklqdq %ymm4, %ymm14, %ymm2 + vpunpckhqdq %ymm4, %ymm14, %ymm1 + vpunpcklqdq %ymm8, %ymm0, %ymm4 + vpunpckhqdq %ymm8, %ymm0, %ymm0 + vmovdqu %ymm6, 0x20(%rdi) + vmovdqu %ymm5, 0xe8(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 + vmovdqu 0x120(%rsp), %ymm4 + vmovdqu 0x140(%rsp), %ymm0 + vmovdqu %ymm2, 0x1d0(%rdi) + vmovdqu %ymm1, 0x298(%rdi) 
+ + // Load, De-interleave, and Store 32 bytes to each of the 4 states (A(12-15)) + vpunpcklqdq %ymm4, %ymm15, %ymm2 + vpunpckhqdq %ymm4, %ymm15, %ymm1 + vpunpcklqdq %ymm9, %ymm0, %ymm4 + vmovdqu %ymm5, 0x108(%rdi) + vpunpckhqdq %ymm9, %ymm0, %ymm0 + vmovdqu %ymm6, 0x40(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 + vmovdqu 0x160(%rsp), %ymm4 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 + vmovdqu 0x180(%rsp), %ymm0 + vmovdqu %ymm5, 0x128(%rdi) + vmovdqu 0x1a0(%rsp), %ymm5 + vmovdqu %ymm2, 0x1f0(%rdi) + + // Load, De-interleave, and Store 32 bytes to each of the 4 states (A(16-19)) + vpunpcklqdq %ymm0, %ymm4, %ymm2 + vpunpckhqdq %ymm0, %ymm4, %ymm0 + vpunpcklqdq %ymm5, %ymm13, %ymm4 + vmovdqu %ymm6, 0x60(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 + vmovdqu %ymm1, 0x2b8(%rdi) + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 + vpunpckhqdq %ymm5, %ymm13, %ymm1 + vmovdqu %ymm6, 0x80(%rdi) + vmovdqu 0x1e0(%rsp), %ymm4 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm5 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 + vmovdqu %ymm2, 0x210(%rdi) + + // Load, De-interleave, and Store 32 bytes to each of the 4 states (A(20-23)) + vpunpcklqdq %ymm3, %ymm12, %ymm2 + vmovdqu %ymm0, 0x2d8(%rdi) + vpunpckhqdq %ymm3, %ymm12, %ymm0 + vpunpcklqdq %ymm4, %ymm7, %ymm3 + vpunpckhqdq %ymm4, %ymm7, %ymm1 + vmovdqu %ymm5, 0x148(%rdi) + vperm2i128 $0x20, %ymm3, %ymm2, %ymm5 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm2 + vmovdqu 0x1c0(%rsp), %ymm3 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm4 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 + + // Store de-interleaved results back to output + vmovdqu %ymm5, 0xa0(%rdi) + vextracti128 $0x1, %ymm3, %xmm15 + vmovdqu %ymm4, 0x168(%rdi) + vmovdqu %ymm2, 0x230(%rdi) + vmovdqu %ymm0, 0x2f8(%rdi) + + // Load, De-interleave, and Store 8 bytes to each of the 4 states (A24) + // A24 is the last element (only 8 bytes per state) + vmovq %xmm3, 0xc0(%rdi) + vmovhpd %xmm3, 0x188(%rdi) + vmovq %xmm15, 0x250(%rdi) + vmovhpd 
%xmm15, 0x318(%rdi) + addq $0x300, %rsp + ret + +/* simpasm: footer-start */ +#endif /* MLD_FIPS202_X86_64_NEED_X4_AVX2 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/x86_64/src/keccakf1600_constants.c b/dev/fips202/x86_64/src/keccakf1600_constants.c new file mode 100644 index 000000000..c4be6ccb5 --- /dev/null +++ b/dev/fips202/x86_64/src/keccakf1600_constants.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../../common.h" +#if defined(MLD_FIPS202_X86_64_NEED_X4_AVX2) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include + +#include "fips202_native_x86_64.h" + +MLD_ALIGN const uint64_t mld_keccakf1600_round_constants[] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008, +}; + +MLD_ALIGN const uint64_t mld_keccak_rho8[] = { + 0x0605040302010007, + 0x0e0d0c0b0a09080f, + 0x1615141312111017, + 0x1e1d1c1b1a19181f, +}; + +MLD_ALIGN const uint64_t mld_keccak_rho56[] = { + 0x0007060504030201, + 0x080f0e0d0c0b0a09, + 0x1017161514131211, + 0x181f1e1d1c1b1a19, +}; + +#else /* MLD_FIPS202_X86_64_NEED_X4_AVX2 && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(fips202_x86_64_constants) + +#endif /* !(MLD_FIPS202_X86_64_NEED_X4_AVX2 && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 302c14e72..52b77de73 100644 --- 
a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -102,7 +102,7 @@ #include "src/fips202/native/aarch64/src/keccakf1600_round_constants.c" #endif #if defined(MLD_SYS_X86_64) -#include "src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c" +#include "src/fips202/native/x86_64/src/keccakf1600_constants.c" #endif #if defined(MLD_SYS_ARMV81M_MVE) #include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c" @@ -570,13 +570,16 @@ /* * Undefine macros from native code (FIPS202, x86_64) */ -/* mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h */ -#undef MLD_FIPS202_NATIVE_X86_64_SRC_KECCAKP_1600_TIMES4_SIMD256_H -#undef mld_keccakf1600x4_permute24 -/* mldsa/src/fips202/native/x86_64/xkcp.h */ -#undef MLD_FIPS202_NATIVE_X86_64_XKCP_H -#undef MLD_FIPS202_X86_64_XKCP +/* mldsa/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h */ +#undef MLD_FIPS202_NATIVE_X86_64_KECCAK_F1600_X4_AVX2_H +#undef MLD_FIPS202_X86_64_NEED_X4_AVX2 #undef MLD_USE_FIPS202_X4_NATIVE +/* mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h */ +#undef MLD_FIPS202_NATIVE_X86_64_SRC_FIPS202_NATIVE_X86_64_H +#undef mld_keccak_f1600_x4_avx2 +#undef mld_keccak_rho56 +#undef mld_keccak_rho8 +#undef mld_keccakf1600_round_constants #endif /* MLD_SYS_X86_64 */ #if defined(MLD_SYS_ARMV81M_MVE) /* diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index a0dd45b62..1afd704c0 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -98,6 +98,7 @@ #include "src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S" #endif /* MLD_SYS_AARCH64 */ #if defined(MLD_SYS_X86_64) +#include "src/fips202/native/x86_64/src/keccak_f1600_x4_avx2.S" #endif #if defined(MLD_SYS_ARMV81M_MVE) #include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S" @@ -573,13 +574,16 @@ /* * Undefine macros from native code (FIPS202, x86_64) */ -/* mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h */ -#undef 
MLD_FIPS202_NATIVE_X86_64_SRC_KECCAKP_1600_TIMES4_SIMD256_H -#undef mld_keccakf1600x4_permute24 -/* mldsa/src/fips202/native/x86_64/xkcp.h */ -#undef MLD_FIPS202_NATIVE_X86_64_XKCP_H -#undef MLD_FIPS202_X86_64_XKCP +/* mldsa/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h */ +#undef MLD_FIPS202_NATIVE_X86_64_KECCAK_F1600_X4_AVX2_H +#undef MLD_FIPS202_X86_64_NEED_X4_AVX2 #undef MLD_USE_FIPS202_X4_NATIVE +/* mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h */ +#undef MLD_FIPS202_NATIVE_X86_64_SRC_FIPS202_NATIVE_X86_64_H +#undef mld_keccak_f1600_x4_avx2 +#undef mld_keccak_rho56 +#undef mld_keccak_rho8 +#undef mld_keccakf1600_round_constants #endif /* MLD_SYS_X86_64 */ #if defined(MLD_SYS_ARMV81M_MVE) /* diff --git a/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c b/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c index 9e3533529..95df8f70c 100644 --- a/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +++ b/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c @@ -1,9 +1,14 @@ /* - * Copyright (c) The mlkem-native project authors * Copyright (c) The mldsa-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. 
+ */ + #include "../../../../common.h" #if (defined(MLD_FIPS202_AARCH64_NEED_X1_SCALAR) || \ diff --git a/mldsa/src/fips202/native/auto.h b/mldsa/src/fips202/native/auto.h index 94fbe703f..29bb85206 100644 --- a/mldsa/src/fips202/native/auto.h +++ b/mldsa/src/fips202/native/auto.h @@ -17,7 +17,7 @@ #endif #if defined(MLD_SYS_X86_64) && defined(MLD_SYS_X86_64_AVX2) -#include "x86_64/xkcp.h" +#include "x86_64/keccak_f1600_x4_avx2.h" #endif /* We do not yet include the FIPS202 backend for Armv8.1-M+MVE by default diff --git a/mldsa/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h b/mldsa/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h new file mode 100644 index 000000000..565995304 --- /dev/null +++ b/mldsa/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_X86_64_KECCAK_F1600_X4_AVX2_H +#define MLD_FIPS202_NATIVE_X86_64_KECCAK_F1600_X4_AVX2_H + +#include "../../../common.h" + +#define MLD_FIPS202_X86_64_NEED_X4_AVX2 + +/* Part of backend API */ +#define MLD_USE_FIPS202_X4_NATIVE + +#if !defined(__ASSEMBLER__) +#include "../api.h" +#include "src/fips202_native_x86_64.h" +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + + mld_keccak_f1600_x4_avx2(state, mld_keccakf1600_round_constants, + mld_keccak_rho8, mld_keccak_rho56); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_X86_64_KECCAK_F1600_X4_AVX2_H */ diff --git a/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c b/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c deleted file mode 100644 index c323ed385..000000000 --- a/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c 
+++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) The mlkem-native project authors - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaƫl Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -/* - * Changes for mlkem-native/mldsa-native: - * - MLD_COPY_FROM_STATE and MLD_COPY_TO_STATE operate on uninterleaved - * Keccak states in memory. - */ - -#include "../../../../common.h" -#if defined(MLD_FIPS202_X86_64_XKCP) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) - -#include - -#include "KeccakP_1600_times4_SIMD256.h" - -#ifndef MLD_SYS_LITTLE_ENDIAN -#error Expecting a little-endian platform -#endif - -#define MLD_ANDNU256(a, b) _mm256_andnot_si256(a, b) -#define MLD_CONST256(a) _mm256_load_si256((const __m256i *)&(a)) -#define MLD_CONST256_64(a) (__m256i) _mm256_broadcast_sd((const double *)(&a)) -#define MLD_ROL64IN256(d, a, o) \ - d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64 - (o))) -#define MLD_ROL64IN256_8(d, a) \ - d = _mm256_shuffle_epi8(a, MLD_CONST256(mld_rho8)) -#define MLD_ROL64IN256_56(d, a) \ - d = _mm256_shuffle_epi8(a, MLD_CONST256(mld_rho56)) -static const uint64_t mld_rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, - 0x1615141312111017, 0x1E1D1C1B1A19181F}; -static const uint64_t mld_rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, - 0x1017161514131211, 0x181F1E1D1C1B1A19}; -#define MLD_STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) -#define MLD_XOR256(a, b) 
_mm256_xor_si256(a, b) -#define MLD_XOREQ256(a, b) a = _mm256_xor_si256(a, b) - -#define MLD_SNP_LANELENGTHINBYTES 8 - -#define MLD_DECLARE_ABCDE \ - __m256i Aba, Abe, Abi, Abo, Abu; \ - __m256i Aga, Age, Agi, Ago, Agu; \ - __m256i Aka, Ake, Aki, Ako, Aku; \ - __m256i Ama, Ame, Ami, Amo, Amu; \ - __m256i Asa, Ase, Asi, Aso, Asu; \ - __m256i Bba, Bbe, Bbi, Bbo, Bbu; \ - __m256i Bga, Bge, Bgi, Bgo, Bgu; \ - __m256i Bka, Bke, Bki, Bko, Bku; \ - __m256i Bma, Bme, Bmi, Bmo, Bmu; \ - __m256i Bsa, Bse, Bsi, Bso, Bsu; \ - __m256i Ca, Ce, Ci, Co, Cu; \ - __m256i Ca1, Ce1, Ci1, Co1, Cu1; \ - __m256i Da, De, Di, Do, Du; \ - __m256i Eba, Ebe, Ebi, Ebo, Ebu; \ - __m256i Ega, Ege, Egi, Ego, Egu; \ - __m256i Eka, Eke, Eki, Eko, Eku; \ - __m256i Ema, Eme, Emi, Emo, Emu; \ - __m256i Esa, Ese, Esi, Eso, Esu; - -#define MLD_prepareTheta \ - Ca = \ - MLD_XOR256(Aba, MLD_XOR256(Aga, MLD_XOR256(Aka, MLD_XOR256(Ama, Asa)))); \ - Ce = \ - MLD_XOR256(Abe, MLD_XOR256(Age, MLD_XOR256(Ake, MLD_XOR256(Ame, Ase)))); \ - Ci = \ - MLD_XOR256(Abi, MLD_XOR256(Agi, MLD_XOR256(Aki, MLD_XOR256(Ami, Asi)))); \ - Co = \ - MLD_XOR256(Abo, MLD_XOR256(Ago, MLD_XOR256(Ako, MLD_XOR256(Amo, Aso)))); \ - Cu = MLD_XOR256(Abu, MLD_XOR256(Agu, MLD_XOR256(Aku, MLD_XOR256(Amu, Asu)))); - -/* - * --- Theta Rho Pi Chi Iota Prepare-theta - * --- 64-bit lanes mapped to 64-bit words - */ -#define MLD_thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - MLD_ROL64IN256(Ce1, Ce, 1); \ - Da = MLD_XOR256(Cu, Ce1); \ - MLD_ROL64IN256(Ci1, Ci, 1); \ - De = MLD_XOR256(Ca, Ci1); \ - MLD_ROL64IN256(Co1, Co, 1); \ - Di = MLD_XOR256(Ce, Co1); \ - MLD_ROL64IN256(Cu1, Cu, 1); \ - Do = MLD_XOR256(Ci, Cu1); \ - MLD_ROL64IN256(Ca1, Ca, 1); \ - Du = MLD_XOR256(Co, Ca1); \ - \ - MLD_XOREQ256(A##ba, Da); \ - Bba = A##ba; \ - MLD_XOREQ256(A##ge, De); \ - MLD_ROL64IN256(Bbe, A##ge, 44); \ - MLD_XOREQ256(A##ki, Di); \ - MLD_ROL64IN256(Bbi, A##ki, 43); \ - E##ba = MLD_XOR256(Bba, MLD_ANDNU256(Bbe, Bbi)); \ - MLD_XOREQ256(E##ba, 
MLD_CONST256_64(mld_keccakf1600RoundConstants[i])); \ - Ca = E##ba; \ - MLD_XOREQ256(A##mo, Do); \ - MLD_ROL64IN256(Bbo, A##mo, 21); \ - E##be = MLD_XOR256(Bbe, MLD_ANDNU256(Bbi, Bbo)); \ - Ce = E##be; \ - MLD_XOREQ256(A##su, Du); \ - MLD_ROL64IN256(Bbu, A##su, 14); \ - E##bi = MLD_XOR256(Bbi, MLD_ANDNU256(Bbo, Bbu)); \ - Ci = E##bi; \ - E##bo = MLD_XOR256(Bbo, MLD_ANDNU256(Bbu, Bba)); \ - Co = E##bo; \ - E##bu = MLD_XOR256(Bbu, MLD_ANDNU256(Bba, Bbe)); \ - Cu = E##bu; \ - \ - MLD_XOREQ256(A##bo, Do); \ - MLD_ROL64IN256(Bga, A##bo, 28); \ - MLD_XOREQ256(A##gu, Du); \ - MLD_ROL64IN256(Bge, A##gu, 20); \ - MLD_XOREQ256(A##ka, Da); \ - MLD_ROL64IN256(Bgi, A##ka, 3); \ - E##ga = MLD_XOR256(Bga, MLD_ANDNU256(Bge, Bgi)); \ - MLD_XOREQ256(Ca, E##ga); \ - MLD_XOREQ256(A##me, De); \ - MLD_ROL64IN256(Bgo, A##me, 45); \ - E##ge = MLD_XOR256(Bge, MLD_ANDNU256(Bgi, Bgo)); \ - MLD_XOREQ256(Ce, E##ge); \ - MLD_XOREQ256(A##si, Di); \ - MLD_ROL64IN256(Bgu, A##si, 61); \ - E##gi = MLD_XOR256(Bgi, MLD_ANDNU256(Bgo, Bgu)); \ - MLD_XOREQ256(Ci, E##gi); \ - E##go = MLD_XOR256(Bgo, MLD_ANDNU256(Bgu, Bga)); \ - MLD_XOREQ256(Co, E##go); \ - E##gu = MLD_XOR256(Bgu, MLD_ANDNU256(Bga, Bge)); \ - MLD_XOREQ256(Cu, E##gu); \ - \ - MLD_XOREQ256(A##be, De); \ - MLD_ROL64IN256(Bka, A##be, 1); \ - MLD_XOREQ256(A##gi, Di); \ - MLD_ROL64IN256(Bke, A##gi, 6); \ - MLD_XOREQ256(A##ko, Do); \ - MLD_ROL64IN256(Bki, A##ko, 25); \ - E##ka = MLD_XOR256(Bka, MLD_ANDNU256(Bke, Bki)); \ - MLD_XOREQ256(Ca, E##ka); \ - MLD_XOREQ256(A##mu, Du); \ - MLD_ROL64IN256_8(Bko, A##mu); \ - E##ke = MLD_XOR256(Bke, MLD_ANDNU256(Bki, Bko)); \ - MLD_XOREQ256(Ce, E##ke); \ - MLD_XOREQ256(A##sa, Da); \ - MLD_ROL64IN256(Bku, A##sa, 18); \ - E##ki = MLD_XOR256(Bki, MLD_ANDNU256(Bko, Bku)); \ - MLD_XOREQ256(Ci, E##ki); \ - E##ko = MLD_XOR256(Bko, MLD_ANDNU256(Bku, Bka)); \ - MLD_XOREQ256(Co, E##ko); \ - E##ku = MLD_XOR256(Bku, MLD_ANDNU256(Bka, Bke)); \ - MLD_XOREQ256(Cu, E##ku); \ - \ - MLD_XOREQ256(A##bu, Du); \ - 
MLD_ROL64IN256(Bma, A##bu, 27); \ - MLD_XOREQ256(A##ga, Da); \ - MLD_ROL64IN256(Bme, A##ga, 36); \ - MLD_XOREQ256(A##ke, De); \ - MLD_ROL64IN256(Bmi, A##ke, 10); \ - E##ma = MLD_XOR256(Bma, MLD_ANDNU256(Bme, Bmi)); \ - MLD_XOREQ256(Ca, E##ma); \ - MLD_XOREQ256(A##mi, Di); \ - MLD_ROL64IN256(Bmo, A##mi, 15); \ - E##me = MLD_XOR256(Bme, MLD_ANDNU256(Bmi, Bmo)); \ - MLD_XOREQ256(Ce, E##me); \ - MLD_XOREQ256(A##so, Do); \ - MLD_ROL64IN256_56(Bmu, A##so); \ - E##mi = MLD_XOR256(Bmi, MLD_ANDNU256(Bmo, Bmu)); \ - MLD_XOREQ256(Ci, E##mi); \ - E##mo = MLD_XOR256(Bmo, MLD_ANDNU256(Bmu, Bma)); \ - MLD_XOREQ256(Co, E##mo); \ - E##mu = MLD_XOR256(Bmu, MLD_ANDNU256(Bma, Bme)); \ - MLD_XOREQ256(Cu, E##mu); \ - \ - MLD_XOREQ256(A##bi, Di); \ - MLD_ROL64IN256(Bsa, A##bi, 62); \ - MLD_XOREQ256(A##go, Do); \ - MLD_ROL64IN256(Bse, A##go, 55); \ - MLD_XOREQ256(A##ku, Du); \ - MLD_ROL64IN256(Bsi, A##ku, 39); \ - E##sa = MLD_XOR256(Bsa, MLD_ANDNU256(Bse, Bsi)); \ - MLD_XOREQ256(Ca, E##sa); \ - MLD_XOREQ256(A##ma, Da); \ - MLD_ROL64IN256(Bso, A##ma, 41); \ - E##se = MLD_XOR256(Bse, MLD_ANDNU256(Bsi, Bso)); \ - MLD_XOREQ256(Ce, E##se); \ - MLD_XOREQ256(A##se, De); \ - MLD_ROL64IN256(Bsu, A##se, 2); \ - E##si = MLD_XOR256(Bsi, MLD_ANDNU256(Bso, Bsu)); \ - MLD_XOREQ256(Ci, E##si); \ - E##so = MLD_XOR256(Bso, MLD_ANDNU256(Bsu, Bsa)); \ - MLD_XOREQ256(Co, E##so); \ - E##su = MLD_XOR256(Bsu, MLD_ANDNU256(Bsa, Bse)); \ - MLD_XOREQ256(Cu, E##su); - - -/* - * --- Theta Rho Pi Chi Iota - * --- 64-bit lanes mapped to 64-bit words - */ -#define MLD_thetaRhoPiChiIota(i, A, E) \ - MLD_ROL64IN256(Ce1, Ce, 1); \ - Da = MLD_XOR256(Cu, Ce1); \ - MLD_ROL64IN256(Ci1, Ci, 1); \ - De = MLD_XOR256(Ca, Ci1); \ - MLD_ROL64IN256(Co1, Co, 1); \ - Di = MLD_XOR256(Ce, Co1); \ - MLD_ROL64IN256(Cu1, Cu, 1); \ - Do = MLD_XOR256(Ci, Cu1); \ - MLD_ROL64IN256(Ca1, Ca, 1); \ - Du = MLD_XOR256(Co, Ca1); \ - \ - MLD_XOREQ256(A##ba, Da); \ - Bba = A##ba; \ - MLD_XOREQ256(A##ge, De); \ - MLD_ROL64IN256(Bbe, A##ge, 44); \ - 
MLD_XOREQ256(A##ki, Di); \ - MLD_ROL64IN256(Bbi, A##ki, 43); \ - E##ba = MLD_XOR256(Bba, MLD_ANDNU256(Bbe, Bbi)); \ - MLD_XOREQ256(E##ba, MLD_CONST256_64(mld_keccakf1600RoundConstants[i])); \ - MLD_XOREQ256(A##mo, Do); \ - MLD_ROL64IN256(Bbo, A##mo, 21); \ - E##be = MLD_XOR256(Bbe, MLD_ANDNU256(Bbi, Bbo)); \ - MLD_XOREQ256(A##su, Du); \ - MLD_ROL64IN256(Bbu, A##su, 14); \ - E##bi = MLD_XOR256(Bbi, MLD_ANDNU256(Bbo, Bbu)); \ - E##bo = MLD_XOR256(Bbo, MLD_ANDNU256(Bbu, Bba)); \ - E##bu = MLD_XOR256(Bbu, MLD_ANDNU256(Bba, Bbe)); \ - \ - MLD_XOREQ256(A##bo, Do); \ - MLD_ROL64IN256(Bga, A##bo, 28); \ - MLD_XOREQ256(A##gu, Du); \ - MLD_ROL64IN256(Bge, A##gu, 20); \ - MLD_XOREQ256(A##ka, Da); \ - MLD_ROL64IN256(Bgi, A##ka, 3); \ - E##ga = MLD_XOR256(Bga, MLD_ANDNU256(Bge, Bgi)); \ - MLD_XOREQ256(A##me, De); \ - MLD_ROL64IN256(Bgo, A##me, 45); \ - E##ge = MLD_XOR256(Bge, MLD_ANDNU256(Bgi, Bgo)); \ - MLD_XOREQ256(A##si, Di); \ - MLD_ROL64IN256(Bgu, A##si, 61); \ - E##gi = MLD_XOR256(Bgi, MLD_ANDNU256(Bgo, Bgu)); \ - E##go = MLD_XOR256(Bgo, MLD_ANDNU256(Bgu, Bga)); \ - E##gu = MLD_XOR256(Bgu, MLD_ANDNU256(Bga, Bge)); \ - \ - MLD_XOREQ256(A##be, De); \ - MLD_ROL64IN256(Bka, A##be, 1); \ - MLD_XOREQ256(A##gi, Di); \ - MLD_ROL64IN256(Bke, A##gi, 6); \ - MLD_XOREQ256(A##ko, Do); \ - MLD_ROL64IN256(Bki, A##ko, 25); \ - E##ka = MLD_XOR256(Bka, MLD_ANDNU256(Bke, Bki)); \ - MLD_XOREQ256(A##mu, Du); \ - MLD_ROL64IN256_8(Bko, A##mu); \ - E##ke = MLD_XOR256(Bke, MLD_ANDNU256(Bki, Bko)); \ - MLD_XOREQ256(A##sa, Da); \ - MLD_ROL64IN256(Bku, A##sa, 18); \ - E##ki = MLD_XOR256(Bki, MLD_ANDNU256(Bko, Bku)); \ - E##ko = MLD_XOR256(Bko, MLD_ANDNU256(Bku, Bka)); \ - E##ku = MLD_XOR256(Bku, MLD_ANDNU256(Bka, Bke)); \ - \ - MLD_XOREQ256(A##bu, Du); \ - MLD_ROL64IN256(Bma, A##bu, 27); \ - MLD_XOREQ256(A##ga, Da); \ - MLD_ROL64IN256(Bme, A##ga, 36); \ - MLD_XOREQ256(A##ke, De); \ - MLD_ROL64IN256(Bmi, A##ke, 10); \ - E##ma = MLD_XOR256(Bma, MLD_ANDNU256(Bme, Bmi)); \ - MLD_XOREQ256(A##mi, Di); \ - 
MLD_ROL64IN256(Bmo, A##mi, 15); \ - E##me = MLD_XOR256(Bme, MLD_ANDNU256(Bmi, Bmo)); \ - MLD_XOREQ256(A##so, Do); \ - MLD_ROL64IN256_56(Bmu, A##so); \ - E##mi = MLD_XOR256(Bmi, MLD_ANDNU256(Bmo, Bmu)); \ - E##mo = MLD_XOR256(Bmo, MLD_ANDNU256(Bmu, Bma)); \ - E##mu = MLD_XOR256(Bmu, MLD_ANDNU256(Bma, Bme)); \ - \ - MLD_XOREQ256(A##bi, Di); \ - MLD_ROL64IN256(Bsa, A##bi, 62); \ - MLD_XOREQ256(A##go, Do); \ - MLD_ROL64IN256(Bse, A##go, 55); \ - MLD_XOREQ256(A##ku, Du); \ - MLD_ROL64IN256(Bsi, A##ku, 39); \ - E##sa = MLD_XOR256(Bsa, MLD_ANDNU256(Bse, Bsi)); \ - MLD_XOREQ256(A##ma, Da); \ - MLD_ROL64IN256(Bso, A##ma, 41); \ - E##se = MLD_XOR256(Bse, MLD_ANDNU256(Bsi, Bso)); \ - MLD_XOREQ256(A##se, De); \ - MLD_ROL64IN256(Bsu, A##se, 2); \ - E##si = MLD_XOR256(Bsi, MLD_ANDNU256(Bso, Bsu)); \ - E##so = MLD_XOR256(Bso, MLD_ANDNU256(Bsu, Bsa)); \ - E##su = MLD_XOR256(Bsu, MLD_ANDNU256(Bsa, Bse)); - - -static MLD_ALIGN const uint64_t mld_keccakf1600RoundConstants[24] = { - (uint64_t)0x0000000000000001ULL, (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, (uint64_t)0x8000000080008008ULL}; - - -#define MLD_COPY_FROM_STATE(X, state) \ - do \ - { \ - const uint64_t *state64 = (const uint64_t *)(state); \ - __m256i _idx = \ - _mm256_set_epi64x((long long)&state64[75], (long long)&state64[50], \ 
- (long long)&state64[25], (long long)&state64[0]); \ - X##ba = _mm256_i64gather_epi64((long long *)(0 * 8), _idx, 1); \ - X##be = _mm256_i64gather_epi64((long long *)(1 * 8), _idx, 1); \ - X##bi = _mm256_i64gather_epi64((long long *)(2 * 8), _idx, 1); \ - X##bo = _mm256_i64gather_epi64((long long *)(3 * 8), _idx, 1); \ - X##bu = _mm256_i64gather_epi64((long long *)(4 * 8), _idx, 1); \ - X##ga = _mm256_i64gather_epi64((long long *)(5 * 8), _idx, 1); \ - X##ge = _mm256_i64gather_epi64((long long *)(6 * 8), _idx, 1); \ - X##gi = _mm256_i64gather_epi64((long long *)(7 * 8), _idx, 1); \ - X##go = _mm256_i64gather_epi64((long long *)(8 * 8), _idx, 1); \ - X##gu = _mm256_i64gather_epi64((long long *)(9 * 8), _idx, 1); \ - X##ka = _mm256_i64gather_epi64((long long *)(10 * 8), _idx, 1); \ - X##ke = _mm256_i64gather_epi64((long long *)(11 * 8), _idx, 1); \ - X##ki = _mm256_i64gather_epi64((long long *)(12 * 8), _idx, 1); \ - X##ko = _mm256_i64gather_epi64((long long *)(13 * 8), _idx, 1); \ - X##ku = _mm256_i64gather_epi64((long long *)(14 * 8), _idx, 1); \ - X##ma = _mm256_i64gather_epi64((long long *)(15 * 8), _idx, 1); \ - X##me = _mm256_i64gather_epi64((long long *)(16 * 8), _idx, 1); \ - X##mi = _mm256_i64gather_epi64((long long *)(17 * 8), _idx, 1); \ - X##mo = _mm256_i64gather_epi64((long long *)(18 * 8), _idx, 1); \ - X##mu = _mm256_i64gather_epi64((long long *)(19 * 8), _idx, 1); \ - X##sa = _mm256_i64gather_epi64((long long *)(20 * 8), _idx, 1); \ - X##se = _mm256_i64gather_epi64((long long *)(21 * 8), _idx, 1); \ - X##si = _mm256_i64gather_epi64((long long *)(22 * 8), _idx, 1); \ - X##so = _mm256_i64gather_epi64((long long *)(23 * 8), _idx, 1); \ - X##su = _mm256_i64gather_epi64((long long *)(24 * 8), _idx, 1); \ - } while (0); - -#define MLD_SCATTER_STORE256(state, idx, v) \ - do \ - { \ - const uint64_t *state64 = (const uint64_t *)(state); \ - __m128d t = _mm_castsi128_pd(_mm256_castsi256_si128((v))); \ - _mm_storel_pd((double *)&state64[0 + (idx)], t); \ - 
_mm_storeh_pd((double *)&state64[25 + (idx)], t); \ - t = _mm_castsi128_pd(_mm256_extracti128_si256((v), 1)); \ - _mm_storel_pd((double *)&state64[50 + (idx)], t); \ - _mm_storeh_pd((double *)&state64[75 + (idx)], t); \ - } while (0) - -#define MLD_COPY_TO_STATE(state, X) \ - MLD_SCATTER_STORE256(state, 0, X##ba); \ - MLD_SCATTER_STORE256(state, 1, X##be); \ - MLD_SCATTER_STORE256(state, 2, X##bi); \ - MLD_SCATTER_STORE256(state, 3, X##bo); \ - MLD_SCATTER_STORE256(state, 4, X##bu); \ - MLD_SCATTER_STORE256(state, 5, X##ga); \ - MLD_SCATTER_STORE256(state, 6, X##ge); \ - MLD_SCATTER_STORE256(state, 7, X##gi); \ - MLD_SCATTER_STORE256(state, 8, X##go); \ - MLD_SCATTER_STORE256(state, 9, X##gu); \ - MLD_SCATTER_STORE256(state, 10, X##ka); \ - MLD_SCATTER_STORE256(state, 11, X##ke); \ - MLD_SCATTER_STORE256(state, 12, X##ki); \ - MLD_SCATTER_STORE256(state, 13, X##ko); \ - MLD_SCATTER_STORE256(state, 14, X##ku); \ - MLD_SCATTER_STORE256(state, 15, X##ma); \ - MLD_SCATTER_STORE256(state, 16, X##me); \ - MLD_SCATTER_STORE256(state, 17, X##mi); \ - MLD_SCATTER_STORE256(state, 18, X##mo); \ - MLD_SCATTER_STORE256(state, 19, X##mu); \ - MLD_SCATTER_STORE256(state, 20, X##sa); \ - MLD_SCATTER_STORE256(state, 21, X##se); \ - MLD_SCATTER_STORE256(state, 22, X##si); \ - MLD_SCATTER_STORE256(state, 23, X##so); \ - MLD_SCATTER_STORE256(state, 24, X##su); - -#define MLD_COPY_STATE_VARIABLES(X, Y) \ - X##ba = Y##ba; \ - X##be = Y##be; \ - X##bi = Y##bi; \ - X##bo = Y##bo; \ - X##bu = Y##bu; \ - X##ga = Y##ga; \ - X##ge = Y##ge; \ - X##gi = Y##gi; \ - X##go = Y##go; \ - X##gu = Y##gu; \ - X##ka = Y##ka; \ - X##ke = Y##ke; \ - X##ki = Y##ki; \ - X##ko = Y##ko; \ - X##ku = Y##ku; \ - X##ma = Y##ma; \ - X##me = Y##me; \ - X##mi = Y##mi; \ - X##mo = Y##mo; \ - X##mu = Y##mu; \ - X##sa = Y##sa; \ - X##se = Y##se; \ - X##si = Y##si; \ - X##so = Y##so; \ - X##su = Y##su; - -/* clang-format off */ -#define MLD_ROUNDS24 \ - MLD_prepareTheta \ - MLD_thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ 
- MLD_thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta(11, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - MLD_thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - MLD_thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - MLD_thetaRhoPiChiIota(23, E, A) -/* clang-format on */ - -void mld_keccakf1600x4_permute24(void *states) -{ - __m256i *statesAsLanes = (__m256i *)states; - MLD_DECLARE_ABCDE MLD_COPY_FROM_STATE(A, statesAsLanes) - MLD_ROUNDS24 MLD_COPY_TO_STATE(statesAsLanes, A) -} - -#else /* MLD_FIPS202_X86_64_XKCP && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ - -MLD_EMPTY_CU(fips202_avx2_keccakx4) - -#endif /* !(MLD_FIPS202_X86_64_XKCP && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ -#undef MLD_ANDNU256 -#undef MLD_CONST256 -#undef MLD_CONST256_64 -#undef MLD_ROL64IN256 -#undef MLD_ROL64IN256_8 -#undef MLD_ROL64IN256_56 -#undef MLD_STORE256 -#undef MLD_XOR256 -#undef MLD_XOREQ256 -#undef MLD_SNP_LANELENGTHINBYTES -#undef MLD_DECLARE_ABCDE -#undef MLD_prepareTheta -#undef MLD_thetaRhoPiChiIotaPrepareTheta -#undef MLD_thetaRhoPiChiIota -#undef MLD_COPY_FROM_STATE -#undef MLD_SCATTER_STORE256 -#undef MLD_COPY_TO_STATE -#undef MLD_COPY_STATE_VARIABLES -#undef MLD_ROUNDS24 diff --git a/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h b/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h deleted file mode 100644 index 3499d38e2..000000000 --- a/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) The mlkem-native project authors - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -#ifndef MLD_FIPS202_NATIVE_X86_64_SRC_KECCAKP_1600_TIMES4_SIMD256_H -#define MLD_FIPS202_NATIVE_X86_64_SRC_KECCAKP_1600_TIMES4_SIMD256_H - -#include "../../../../common.h" - -#define mld_keccakf1600x4_permute24 \ - MLD_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) -void mld_keccakf1600x4_permute24(void *states); - -#endif /* !MLD_FIPS202_NATIVE_X86_64_SRC_KECCAKP_1600_TIMES4_SIMD256_H */ diff --git a/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h b/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h new file mode 100644 index 000000000..b84b33bd3 --- /dev/null +++ b/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_X86_64_SRC_FIPS202_NATIVE_X86_64_H +#define MLD_FIPS202_NATIVE_X86_64_SRC_FIPS202_NATIVE_X86_64_H + +#include "../../../../cbmc.h" +#include 
"../../../../common.h" + +/* TODO: Reconsider whether this check is needed -- x86_64 is always + * little-endian, so the backend selection already implies this. */ +#ifndef MLD_SYS_LITTLE_ENDIAN +#error Expecting a little-endian platform +#endif + +#define mld_keccakf1600_round_constants \ + MLD_NAMESPACE(keccakf1600_round_constants) +extern const uint64_t mld_keccakf1600_round_constants[]; + +#define mld_keccak_rho8 MLD_NAMESPACE(keccak_rho8) +extern const uint64_t mld_keccak_rho8[]; + +#define mld_keccak_rho56 MLD_NAMESPACE(keccak_rho56) +extern const uint64_t mld_keccak_rho56[]; + +#define mld_keccak_f1600_x4_avx2 MLD_NAMESPACE(keccak_f1600_x4_avx2) +void mld_keccak_f1600_x4_avx2(uint64_t states[100], const uint64_t rc[24], + const uint64_t rho8[4], const uint64_t rho56[4]) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2.ml */ +__contract__( + requires(memory_no_alias(states, sizeof(uint64_t) * 25 * 4)) + requires(rc == mld_keccakf1600_round_constants) + requires(rho8 == mld_keccak_rho8) + requires(rho56 == mld_keccak_rho56) + assigns(memory_slice(states, sizeof(uint64_t) * 25 * 4)) +); + +#endif /* !MLD_FIPS202_NATIVE_X86_64_SRC_FIPS202_NATIVE_X86_64_H */ diff --git a/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2.S b/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2.S new file mode 100644 index 000000000..f1ebd86c3 --- /dev/null +++ b/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2.S @@ -0,0 +1,451 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../../common.h" + +#if defined(MLD_FIPS202_X86_64_NEED_X4_AVX2) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/fips202/x86_64/src/keccak_f1600_x4_avx2.S using scripts/simpasm. Do not modify it directly. 
+ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",@progbits +#endif + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_avx2) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_avx2) + + .cfi_startproc + subq $0x300, %rsp # imm = 0x300 + .cfi_adjust_cfa_offset 0x300 + vmovdqu (%rdi), %ymm0 + vmovdqu 0xc8(%rdi), %ymm3 + vmovdqu 0x190(%rdi), %ymm1 + vmovdqu 0x258(%rdi), %ymm4 + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 # ymm7 = ymm2[0,1],ymm3[0,1] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vmovdqu 0x278(%rdi), %ymm4 + vmovdqu %ymm3, 0x40(%rsp) + vperm2i128 $0x31, %ymm1, %ymm0, %ymm3 # ymm3 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm7, (%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu 0x20(%rdi), %ymm0 + vmovdqu 0x1b0(%rdi), %ymm1 + vmovdqu %ymm3, 0x60(%rsp) + vmovdqu 0xe8(%rdi), %ymm3 + vmovdqu %ymm7, 0x20(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 # ymm7 = ymm2[0,1],ymm3[0,1] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vmovdqu 0x298(%rdi), %ymm4 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm14 # ymm14 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm7, 0x80(%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu 0x40(%rdi), %ymm0 + vmovdqu 0x1d0(%rdi), %ymm1 + vmovdqu %ymm3, 0xc0(%rsp) + vmovdqu 0x108(%rdi), %ymm3 + vmovdqu %ymm14, %ymm10 + vmovdqu %ymm7, 0xa0(%rsp) + vpunpcklqdq 
%ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm11 # ymm11 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu %ymm3, 0x100(%rsp) + vperm2i128 $0x31, %ymm1, %ymm0, %ymm8 # ymm8 = ymm0[2,3],ymm1[2,3] + vmovdqu 0x128(%rdi), %ymm3 + vmovdqu 0x60(%rdi), %ymm0 + vmovdqu 0x1f0(%rdi), %ymm1 + vmovdqu %ymm7, 0xe0(%rsp) + vmovdqu %ymm11, %ymm14 + vmovdqu 0x2b8(%rdi), %ymm4 + vmovdqu 0x2f8(%rdi), %ymm5 + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vmovdqu 0x2d8(%rdi), %ymm4 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm15 # ymm15 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm9 # ymm9 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm3, 0x140(%rsp) + vmovdqu 0x80(%rdi), %ymm0 + vmovdqu 0x148(%rdi), %ymm3 + vmovdqu 0x210(%rdi), %ymm1 + vmovdqu %ymm7, 0x120(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 # ymm7 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 # ymm13 = ymm2[2,3],ymm3[2,3] + vperm2i128 
$0x31, %ymm1, %ymm0, %ymm3 # ymm3 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm7, 0x160(%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu 0xa0(%rdi), %ymm0 + vmovdqu 0x230(%rdi), %ymm1 + vmovdqu %ymm3, 0x1a0(%rsp) + vmovdqu 0x168(%rdi), %ymm3 + vpunpcklqdq %ymm5, %ymm1, %ymm4 # ymm4 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] + vmovdqu %ymm7, 0x180(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vperm2i128 $0x20, %ymm4, %ymm2, %ymm12 # ymm12 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x20, %ymm1, %ymm0, %ymm3 # ymm3 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm7 # ymm7 = ymm2[2,3],ymm4[2,3] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm4 # ymm4 = ymm0[2,3],ymm1[2,3] + vmovq 0x250(%rdi), %xmm0 + vmovq 0xc0(%rdi), %xmm1 + vmovdqu %ymm12, 0x1c0(%rsp) + vmovdqu %ymm4, 0x1e0(%rsp) + vpinsrq $0x1, 0x318(%rdi), %xmm0, %xmm0 + vpinsrq $0x1, 0x188(%rdi), %xmm1, %xmm1 + vinserti128 $0x1, %xmm0, %ymm1, %ymm2 + movq $0x0, %r10 + +LLkeccak_f1600_x4_avx2: + vmovdqu 0xa0(%rsp), %ymm4 + vpxor 0x1c0(%rsp), %ymm9, %ymm0 + vmovdqu %ymm9, 0x200(%rsp) + vmovdqu %ymm10, %ymm9 + vmovdqu 0xc0(%rsp), %ymm11 + vmovdqu 0x160(%rsp), %ymm12 + vmovdqu %ymm3, 0x240(%rsp) + vpxor 0x100(%rsp), %ymm4, %ymm1 + vmovdqu 0x40(%rsp), %ymm10 + vmovdqu %ymm4, 0x220(%rsp) + vpxor %ymm3, %ymm12, %ymm12 + vmovdqu 0x20(%rsp), %ymm6 + vmovdqu 0x140(%rsp), %ymm4 + vmovdqu %ymm14, 0x2a0(%rsp) + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm8, %ymm11, %ymm1 + vpxor 0x180(%rsp), %ymm7, %ymm11 + vmovdqu %ymm10, 0x280(%rsp) + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm15, %ymm9, %ymm1 + vmovdqu 0xe0(%rsp), %ymm3 + vmovdqu %ymm8, 0x260(%rsp) + vpxor %ymm1, %ymm11, %ymm11 + vpxor 0x120(%rsp), %ymm14, %ymm1 + vpxor %ymm6, %ymm12, %ymm12 + vmovdqu 0x60(%rsp), %ymm8 + vpxor %ymm10, %ymm11, %ymm11 + vpxor 0x1e0(%rsp), %ymm13, %ymm10 + 
vpxor %ymm4, %ymm3, %ymm3 + vmovdqu %ymm4, 0x2c0(%rsp) + vpsrlq $0x3f, %ymm12, %ymm4 + vpsrlq $0x3f, %ymm11, %ymm5 + vpxor (%rsp), %ymm0, %ymm0 + vpxor %ymm1, %ymm10, %ymm10 + vmovdqu 0x80(%rsp), %ymm1 + vpxor %ymm8, %ymm10, %ymm10 + vmovdqu %ymm1, %ymm14 + vpxor 0x1a0(%rsp), %ymm2, %ymm1 + vmovdqu %ymm14, 0x2e0(%rsp) + vpxor %ymm3, %ymm1, %ymm1 + vpsllq $0x1, %ymm12, %ymm3 + vpor %ymm4, %ymm3, %ymm3 + vpsllq $0x1, %ymm11, %ymm4 + vpxor %ymm14, %ymm1, %ymm1 + vpor %ymm5, %ymm4, %ymm4 + vpsrlq $0x3f, %ymm10, %ymm14 + vpxor %ymm1, %ymm3, %ymm3 + vpsllq $0x1, %ymm10, %ymm5 + vpxor %ymm0, %ymm4, %ymm4 + vpor %ymm14, %ymm5, %ymm5 + vpxor %ymm6, %ymm4, %ymm6 + vpxor %ymm12, %ymm5, %ymm5 + vpsrlq $0x3f, %ymm1, %ymm12 + vpsllq $0x1, %ymm1, %ymm1 + vpxor %ymm7, %ymm5, %ymm7 + vpxor %ymm9, %ymm5, %ymm9 + vpor %ymm12, %ymm1, %ymm1 + vpxor (%rsp), %ymm3, %ymm12 + vpxor %ymm11, %ymm1, %ymm1 + vpsrlq $0x3f, %ymm0, %ymm11 + vpsllq $0x1, %ymm0, %ymm0 + vpxor %ymm13, %ymm1, %ymm13 + vpxor %ymm8, %ymm1, %ymm8 + vpor %ymm11, %ymm0, %ymm0 + vpxor %ymm10, %ymm0, %ymm0 + vpxor 0xc0(%rsp), %ymm4, %ymm10 + vpxor %ymm2, %ymm0, %ymm2 + vpsrlq $0x14, %ymm10, %ymm11 + vpsllq $0x2c, %ymm10, %ymm10 + vpor %ymm11, %ymm10, %ymm10 + vpxor %ymm15, %ymm5, %ymm11 + vpbroadcastq (%rsi), %ymm15 + vpsrlq $0x15, %ymm11, %ymm14 + vpsllq $0x2b, %ymm11, %ymm11 + vpor %ymm14, %ymm11, %ymm11 + vpandn %ymm11, %ymm10, %ymm14 + vpxor %ymm15, %ymm14, %ymm14 + vpxor %ymm12, %ymm14, %ymm15 + vpsrlq $0x2b, %ymm13, %ymm14 + vpsllq $0x15, %ymm13, %ymm13 + vmovdqu %ymm15, (%rsp) + vpor %ymm14, %ymm13, %ymm13 + vpandn %ymm13, %ymm11, %ymm14 + vpxor %ymm10, %ymm14, %ymm15 + vpsrlq $0x32, %ymm2, %ymm14 + vpsllq $0xe, %ymm2, %ymm2 + vmovdqu %ymm15, 0x20(%rsp) + vpor %ymm14, %ymm2, %ymm2 + vpandn %ymm2, %ymm13, %ymm14 + vpxor %ymm11, %ymm14, %ymm11 + vmovdqu %ymm11, 0x40(%rsp) + vpandn %ymm12, %ymm2, %ymm11 + vpandn %ymm10, %ymm12, %ymm12 + vpxor %ymm13, %ymm11, %ymm11 + vmovdqu %ymm11, 0x60(%rsp) + vpxor %ymm2, %ymm12, 
%ymm11 + vpsrlq $0x24, %ymm8, %ymm2 + vpsllq $0x1c, %ymm8, %ymm8 + vmovdqu %ymm11, 0x80(%rsp) + vpor %ymm2, %ymm8, %ymm8 + vpxor 0xe0(%rsp), %ymm0, %ymm2 + vpsrlq $0x2c, %ymm2, %ymm10 + vpsllq $0x14, %ymm2, %ymm2 + vpor %ymm10, %ymm2, %ymm2 + vpxor 0x100(%rsp), %ymm3, %ymm10 + vpsrlq $0x3d, %ymm10, %ymm11 + vpsllq $0x3, %ymm10, %ymm10 + vpor %ymm11, %ymm10, %ymm10 + vpandn %ymm10, %ymm2, %ymm11 + vpxor %ymm8, %ymm11, %ymm11 + vmovdqu %ymm11, 0xa0(%rsp) + vpxor 0x160(%rsp), %ymm4, %ymm11 + vpsrlq $0x13, %ymm11, %ymm12 + vpsllq $0x2d, %ymm11, %ymm11 + vpor %ymm12, %ymm11, %ymm11 + vpandn %ymm11, %ymm10, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vmovdqu %ymm12, 0xc0(%rsp) + vpsrlq $0x3, %ymm7, %ymm12 + vpsllq $0x3d, %ymm7, %ymm7 + vpor %ymm12, %ymm7, %ymm7 + vpandn %ymm7, %ymm11, %ymm12 + vpxor %ymm10, %ymm12, %ymm10 + vpandn %ymm8, %ymm7, %ymm12 + vpandn %ymm2, %ymm8, %ymm8 + vpsrlq $0x3f, %ymm6, %ymm2 + vpsllq $0x1, %ymm6, %ymm6 + vpxor %ymm11, %ymm12, %ymm14 + vpor %ymm2, %ymm6, %ymm6 + vpsrlq $0x3a, %ymm9, %ymm2 + vpxor %ymm7, %ymm8, %ymm12 + vpsllq $0x6, %ymm9, %ymm9 + vmovdqu %ymm12, 0xe0(%rsp) + vpxor 0x1a0(%rsp), %ymm0, %ymm7 + vpor %ymm2, %ymm9, %ymm9 + vpxor 0x120(%rsp), %ymm1, %ymm2 + vpshufb (%rdx), %ymm7, %ymm7 + vpsrlq $0x27, %ymm2, %ymm11 + vpsllq $0x19, %ymm2, %ymm2 + vpor %ymm2, %ymm11, %ymm11 + vpandn %ymm11, %ymm9, %ymm2 + vpandn %ymm7, %ymm11, %ymm8 + vpxor %ymm6, %ymm2, %ymm12 + vpxor 0x1c0(%rsp), %ymm3, %ymm2 + vpxor %ymm9, %ymm8, %ymm8 + vmovdqu %ymm12, 0x100(%rsp) + vpsrlq $0x2e, %ymm2, %ymm12 + vpsllq $0x12, %ymm2, %ymm2 + vpor %ymm2, %ymm12, %ymm2 + vpandn %ymm2, %ymm7, %ymm12 + vpxor %ymm11, %ymm12, %ymm15 + vpandn %ymm6, %ymm2, %ymm11 + vpandn %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm11, %ymm12 + vmovdqu %ymm12, 0x120(%rsp) + vpxor %ymm2, %ymm6, %ymm12 + vpxor 0x2e0(%rsp), %ymm0, %ymm6 + vpxor 0x2c0(%rsp), %ymm0, %ymm0 + vmovdqu %ymm12, 0x140(%rsp) + vpsrlq $0x25, %ymm6, %ymm2 + vpsllq $0x1b, %ymm6, %ymm6 + vpor %ymm6, %ymm2, %ymm2 + vpxor 
0x220(%rsp), %ymm3, %ymm6 + vpxor 0x200(%rsp), %ymm3, %ymm3 + vpsrlq $0x1c, %ymm6, %ymm7 + vpsllq $0x24, %ymm6, %ymm6 + vpor %ymm6, %ymm7, %ymm7 + vpxor 0x260(%rsp), %ymm4, %ymm6 + vpxor 0x240(%rsp), %ymm4, %ymm4 + vpsrlq $0x36, %ymm6, %ymm12 + vpsllq $0xa, %ymm6, %ymm6 + vpor %ymm6, %ymm12, %ymm12 + vpxor 0x180(%rsp), %ymm5, %ymm6 + vpxor 0x280(%rsp), %ymm5, %ymm5 + vpandn %ymm12, %ymm7, %ymm9 + vpsrlq $0x31, %ymm6, %ymm11 + vpsllq $0xf, %ymm6, %ymm6 + vpxor %ymm2, %ymm9, %ymm9 + vpor %ymm6, %ymm11, %ymm11 + vpandn %ymm11, %ymm12, %ymm6 + vpxor %ymm7, %ymm6, %ymm6 + vmovdqu %ymm6, 0x160(%rsp) + vpxor 0x1e0(%rsp), %ymm1, %ymm6 + vpxor 0x2a0(%rsp), %ymm1, %ymm1 + vpshufb (%rcx), %ymm6, %ymm6 + vpandn %ymm6, %ymm11, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vmovdqu %ymm13, 0x180(%rsp) + vpandn %ymm2, %ymm6, %ymm13 + vpandn %ymm7, %ymm2, %ymm2 + vpxor %ymm6, %ymm2, %ymm2 + vpsrlq $0x3e, %ymm4, %ymm6 + vpxor %ymm11, %ymm13, %ymm13 + vmovdqu %ymm2, 0x1a0(%rsp) + vpsrlq $0x2, %ymm5, %ymm2 + vpsllq $0x3e, %ymm5, %ymm5 + vpor %ymm5, %ymm2, %ymm2 + vpsrlq $0x9, %ymm1, %ymm5 + vpsllq $0x37, %ymm1, %ymm1 + vpsllq $0x2, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm1 + vpsrlq $0x19, %ymm0, %ymm5 + vpor %ymm4, %ymm6, %ymm4 + vpsllq $0x27, %ymm0, %ymm0 + vpor %ymm0, %ymm5, %ymm5 + vpandn %ymm5, %ymm1, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vmovdqu %ymm0, 0x1c0(%rsp) + vpsrlq $0x17, %ymm3, %ymm0 + vpsllq $0x29, %ymm3, %ymm3 + vpor %ymm3, %ymm0, %ymm0 + vpandn %ymm4, %ymm0, %ymm7 + vpandn %ymm0, %ymm5, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpandn %ymm2, %ymm4, %ymm5 + vpandn %ymm1, %ymm2, %ymm2 + vpxor %ymm0, %ymm5, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm4, %ymm2, %ymm2 + vmovdqu %ymm5, 0x1e0(%rsp) + addq $0x8, %rsi + addq $0x1, %r10 + cmpq $0x18, %r10 + jne LLkeccak_f1600_x4_avx2 + vmovdqu (%rsp), %ymm4 + vmovdqu 0x40(%rsp), %ymm5 + vmovdqu 0x20(%rsp), %ymm0 + vmovdqu 0x60(%rsp), %ymm1 + vmovdqu 0x1c0(%rsp), %ymm12 + vmovdqu %ymm2, 0x1c0(%rsp) + vpunpcklqdq %ymm0, %ymm4, %ymm2 # ymm2 = 
ymm4[0],ymm0[0],ymm4[2],ymm0[2] + vpunpckhqdq %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] + vpunpcklqdq %ymm1, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] + vpunpckhqdq %ymm1, %ymm5, %ymm1 # ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vmovdqu 0x80(%rsp), %ymm4 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm5 # ymm5 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm6, (%rdi) + vmovdqu %ymm5, 0xc8(%rdi) + vmovdqu %ymm2, 0x190(%rdi) + vmovdqu %ymm0, 0x258(%rdi) + vmovdqu 0xa0(%rsp), %ymm0 + vpunpcklqdq %ymm0, %ymm4, %ymm2 # ymm2 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] + vpunpckhqdq %ymm0, %ymm4, %ymm1 # ymm1 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] + vmovdqu 0xc0(%rsp), %ymm0 + vpunpcklqdq %ymm10, %ymm0, %ymm4 # ymm4 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 # ymm5 = ymm1[0,1],ymm0[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vmovdqu 0xe0(%rsp), %ymm4 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[2,3],ymm0[2,3] + vmovdqu 0x100(%rsp), %ymm0 + vmovdqu %ymm2, 0x1b0(%rdi) + vmovdqu %ymm1, 0x278(%rdi) + vpunpcklqdq %ymm4, %ymm14, %ymm2 # ymm2 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm14, %ymm1 # ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm0, %ymm4 # ymm4 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] + vmovdqu %ymm6, 0x20(%rdi) + vmovdqu %ymm5, 0xe8(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 # ymm5 = ymm1[0,1],ymm0[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + 
vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[2,3],ymm0[2,3] + vmovdqu 0x120(%rsp), %ymm4 + vmovdqu 0x140(%rsp), %ymm0 + vmovdqu %ymm2, 0x1d0(%rdi) + vmovdqu %ymm1, 0x298(%rdi) + vpunpcklqdq %ymm4, %ymm15, %ymm2 # ymm2 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm15, %ymm1 # ymm1 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] + vpunpcklqdq %ymm9, %ymm0, %ymm4 # ymm4 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] + vmovdqu %ymm5, 0x108(%rdi) + vpunpckhqdq %ymm9, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] + vmovdqu %ymm6, 0x40(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 # ymm5 = ymm1[0,1],ymm0[0,1] + vmovdqu 0x160(%rsp), %ymm4 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[2,3],ymm0[2,3] + vmovdqu 0x180(%rsp), %ymm0 + vmovdqu %ymm5, 0x128(%rdi) + vmovdqu 0x1a0(%rsp), %ymm5 + vmovdqu %ymm2, 0x1f0(%rdi) + vpunpcklqdq %ymm0, %ymm4, %ymm2 # ymm2 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] + vpunpckhqdq %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] + vpunpcklqdq %ymm5, %ymm13, %ymm4 # ymm4 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] + vmovdqu %ymm6, 0x60(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vmovdqu %ymm1, 0x2b8(%rdi) + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vpunpckhqdq %ymm5, %ymm13, %ymm1 # ymm1 = ymm13[1],ymm5[1],ymm13[3],ymm5[3] + vmovdqu %ymm6, 0x80(%rdi) + vmovdqu 0x1e0(%rsp), %ymm4 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm5 # ymm5 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm2, 0x210(%rdi) + vpunpcklqdq %ymm3, %ymm12, %ymm2 # ymm2 = ymm12[0],ymm3[0],ymm12[2],ymm3[2] + vmovdqu %ymm0, 0x2d8(%rdi) + vpunpckhqdq %ymm3, %ymm12, %ymm0 # ymm0 = ymm12[1],ymm3[1],ymm12[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm7, %ymm1 # ymm1 = 
ymm7[1],ymm4[1],ymm7[3],ymm4[3] + vmovdqu %ymm5, 0x148(%rdi) + vperm2i128 $0x20, %ymm3, %ymm2, %ymm5 # ymm5 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm3[2,3] + vmovdqu 0x1c0(%rsp), %ymm3 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm4 # ymm4 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm5, 0xa0(%rdi) + vextracti128 $0x1, %ymm3, %xmm15 + vmovdqu %ymm4, 0x168(%rdi) + vmovdqu %ymm2, 0x230(%rdi) + vmovdqu %ymm0, 0x2f8(%rdi) + vmovq %xmm3, 0xc0(%rdi) + vmovhpd %xmm3, 0x188(%rdi) + vmovq %xmm15, 0x250(%rdi) + vmovhpd %xmm15, 0x318(%rdi) + addq $0x300, %rsp # imm = 0x300 + .cfi_adjust_cfa_offset -0x300 + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(keccak_f1600_x4_avx2) + +#endif /* MLD_FIPS202_X86_64_NEED_X4_AVX2 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c b/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c new file mode 100644 index 000000000..c4be6ccb5 --- /dev/null +++ b/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. 
+ */
+
+#include "../../../../common.h"
+#if defined(MLD_FIPS202_X86_64_NEED_X4_AVX2) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <stdint.h>
+
+#include "fips202_native_x86_64.h"
+
+MLD_ALIGN const uint64_t mld_keccakf1600_round_constants[] = {
+    0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+    0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+    0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+    0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+    0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+    0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+    0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+    0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
+};
+
+MLD_ALIGN const uint64_t mld_keccak_rho8[] = {
+    0x0605040302010007,
+    0x0e0d0c0b0a09080f,
+    0x1615141312111017,
+    0x1e1d1c1b1a19181f,
+};
+
+MLD_ALIGN const uint64_t mld_keccak_rho56[] = {
+    0x0007060504030201,
+    0x080f0e0d0c0b0a09,
+    0x1017161514131211,
+    0x181f1e1d1c1b1a19,
+};
+
+#else /* MLD_FIPS202_X86_64_NEED_X4_AVX2 && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(fips202_x86_64_constants)
+
+#endif /* !(MLD_FIPS202_X86_64_NEED_X4_AVX2 && \
+           !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/proofs/README.md b/proofs/README.md
index 769655461..9103fb4c5 100644
--- a/proofs/README.md
+++ b/proofs/README.md
@@ -10,4 +10,4 @@ We use the [C Bounded Model Checker (CBMC)](https://github.com/diffblue/cbmc) to
 
 ## Assembly verification: HOL-Light
 
-We use the [HOL-Light](https://github.com/jrh13/hol-light) interactive theorem prover alongside the verification infrastructure from [s2n-bignum](https://github.com/awslabs/s2n-bignum) to show the functional correctness of highly optimized assembly routines in mlkem-native at the object-code level. See [proofs/hol_light/x86_64](hol_light/x86_64).
+We use the [HOL-Light](https://github.com/jrh13/hol-light) interactive theorem prover alongside the verification infrastructure from [s2n-bignum](https://github.com/awslabs/s2n-bignum) to show the functional correctness of highly optimized assembly routines in mldsa-native at the object-code level. See [proofs/hol_light](hol_light). diff --git a/proofs/cbmc/keccak_f1600_x4_native_avx2/Makefile b/proofs/cbmc/keccak_f1600_x4_native_avx2/Makefile new file mode 100644 index 000000000..fbffa2f06 --- /dev/null +++ b/proofs/cbmc/keccak_f1600_x4_native_avx2/Makefile @@ -0,0 +1,55 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = keccak_f1600_x4_native_avx2_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = keccak_f1600_x4_native_avx2 + +DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 -DMLD_CONFIG_FIPS202_BACKEND_FILE="\"$(SRCDIR)/mldsa/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h\"" -DMLD_CHECK_APIS +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c + +CHECK_FUNCTION_CONTRACTS=mld_keccak_f1600_x4_native +USE_FUNCTION_CONTRACTS=mld_keccak_f1600_x4_avx2 +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--bitwuzla + +FUNCTION_NAME = keccak_f1600_x4_native_avx2 + +# If this proof is found to consume huge amounts of RAM, you can set the +# EXPENSIVE variable. With new enough versions of the proof tools, this will +# restrict the number of EXPENSIVE CBMC jobs running at once. See the +# documentation in Makefile.common under the "Job Pools" heading for details. 
+# EXPENSIVE = true + +# This function is large enough to need... +CBMC_OBJECT_BITS = 8 + +# If you require access to a file-local ("static") function or object to conduct +# your proof, set the following (and do not include the original source file +# ("mldsa/src/poly.c") in PROJECT_SOURCES). +# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i +# include ../Makefile.common +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mldsa/src/poly.c +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz +# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must +# be set before including Makefile.common, but any use of variables on the +# left-hand side requires those variables to be defined. Hence, _SOURCE, +# _FUNCTIONS, _OBJECTS is set after including Makefile.common. + +include ../Makefile.common diff --git a/proofs/cbmc/keccak_f1600_x4_native_avx2/keccak_f1600_x4_native_avx2_harness.c b/proofs/cbmc/keccak_f1600_x4_native_avx2/keccak_f1600_x4_native_avx2_harness.c new file mode 100644 index 000000000..4d062a0d3 --- /dev/null +++ b/proofs/cbmc/keccak_f1600_x4_native_avx2/keccak_f1600_x4_native_avx2_harness.c @@ -0,0 +1,13 @@ +// Copyright (c) The mldsa-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT-0
+
+#include <stdint.h>
+
+int mld_keccak_f1600_x4_native(uint64_t *state);
+
+void harness(void)
+{
+  uint64_t *s;
+  int t = mld_keccak_f1600_x4_native(s);
+}
diff --git a/proofs/hol_light/README.md b/proofs/hol_light/README.md
index e7df5f400..4e1e2f23c 100644
--- a/proofs/hol_light/README.md
+++ b/proofs/hol_light/README.md
@@ -54,3 +54,5 @@ echo '1+1;;' | nc -w 5 127.0.0.1 2012
 - AArch64 poly_caddq: [mldsa_poly_caddq.S](aarch64/mldsa/mldsa_poly_caddq.S)
 - x86_64 forward NTT: [mldsa_ntt.S](x86_64/mldsa/mldsa_ntt.S)
 - x86_64 inverse NTT: [mldsa_intt.S](x86_64/mldsa/mldsa_intt.S)
+- FIPS202:
+  * 4-fold Keccak-F1600 using AVX2: [keccak_f1600_x4_avx2.S](x86_64/mldsa/keccak_f1600_x4_avx2.S)
diff --git a/proofs/hol_light/aarch64/proofs/keccak_utils.ml b/proofs/hol_light/aarch64/proofs/keccak_utils.ml
new file mode 100644
index 000000000..25c489c26
--- /dev/null
+++ b/proofs/hol_light/aarch64/proofs/keccak_utils.ml
@@ -0,0 +1,92 @@
+(*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+ *)
+
+(* ========================================================================= *)
+(* Keccak utilities for AArch64 proofs.                                      *)
+(* ========================================================================= *)
+
+needs "common/keccak_spec.ml";;
+
+(* ------------------------------------------------------------------------- *)
+(* Some custom normalization for logical equivalence and conjunction, which  *)
+(* is enough to handle the shallow differences in various ways of expressing *)
+(* Keccak-related operations, to avoid the overkill of using a SAT solver.
*) +(* ------------------------------------------------------------------------- *) + +let KECCAK_BITBLAST_TAC = + let IFF_NOT_CONV = + let pth = TAUT + `((~p <=> q) <=> ~(p <=> q)) /\ + ((p <=> ~q) <=> ~(p <=> q))` in + GEN_REWRITE_CONV REDEPTH_CONV [pth; NOT_CLAUSES; EQ_CLAUSES] in + let [conv_e;conv_l;conv_r; conv_1e;conv_1r; + conv_e1;conv_l1;conv_r1; conv_ee; conv_11; conv_t] = + map (fun tm -> GEN_REWRITE_CONV I [TAUT tm]) + [`((p <=> q1) <=> (p <=> q2)) = (q1 <=> q2)`; + `((p1 <=> q1) <=> (p2 <=> q2)) = (p1 <=> (q1 <=> (p2 <=> q2)))`; + `((p1 <=> q1) <=> (p2 <=> q2)) = (p2 <=> (p1 <=> q1) <=> q2)`; + `(p <=> (p <=> q2)) = q2`; + `(p <=> (p2 <=> q2)) = (p2 <=> (p <=> q2))`; + `((p <=> q1) <=> p) = q1`; + `((p1 <=> q1) <=> p) = (p1 <=> (q1 <=> p))`; + `((p1 <=> q1) <=> p) = (p <=> (p1 <=> q1))`; + `(p <=> p) <=> T`; + `(p <=> q) <=> (q <=> p)`; + `(p <=> T) <=> p`] in + let rec IFF_MERGE_CONV tm = + match tm with + Comb(Comb(e,Comb(Comb(Const("=",_),p1),q1)), + Comb(Comb(Const("=",_),p2),q2)) -> + if p1 = p2 then (conv_e THENC IFF_MERGE_CONV) tm + else if p1 < p2 then (conv_l THENC IFF_RAND_CONV) tm + else (conv_r THENC IFF_RAND_CONV) tm + | Comb(Comb(e,p),Comb(Comb(Const("=",_),p2),q2)) -> + if p = p2 then conv_1e tm + else if p < p2 then REFL tm + else (conv_1r THENC IFF_RAND_CONV) tm + | Comb(Comb(e,Comb(Comb(Const("=",_),p1),q1)),p) -> + if p = p1 then conv_e1 tm + else if p1 < p then (conv_l1 THENC IFF_RAND_CONV) tm + else (conv_r1 THENC IFF_RAND_CONV) tm + | Comb(Comb(e,p),q) -> + if p = q then conv_ee tm + else if p < q then REFL tm + else conv_11 tm + | _ -> REFL tm + and IFF_RAND_CONV tm = + let th = RAND_CONV IFF_MERGE_CONV tm in + CONV_RULE(RAND_CONV(TRY_CONV conv_t)) th in + let rec IFF_CANON_CONV tm = + match tm with + Comb(Comb(Const("=",Tyapp("fun",[Tyapp("bool",[]);_])),l),r) -> + (BINOP_CONV IFF_CANON_CONV THENC IFF_MERGE_CONV) tm + | _ -> REFL tm in + let rec IFF_ATOM_CONV conv tm = + match tm with + 
Comb(Comb(Const("=",Tyapp("fun",[Tyapp("bool",[]);_])),l),r) -> + BINOP_CONV (IFF_ATOM_CONV conv) tm + | _ -> conv tm in + let rec AND_ATOM_CONV conv tm = + match tm with + Comb(Comb(Const("/\\",_),l),r) -> + BINOP_CONV (AND_ATOM_CONV conv) tm + | _ -> conv tm in + let rec IFF_NORM_CONV tm = + match tm with + Comb(Comb(Const("/\\",_),l),r) -> + let th = AND_ATOM_CONV IFF_NORM_CONV tm in + CONV_RULE (RAND_CONV CONJ_CANON_CONV) th + | Comb(Comb(Const("=",Tyapp("fun",[Tyapp("bool",[]);_])),l),r) -> + let th = IFF_ATOM_CONV IFF_NORM_CONV tm in + CONV_RULE (RAND_CONV IFF_CANON_CONV) th + | Comb(Const("~",_),l) -> RAND_CONV IFF_NORM_CONV tm + | _ -> REFL tm in + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[WORD_RULE `word_add x x = word_shl x 1`] THEN + BITBLAST_THEN(K ALL_TAC) THEN + CONV_TAC(AND_ATOM_CONV + (BINOP_CONV(IFF_NOT_CONV THENC IFF_NORM_CONV) THENC + GEN_REWRITE_CONV I [REFL_CLAUSE])) THEN + REWRITE_TAC[] THEN NO_TAC;; diff --git a/proofs/hol_light/common/keccak_constants.ml b/proofs/hol_light/common/keccak_constants.ml new file mode 100644 index 000000000..9c652613e --- /dev/null +++ b/proofs/hol_light/common/keccak_constants.ml @@ -0,0 +1,39 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + *) + +(* Keccak-f[1600] round constants RC[i] for i = 0..23. 
*) + +let round_constants = define `round_constants:int64 list = [ + word 0x0000000000000001; + word 0x0000000000008082; + word 0x800000000000808A; + word 0x8000000080008000; + word 0x000000000000808B; + word 0x0000000080000001; + word 0x8000000080008081; + word 0x8000000000008009; + word 0x000000000000008A; + word 0x0000000000000088; + word 0x0000000080008009; + word 0x000000008000000A; + word 0x000000008000808B; + word 0x800000000000008B; + word 0x8000000000008089; + word 0x8000000000008003; + word 0x8000000000008002; + word 0x8000000000000080; + word 0x000000000000800A; + word 0x800000008000000A; + word 0x8000000080008081; + word 0x8000000000008080; + word 0x0000000080000001; + word 0x8000000080008008 +]`;; diff --git a/proofs/hol_light/common/keccak_spec.ml b/proofs/hol_light/common/keccak_spec.ml new file mode 100644 index 000000000..9337fdc6f --- /dev/null +++ b/proofs/hol_light/common/keccak_spec.ml @@ -0,0 +1,168 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Specification of Keccak (https://keccak.team/keccak_specs_summary.html). 
*) +(* ========================================================================= *) + +needs "Library/words.ml";; +needs "common/keccak_constants.ml";; + +(*** Some abbreviations on top of the word library ***) + +parse_as_prefix "~~";; +override_interface("~~",`word_not:N word->N word`);; +parse_as_infix("&&",(13,"right"));; +override_interface("&&",`word_and:N word->N word->N word`);; +parse_as_infix("||",(13,"right"));; +override_interface("^^",`word_xor:N word->N word->N word`);; +parse_as_infix("^^",(13,"right"));; +override_interface("||",`word_or:N word->N word->N word`);; + +(*** An individual round, with input and output lists in row-major order ***) + +let keccak_round = define + `(keccak_round:int64 -> int64 list->int64 list) RCi Alist = + let A00 = EL 0 Alist + and A10 = EL 1 Alist + and A20 = EL 2 Alist + and A30 = EL 3 Alist + and A40 = EL 4 Alist + and A01 = EL 5 Alist + and A11 = EL 6 Alist + and A21 = EL 7 Alist + and A31 = EL 8 Alist + and A41 = EL 9 Alist + and A02 = EL 10 Alist + and A12 = EL 11 Alist + and A22 = EL 12 Alist + and A32 = EL 13 Alist + and A42 = EL 14 Alist + and A03 = EL 15 Alist + and A13 = EL 16 Alist + and A23 = EL 17 Alist + and A33 = EL 18 Alist + and A43 = EL 19 Alist + and A04 = EL 20 Alist + and A14 = EL 21 Alist + and A24 = EL 22 Alist + and A34 = EL 23 Alist + and A44 = EL 24 Alist in + let C0 = A00 ^^ A01 ^^ A02 ^^ A03 ^^ A04 + and C1 = A10 ^^ A11 ^^ A12 ^^ A13 ^^ A14 + and C2 = A20 ^^ A21 ^^ A22 ^^ A23 ^^ A24 + and C3 = A30 ^^ A31 ^^ A32 ^^ A33 ^^ A34 + and C4 = A40 ^^ A41 ^^ A42 ^^ A43 ^^ A44 in + let D0 = C4 ^^ word_rol C1 1 + and D1 = C0 ^^ word_rol C2 1 + and D2 = C1 ^^ word_rol C3 1 + and D3 = C2 ^^ word_rol C4 1 + and D4 = C3 ^^ word_rol C0 1 in + let At00 = A00 ^^ D0 + and At01 = A01 ^^ D0 + and At02 = A02 ^^ D0 + and At03 = A03 ^^ D0 + and At04 = A04 ^^ D0 + and At10 = A10 ^^ D1 + and At11 = A11 ^^ D1 + and At12 = A12 ^^ D1 + and At13 = A13 ^^ D1 + and At14 = A14 ^^ D1 + and At20 = A20 ^^ D2 + and At21 = A21 
^^ D2 + and At22 = A22 ^^ D2 + and At23 = A23 ^^ D2 + and At24 = A24 ^^ D2 + and At30 = A30 ^^ D3 + and At31 = A31 ^^ D3 + and At32 = A32 ^^ D3 + and At33 = A33 ^^ D3 + and At34 = A34 ^^ D3 + and At40 = A40 ^^ D4 + and At41 = A41 ^^ D4 + and At42 = A42 ^^ D4 + and At43 = A43 ^^ D4 + and At44 = A44 ^^ D4 in + let B00 = word_rol At00 0 + and B01 = word_rol At30 28 + and B02 = word_rol At10 1 + and B03 = word_rol At40 27 + and B04 = word_rol At20 62 + and B10 = word_rol At11 44 + and B11 = word_rol At41 20 + and B12 = word_rol At21 6 + and B13 = word_rol At01 36 + and B14 = word_rol At31 55 + and B20 = word_rol At22 43 + and B21 = word_rol At02 3 + and B22 = word_rol At32 25 + and B23 = word_rol At12 10 + and B24 = word_rol At42 39 + and B30 = word_rol At33 21 + and B31 = word_rol At13 45 + and B32 = word_rol At43 8 + and B33 = word_rol At23 15 + and B34 = word_rol At03 41 + and B40 = word_rol At44 14 + and B41 = word_rol At24 61 + and B42 = word_rol At04 18 + and B43 = word_rol At34 56 + and B44 = word_rol At14 2 in + [(B00 ^^ (~~B10 && B20)) ^^ RCi; + B10 ^^ (~~B20 && B30); + B20 ^^ (~~B30 && B40); + B30 ^^ (~~B40 && B00); + B40 ^^ (~~B00 && B10); + B01 ^^ (~~B11 && B21); + B11 ^^ (~~B21 && B31); + B21 ^^ (~~B31 && B41); + B31 ^^ (~~B41 && B01); + B41 ^^ (~~B01 && B11); + B02 ^^ (~~B12 && B22); + B12 ^^ (~~B22 && B32); + B22 ^^ (~~B32 && B42); + B32 ^^ (~~B42 && B02); + B42 ^^ (~~B02 && B12); + B03 ^^ (~~B13 && B23); + B13 ^^ (~~B23 && B33); + B23 ^^ (~~B33 && B43); + B33 ^^ (~~B43 && B03); + B43 ^^ (~~B03 && B13); + B04 ^^ (~~B14 && B24); + B14 ^^ (~~B24 && B34); + B24 ^^ (~~B34 && B44); + B34 ^^ (~~B44 && B04); + B44 ^^ (~~B04 && B14)]`;; + +(*** Hence a recursive definition of n rounds starting from l ***) + +let keccak = define + `keccak 0 l = l /\ + keccak (n + 1) l = keccak_round (EL n round_constants) (keccak n l)`;; + +(* ------------------------------------------------------------------------- *) +(* A few lemmas that are useful when reasoning about Keccak. 
*) +(* ------------------------------------------------------------------------- *) + +let LENGTH_KECCAK = prove + (`!A i. LENGTH A = 25 ==> LENGTH(keccak i A) = 25`, + REWRITE_TAC[RIGHT_FORALL_IMP_THM] THEN GEN_TAC THEN DISCH_TAC THEN + INDUCT_TAC THEN ASM_REWRITE_TAC[keccak; ADD1; keccak_round] THEN + REPEAT LET_TAC THEN CONV_TAC(LAND_CONV LENGTH_CONV) THEN REFL_TAC);; + +let LENGTH_EQ_25 = prove + (`!l:A list. + LENGTH l = 25 <=> + l = [EL 0 l; EL 1 l; EL 2 l; EL 3 l; EL 4 l; + EL 5 l; EL 6 l; EL 7 l; EL 8 l; EL 9 l; + EL 10 l; EL 11 l; EL 12 l; EL 13 l; EL 14 l; + EL 15 l; EL 16 l; EL 17 l; EL 18 l; EL 19 l; + EL 20 l; EL 21 l; EL 22 l; EL 23 l; EL 24 l]`, + GEN_TAC THEN EQ_TAC THENL + [CONV_TAC(LAND_CONV(TOP_DEPTH_CONV num_CONV)) THEN + REWRITE_TAC[LENGTH_EQ_CONS; LENGTH_EQ_NIL] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[CONS_11] THEN + CONV_TAC(ONCE_DEPTH_CONV EL_CONV) THEN REWRITE_TAC[]; + DISCH_THEN SUBST1_TAC THEN REWRITE_TAC[LENGTH] THEN ARITH_TAC]);; diff --git a/proofs/hol_light/x86_64/Makefile b/proofs/hol_light/x86_64/Makefile index 84aaa1d2e..7a6dca0ff 100644 --- a/proofs/hol_light/x86_64/Makefile +++ b/proofs/hol_light/x86_64/Makefile @@ -50,7 +50,8 @@ endif SPLIT=tr ';' '\n' -OBJ = mldsa/mldsa_ntt.o mldsa/mldsa_intt.o +OBJ = mldsa/mldsa_ntt.o mldsa/mldsa_intt.o \ + mldsa/keccak_f1600_x4_avx2.o # Build object files from assembly sources $(OBJ): %.o : %.S diff --git a/proofs/hol_light/x86_64/mldsa/keccak_f1600_x4_avx2.S b/proofs/hol_light/x86_64/mldsa/keccak_f1600_x4_avx2.S new file mode 100644 index 000000000..d0068bfce --- /dev/null +++ b/proofs/hol_light/x86_64/mldsa/keccak_f1600_x4_avx2.S @@ -0,0 +1,451 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/x86_64/src/keccak_f1600_x4_avx2.S using scripts/simpasm. Do not modify it directly. 
+ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",@progbits +#endif + +.text +.balign 4 +#ifdef __APPLE__ +.global _PQCP_MLKEM_NATIVE_MLKEM768_keccak_f1600_x4_avx2 +_PQCP_MLKEM_NATIVE_MLKEM768_keccak_f1600_x4_avx2: +#else +.global PQCP_MLKEM_NATIVE_MLKEM768_keccak_f1600_x4_avx2 +PQCP_MLKEM_NATIVE_MLKEM768_keccak_f1600_x4_avx2: +#endif + + .cfi_startproc + endbr64 + subq $0x300, %rsp # imm = 0x300 + .cfi_adjust_cfa_offset 0x300 + vmovdqu (%rdi), %ymm0 + vmovdqu 0xc8(%rdi), %ymm3 + vmovdqu 0x190(%rdi), %ymm1 + vmovdqu 0x258(%rdi), %ymm4 + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 # ymm7 = ymm2[0,1],ymm3[0,1] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vmovdqu 0x278(%rdi), %ymm4 + vmovdqu %ymm3, 0x40(%rsp) + vperm2i128 $0x31, %ymm1, %ymm0, %ymm3 # ymm3 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm7, (%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu 0x20(%rdi), %ymm0 + vmovdqu 0x1b0(%rdi), %ymm1 + vmovdqu %ymm3, 0x60(%rsp) + vmovdqu 0xe8(%rdi), %ymm3 + vmovdqu %ymm7, 0x20(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 # ymm7 = ymm2[0,1],ymm3[0,1] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vmovdqu 0x298(%rdi), %ymm4 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm14 # ymm14 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm7, 0x80(%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu 
0x40(%rdi), %ymm0 + vmovdqu 0x1d0(%rdi), %ymm1 + vmovdqu %ymm3, 0xc0(%rsp) + vmovdqu 0x108(%rdi), %ymm3 + vmovdqu %ymm14, %ymm10 + vmovdqu %ymm7, 0xa0(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm11 # ymm11 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu %ymm3, 0x100(%rsp) + vperm2i128 $0x31, %ymm1, %ymm0, %ymm8 # ymm8 = ymm0[2,3],ymm1[2,3] + vmovdqu 0x128(%rdi), %ymm3 + vmovdqu 0x60(%rdi), %ymm0 + vmovdqu 0x1f0(%rdi), %ymm1 + vmovdqu %ymm7, 0xe0(%rsp) + vmovdqu %ymm11, %ymm14 + vmovdqu 0x2b8(%rdi), %ymm4 + vmovdqu 0x2f8(%rdi), %ymm5 + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vmovdqu 0x2d8(%rdi), %ymm4 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm15 # ymm15 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm3 # ymm3 = ymm2[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm9 # ymm9 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm3, 0x140(%rsp) + vmovdqu 0x80(%rdi), %ymm0 + vmovdqu 0x148(%rdi), %ymm3 + vmovdqu 0x210(%rdi), %ymm1 + vmovdqu %ymm7, 0x120(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm1, %ymm1 # ymm1 = 
ymm1[1],ymm4[1],ymm1[3],ymm4[3] + vperm2i128 $0x20, %ymm3, %ymm2, %ymm7 # ymm7 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 # ymm13 = ymm2[2,3],ymm3[2,3] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm3 # ymm3 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm7, 0x160(%rsp) + vperm2i128 $0x20, %ymm1, %ymm0, %ymm7 # ymm7 = ymm0[0,1],ymm1[0,1] + vmovdqu 0xa0(%rdi), %ymm0 + vmovdqu 0x230(%rdi), %ymm1 + vmovdqu %ymm3, 0x1a0(%rsp) + vmovdqu 0x168(%rdi), %ymm3 + vpunpcklqdq %ymm5, %ymm1, %ymm4 # ymm4 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] + vmovdqu %ymm7, 0x180(%rsp) + vpunpcklqdq %ymm3, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] + vpunpckhqdq %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] + vperm2i128 $0x20, %ymm4, %ymm2, %ymm12 # ymm12 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x20, %ymm1, %ymm0, %ymm3 # ymm3 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm7 # ymm7 = ymm2[2,3],ymm4[2,3] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm4 # ymm4 = ymm0[2,3],ymm1[2,3] + vmovq 0x250(%rdi), %xmm0 + vmovq 0xc0(%rdi), %xmm1 + vmovdqu %ymm12, 0x1c0(%rsp) + vmovdqu %ymm4, 0x1e0(%rsp) + vpinsrq $0x1, 0x318(%rdi), %xmm0, %xmm0 + vpinsrq $0x1, 0x188(%rdi), %xmm1, %xmm1 + vinserti128 $0x1, %xmm0, %ymm1, %ymm2 + movq $0x0, %r10 + +LLkeccak_f1600_x4_avx2: + vmovdqu 0xa0(%rsp), %ymm4 + vpxor 0x1c0(%rsp), %ymm9, %ymm0 + vmovdqu %ymm9, 0x200(%rsp) + vmovdqu %ymm10, %ymm9 + vmovdqu 0xc0(%rsp), %ymm11 + vmovdqu 0x160(%rsp), %ymm12 + vmovdqu %ymm3, 0x240(%rsp) + vpxor 0x100(%rsp), %ymm4, %ymm1 + vmovdqu 0x40(%rsp), %ymm10 + vmovdqu %ymm4, 0x220(%rsp) + vpxor %ymm3, %ymm12, %ymm12 + vmovdqu 0x20(%rsp), %ymm6 + vmovdqu 0x140(%rsp), %ymm4 + vmovdqu %ymm14, 0x2a0(%rsp) + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm8, %ymm11, %ymm1 + vpxor 0x180(%rsp), %ymm7, %ymm11 + vmovdqu %ymm10, 0x280(%rsp) + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm15, %ymm9, %ymm1 + vmovdqu 0xe0(%rsp), %ymm3 + vmovdqu %ymm8, 0x260(%rsp) + vpxor 
%ymm1, %ymm11, %ymm11 + vpxor 0x120(%rsp), %ymm14, %ymm1 + vpxor %ymm6, %ymm12, %ymm12 + vmovdqu 0x60(%rsp), %ymm8 + vpxor %ymm10, %ymm11, %ymm11 + vpxor 0x1e0(%rsp), %ymm13, %ymm10 + vpxor %ymm4, %ymm3, %ymm3 + vmovdqu %ymm4, 0x2c0(%rsp) + vpsrlq $0x3f, %ymm12, %ymm4 + vpsrlq $0x3f, %ymm11, %ymm5 + vpxor (%rsp), %ymm0, %ymm0 + vpxor %ymm1, %ymm10, %ymm10 + vmovdqu 0x80(%rsp), %ymm1 + vpxor %ymm8, %ymm10, %ymm10 + vmovdqu %ymm1, %ymm14 + vpxor 0x1a0(%rsp), %ymm2, %ymm1 + vmovdqu %ymm14, 0x2e0(%rsp) + vpxor %ymm3, %ymm1, %ymm1 + vpsllq $0x1, %ymm12, %ymm3 + vpor %ymm4, %ymm3, %ymm3 + vpsllq $0x1, %ymm11, %ymm4 + vpxor %ymm14, %ymm1, %ymm1 + vpor %ymm5, %ymm4, %ymm4 + vpsrlq $0x3f, %ymm10, %ymm14 + vpxor %ymm1, %ymm3, %ymm3 + vpsllq $0x1, %ymm10, %ymm5 + vpxor %ymm0, %ymm4, %ymm4 + vpor %ymm14, %ymm5, %ymm5 + vpxor %ymm6, %ymm4, %ymm6 + vpxor %ymm12, %ymm5, %ymm5 + vpsrlq $0x3f, %ymm1, %ymm12 + vpsllq $0x1, %ymm1, %ymm1 + vpxor %ymm7, %ymm5, %ymm7 + vpxor %ymm9, %ymm5, %ymm9 + vpor %ymm12, %ymm1, %ymm1 + vpxor (%rsp), %ymm3, %ymm12 + vpxor %ymm11, %ymm1, %ymm1 + vpsrlq $0x3f, %ymm0, %ymm11 + vpsllq $0x1, %ymm0, %ymm0 + vpxor %ymm13, %ymm1, %ymm13 + vpxor %ymm8, %ymm1, %ymm8 + vpor %ymm11, %ymm0, %ymm0 + vpxor %ymm10, %ymm0, %ymm0 + vpxor 0xc0(%rsp), %ymm4, %ymm10 + vpxor %ymm2, %ymm0, %ymm2 + vpsrlq $0x14, %ymm10, %ymm11 + vpsllq $0x2c, %ymm10, %ymm10 + vpor %ymm11, %ymm10, %ymm10 + vpxor %ymm15, %ymm5, %ymm11 + vpbroadcastq (%rsi), %ymm15 + vpsrlq $0x15, %ymm11, %ymm14 + vpsllq $0x2b, %ymm11, %ymm11 + vpor %ymm14, %ymm11, %ymm11 + vpandn %ymm11, %ymm10, %ymm14 + vpxor %ymm15, %ymm14, %ymm14 + vpxor %ymm12, %ymm14, %ymm15 + vpsrlq $0x2b, %ymm13, %ymm14 + vpsllq $0x15, %ymm13, %ymm13 + vmovdqu %ymm15, (%rsp) + vpor %ymm14, %ymm13, %ymm13 + vpandn %ymm13, %ymm11, %ymm14 + vpxor %ymm10, %ymm14, %ymm15 + vpsrlq $0x32, %ymm2, %ymm14 + vpsllq $0xe, %ymm2, %ymm2 + vmovdqu %ymm15, 0x20(%rsp) + vpor %ymm14, %ymm2, %ymm2 + vpandn %ymm2, %ymm13, %ymm14 + vpxor %ymm11, %ymm14, 
%ymm11 + vmovdqu %ymm11, 0x40(%rsp) + vpandn %ymm12, %ymm2, %ymm11 + vpandn %ymm10, %ymm12, %ymm12 + vpxor %ymm13, %ymm11, %ymm11 + vmovdqu %ymm11, 0x60(%rsp) + vpxor %ymm2, %ymm12, %ymm11 + vpsrlq $0x24, %ymm8, %ymm2 + vpsllq $0x1c, %ymm8, %ymm8 + vmovdqu %ymm11, 0x80(%rsp) + vpor %ymm2, %ymm8, %ymm8 + vpxor 0xe0(%rsp), %ymm0, %ymm2 + vpsrlq $0x2c, %ymm2, %ymm10 + vpsllq $0x14, %ymm2, %ymm2 + vpor %ymm10, %ymm2, %ymm2 + vpxor 0x100(%rsp), %ymm3, %ymm10 + vpsrlq $0x3d, %ymm10, %ymm11 + vpsllq $0x3, %ymm10, %ymm10 + vpor %ymm11, %ymm10, %ymm10 + vpandn %ymm10, %ymm2, %ymm11 + vpxor %ymm8, %ymm11, %ymm11 + vmovdqu %ymm11, 0xa0(%rsp) + vpxor 0x160(%rsp), %ymm4, %ymm11 + vpsrlq $0x13, %ymm11, %ymm12 + vpsllq $0x2d, %ymm11, %ymm11 + vpor %ymm12, %ymm11, %ymm11 + vpandn %ymm11, %ymm10, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vmovdqu %ymm12, 0xc0(%rsp) + vpsrlq $0x3, %ymm7, %ymm12 + vpsllq $0x3d, %ymm7, %ymm7 + vpor %ymm12, %ymm7, %ymm7 + vpandn %ymm7, %ymm11, %ymm12 + vpxor %ymm10, %ymm12, %ymm10 + vpandn %ymm8, %ymm7, %ymm12 + vpandn %ymm2, %ymm8, %ymm8 + vpsrlq $0x3f, %ymm6, %ymm2 + vpsllq $0x1, %ymm6, %ymm6 + vpxor %ymm11, %ymm12, %ymm14 + vpor %ymm2, %ymm6, %ymm6 + vpsrlq $0x3a, %ymm9, %ymm2 + vpxor %ymm7, %ymm8, %ymm12 + vpsllq $0x6, %ymm9, %ymm9 + vmovdqu %ymm12, 0xe0(%rsp) + vpxor 0x1a0(%rsp), %ymm0, %ymm7 + vpor %ymm2, %ymm9, %ymm9 + vpxor 0x120(%rsp), %ymm1, %ymm2 + vpshufb (%rdx), %ymm7, %ymm7 + vpsrlq $0x27, %ymm2, %ymm11 + vpsllq $0x19, %ymm2, %ymm2 + vpor %ymm2, %ymm11, %ymm11 + vpandn %ymm11, %ymm9, %ymm2 + vpandn %ymm7, %ymm11, %ymm8 + vpxor %ymm6, %ymm2, %ymm12 + vpxor 0x1c0(%rsp), %ymm3, %ymm2 + vpxor %ymm9, %ymm8, %ymm8 + vmovdqu %ymm12, 0x100(%rsp) + vpsrlq $0x2e, %ymm2, %ymm12 + vpsllq $0x12, %ymm2, %ymm2 + vpor %ymm2, %ymm12, %ymm2 + vpandn %ymm2, %ymm7, %ymm12 + vpxor %ymm11, %ymm12, %ymm15 + vpandn %ymm6, %ymm2, %ymm11 + vpandn %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm11, %ymm12 + vmovdqu %ymm12, 0x120(%rsp) + vpxor %ymm2, %ymm6, %ymm12 + vpxor 
0x2e0(%rsp), %ymm0, %ymm6 + vpxor 0x2c0(%rsp), %ymm0, %ymm0 + vmovdqu %ymm12, 0x140(%rsp) + vpsrlq $0x25, %ymm6, %ymm2 + vpsllq $0x1b, %ymm6, %ymm6 + vpor %ymm6, %ymm2, %ymm2 + vpxor 0x220(%rsp), %ymm3, %ymm6 + vpxor 0x200(%rsp), %ymm3, %ymm3 + vpsrlq $0x1c, %ymm6, %ymm7 + vpsllq $0x24, %ymm6, %ymm6 + vpor %ymm6, %ymm7, %ymm7 + vpxor 0x260(%rsp), %ymm4, %ymm6 + vpxor 0x240(%rsp), %ymm4, %ymm4 + vpsrlq $0x36, %ymm6, %ymm12 + vpsllq $0xa, %ymm6, %ymm6 + vpor %ymm6, %ymm12, %ymm12 + vpxor 0x180(%rsp), %ymm5, %ymm6 + vpxor 0x280(%rsp), %ymm5, %ymm5 + vpandn %ymm12, %ymm7, %ymm9 + vpsrlq $0x31, %ymm6, %ymm11 + vpsllq $0xf, %ymm6, %ymm6 + vpxor %ymm2, %ymm9, %ymm9 + vpor %ymm6, %ymm11, %ymm11 + vpandn %ymm11, %ymm12, %ymm6 + vpxor %ymm7, %ymm6, %ymm6 + vmovdqu %ymm6, 0x160(%rsp) + vpxor 0x1e0(%rsp), %ymm1, %ymm6 + vpxor 0x2a0(%rsp), %ymm1, %ymm1 + vpshufb (%rcx), %ymm6, %ymm6 + vpandn %ymm6, %ymm11, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vmovdqu %ymm13, 0x180(%rsp) + vpandn %ymm2, %ymm6, %ymm13 + vpandn %ymm7, %ymm2, %ymm2 + vpxor %ymm6, %ymm2, %ymm2 + vpsrlq $0x3e, %ymm4, %ymm6 + vpxor %ymm11, %ymm13, %ymm13 + vmovdqu %ymm2, 0x1a0(%rsp) + vpsrlq $0x2, %ymm5, %ymm2 + vpsllq $0x3e, %ymm5, %ymm5 + vpor %ymm5, %ymm2, %ymm2 + vpsrlq $0x9, %ymm1, %ymm5 + vpsllq $0x37, %ymm1, %ymm1 + vpsllq $0x2, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm1 + vpsrlq $0x19, %ymm0, %ymm5 + vpor %ymm4, %ymm6, %ymm4 + vpsllq $0x27, %ymm0, %ymm0 + vpor %ymm0, %ymm5, %ymm5 + vpandn %ymm5, %ymm1, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vmovdqu %ymm0, 0x1c0(%rsp) + vpsrlq $0x17, %ymm3, %ymm0 + vpsllq $0x29, %ymm3, %ymm3 + vpor %ymm3, %ymm0, %ymm0 + vpandn %ymm4, %ymm0, %ymm7 + vpandn %ymm0, %ymm5, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpandn %ymm2, %ymm4, %ymm5 + vpandn %ymm1, %ymm2, %ymm2 + vpxor %ymm0, %ymm5, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm4, %ymm2, %ymm2 + vmovdqu %ymm5, 0x1e0(%rsp) + addq $0x8, %rsi + addq $0x1, %r10 + cmpq $0x18, %r10 + jne LLkeccak_f1600_x4_avx2 + vmovdqu (%rsp), %ymm4 + 
vmovdqu 0x40(%rsp), %ymm5 + vmovdqu 0x20(%rsp), %ymm0 + vmovdqu 0x60(%rsp), %ymm1 + vmovdqu 0x1c0(%rsp), %ymm12 + vmovdqu %ymm2, 0x1c0(%rsp) + vpunpcklqdq %ymm0, %ymm4, %ymm2 # ymm2 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] + vpunpckhqdq %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] + vpunpcklqdq %ymm1, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] + vpunpckhqdq %ymm1, %ymm5, %ymm1 # ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vmovdqu 0x80(%rsp), %ymm4 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm5 # ymm5 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm6, (%rdi) + vmovdqu %ymm5, 0xc8(%rdi) + vmovdqu %ymm2, 0x190(%rdi) + vmovdqu %ymm0, 0x258(%rdi) + vmovdqu 0xa0(%rsp), %ymm0 + vpunpcklqdq %ymm0, %ymm4, %ymm2 # ymm2 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] + vpunpckhqdq %ymm0, %ymm4, %ymm1 # ymm1 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] + vmovdqu 0xc0(%rsp), %ymm0 + vpunpcklqdq %ymm10, %ymm0, %ymm4 # ymm4 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 # ymm5 = ymm1[0,1],ymm0[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vmovdqu 0xe0(%rsp), %ymm4 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[2,3],ymm0[2,3] + vmovdqu 0x100(%rsp), %ymm0 + vmovdqu %ymm2, 0x1b0(%rdi) + vmovdqu %ymm1, 0x278(%rdi) + vpunpcklqdq %ymm4, %ymm14, %ymm2 # ymm2 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm14, %ymm1 # ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm0, %ymm4 # ymm4 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] + vmovdqu %ymm6, 0x20(%rdi) + vmovdqu %ymm5, 0xe8(%rdi) + vperm2i128 $0x20, %ymm4, 
%ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 # ymm5 = ymm1[0,1],ymm0[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[2,3],ymm0[2,3] + vmovdqu 0x120(%rsp), %ymm4 + vmovdqu 0x140(%rsp), %ymm0 + vmovdqu %ymm2, 0x1d0(%rdi) + vmovdqu %ymm1, 0x298(%rdi) + vpunpcklqdq %ymm4, %ymm15, %ymm2 # ymm2 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm15, %ymm1 # ymm1 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] + vpunpcklqdq %ymm9, %ymm0, %ymm4 # ymm4 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] + vmovdqu %ymm5, 0x108(%rdi) + vpunpckhqdq %ymm9, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] + vmovdqu %ymm6, 0x40(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vperm2i128 $0x20, %ymm0, %ymm1, %ymm5 # ymm5 = ymm1[0,1],ymm0[0,1] + vmovdqu 0x160(%rsp), %ymm4 + vperm2i128 $0x31, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[2,3],ymm0[2,3] + vmovdqu 0x180(%rsp), %ymm0 + vmovdqu %ymm5, 0x128(%rdi) + vmovdqu 0x1a0(%rsp), %ymm5 + vmovdqu %ymm2, 0x1f0(%rdi) + vpunpcklqdq %ymm0, %ymm4, %ymm2 # ymm2 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] + vpunpckhqdq %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] + vpunpcklqdq %ymm5, %ymm13, %ymm4 # ymm4 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] + vmovdqu %ymm6, 0x60(%rdi) + vperm2i128 $0x20, %ymm4, %ymm2, %ymm6 # ymm6 = ymm2[0,1],ymm4[0,1] + vmovdqu %ymm1, 0x2b8(%rdi) + vperm2i128 $0x31, %ymm4, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm4[2,3] + vpunpckhqdq %ymm5, %ymm13, %ymm1 # ymm1 = ymm13[1],ymm5[1],ymm13[3],ymm5[3] + vmovdqu %ymm6, 0x80(%rdi) + vmovdqu 0x1e0(%rsp), %ymm4 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm5 # ymm5 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm2, 0x210(%rdi) + vpunpcklqdq %ymm3, %ymm12, %ymm2 # ymm2 = ymm12[0],ymm3[0],ymm12[2],ymm3[2] + vmovdqu %ymm0, 0x2d8(%rdi) + vpunpckhqdq 
%ymm3, %ymm12, %ymm0 # ymm0 = ymm12[1],ymm3[1],ymm12[3],ymm3[3] + vpunpcklqdq %ymm4, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm7, %ymm1 # ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] + vmovdqu %ymm5, 0x148(%rdi) + vperm2i128 $0x20, %ymm3, %ymm2, %ymm5 # ymm5 = ymm2[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[2,3],ymm3[2,3] + vmovdqu 0x1c0(%rsp), %ymm3 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm4 # ymm4 = ymm0[0,1],ymm1[0,1] + vperm2i128 $0x31, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3] + vmovdqu %ymm5, 0xa0(%rdi) + vextracti128 $0x1, %ymm3, %xmm15 + vmovdqu %ymm4, 0x168(%rdi) + vmovdqu %ymm2, 0x230(%rdi) + vmovdqu %ymm0, 0x2f8(%rdi) + vmovq %xmm3, 0xc0(%rdi) + vmovhpd %xmm3, 0x188(%rdi) + vmovq %xmm15, 0x250(%rdi) + vmovhpd %xmm15, 0x318(%rdi) + addq $0x300, %rsp # imm = 0x300 + .cfi_adjust_cfa_offset -0x300 + retq + .cfi_endproc diff --git a/proofs/hol_light/x86_64/proofs/dump_bytecode.ml b/proofs/hol_light/x86_64/proofs/dump_bytecode.ml index b06bb2321..6c1758e4c 100644 --- a/proofs/hol_light/x86_64/proofs/dump_bytecode.ml +++ b/proofs/hol_light/x86_64/proofs/dump_bytecode.ml @@ -12,3 +12,7 @@ print_string "==== bytecode end =====================================\n\n";; print_string "=== bytecode start: x86_64/mldsa/mldsa_intt.o ================\n";; print_literal_from_elf "x86_64/mldsa/mldsa_intt.o";; print_string "==== bytecode end =====================================\n\n";; + +print_string "=== bytecode start: x86_64/mldsa/keccak_f1600_x4_avx2.o ================\n";; +print_literal_from_elf "x86_64/mldsa/keccak_f1600_x4_avx2.o";; +print_string "==== bytecode end =====================================\n\n";; diff --git a/proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2.ml b/proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2.ml new file mode 100644 index 000000000..5aa12cafd --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2.ml @@ -0,0 +1,1321 @@ +(* + * Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + + needs "x86/proofs/base.ml";; + + needs "x86_64/proofs/keccak_utils.ml";; + +(**** print_literal_from_elf "x86/sha3/keccak_f1600_x4_avx2.o";; +****) + +let keccak_f1600_x4_avx2_mc = define_assert_from_elf + "keccak_f1600_x4_avx2_mc" "x86_64/mldsa/keccak_f1600_x4_avx2.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0x48; 0x81; 0xec; 0x00; 0x03; 0x00; 0x00; + (* SUB (% rsp) (Imm32 (word 768)) *) + 0xc5; 0xfe; 0x6f; 0x07; (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0xfe; 0x6f; 0x9f; 0xc8; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rdi,200))) *) + 0xc5; 0xfe; 0x6f; 0x8f; 0x90; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rdi,400))) *) + 0xc5; 0xfe; 0x6f; 0xa7; 0x58; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rdi,600))) *) + 0xc5; 0xfd; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xf5; 0x6c; 0xdc; (* VPUNPCKLQDQ (%_% ymm3) (%_% ymm1) (%_% ymm4) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xfb; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm2) (%_% ymm3) (Imm8 (word 32)) *) + 0xc5; 0xf5; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm1) (%_% ymm4) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xdb; 0x31; + (* VPERM2I128 (%_% ymm3) (%_% ymm2) (%_% ymm3) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0xa7; 0x78; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rdi,632))) *) + 0xc5; 0xfe; 0x7f; 0x5c; 0x24; 0x40; + (* VMOVDQU (Memop Word256 (%% (rsp,64))) (%_% ymm3) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xd9; 0x31; + (* VPERM2I128 (%_% ymm3) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0x3c; 0x24; + (* VMOVDQU (Memop Word256 (%% (rsp,0))) (%_% ymm7) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xf9; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc5; 0xfe; 0x6f; 0x47; 0x20; + (* 
VMOVDQU (%_% ymm0) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xfe; 0x6f; 0x8f; 0xb0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rdi,432))) *) + 0xc5; 0xfe; 0x7f; 0x5c; 0x24; 0x60; + (* VMOVDQU (Memop Word256 (%% (rsp,96))) (%_% ymm3) *) + 0xc5; 0xfe; 0x6f; 0x9f; 0xe8; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rdi,232))) *) + 0xc5; 0xfe; 0x7f; 0x7c; 0x24; 0x20; + (* VMOVDQU (Memop Word256 (%% (rsp,32))) (%_% ymm7) *) + 0xc5; 0xfd; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xf5; 0x6c; 0xdc; (* VPUNPCKLQDQ (%_% ymm3) (%_% ymm1) (%_% ymm4) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xfb; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm2) (%_% ymm3) (Imm8 (word 32)) *) + 0xc5; 0xf5; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm1) (%_% ymm4) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xdb; 0x31; + (* VPERM2I128 (%_% ymm3) (%_% ymm2) (%_% ymm3) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0xa7; 0x98; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rdi,664))) *) + 0xc4; 0x63; 0x7d; 0x46; 0xf1; 0x31; + (* VPERM2I128 (%_% ymm14) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0xbc; 0x24; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,128))) (%_% ymm7) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xf9; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc5; 0xfe; 0x6f; 0x47; 0x40; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0xfe; 0x6f; 0x8f; 0xd0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rdi,464))) *) + 0xc5; 0xfe; 0x7f; 0x9c; 0x24; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,192))) (%_% ymm3) *) + 0xc5; 0xfe; 0x6f; 0x9f; 0x08; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rdi,264))) *) + 0xc4; 0x41; 0x7e; 0x6f; 0xd6; + (* VMOVDQU (%_% ymm10) (%_% ymm14) *) + 0xc5; 0xfe; 0x7f; 0xbc; 0x24; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% 
(rsp,160))) (%_% ymm7) *) + 0xc5; 0xfd; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xf5; 0x6c; 0xdc; (* VPUNPCKLQDQ (%_% ymm3) (%_% ymm1) (%_% ymm4) *) + 0xc5; 0xf5; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm1) (%_% ymm4) *) + 0xc4; 0x63; 0x6d; 0x46; 0xdb; 0x20; + (* VPERM2I128 (%_% ymm11) (%_% ymm2) (%_% ymm3) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xdb; 0x31; + (* VPERM2I128 (%_% ymm3) (%_% ymm2) (%_% ymm3) (Imm8 (word 49)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xf9; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc5; 0xfe; 0x7f; 0x9c; 0x24; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,256))) (%_% ymm3) *) + 0xc4; 0x63; 0x7d; 0x46; 0xc1; 0x31; + (* VPERM2I128 (%_% ymm8) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0x9f; 0x28; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rdi,296))) *) + 0xc5; 0xfe; 0x6f; 0x47; 0x60; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xfe; 0x6f; 0x8f; 0xf0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rdi,496))) *) + 0xc5; 0xfe; 0x7f; 0xbc; 0x24; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,224))) (%_% ymm7) *) + 0xc4; 0x41; 0x7e; 0x6f; 0xf3; + (* VMOVDQU (%_% ymm14) (%_% ymm11) *) + 0xc5; 0xfe; 0x6f; 0xa7; 0xb8; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rdi,696))) *) + 0xc5; 0xfe; 0x6f; 0xaf; 0xf8; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm5) (Memop Word256 (%% (rdi,760))) *) + 0xc5; 0xfd; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xf5; 0x6c; 0xdc; (* VPUNPCKLQDQ (%_% ymm3) (%_% ymm1) (%_% ymm4) *) + 0xc5; 0xf5; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm1) (%_% ymm4) *) + 0xc5; 0xfe; 0x6f; 0xa7; 0xd8; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rdi,728))) *) 
+ 0xc4; 0x63; 0x6d; 0x46; 0xfb; 0x20; + (* VPERM2I128 (%_% ymm15) (%_% ymm2) (%_% ymm3) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xdb; 0x31; + (* VPERM2I128 (%_% ymm3) (%_% ymm2) (%_% ymm3) (Imm8 (word 49)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xf9; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc4; 0x63; 0x7d; 0x46; 0xc9; 0x31; + (* VPERM2I128 (%_% ymm9) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0x9c; 0x24; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,320))) (%_% ymm3) *) + 0xc5; 0xfe; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0xfe; 0x6f; 0x9f; 0x48; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rdi,328))) *) + 0xc5; 0xfe; 0x6f; 0x8f; 0x10; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rdi,528))) *) + 0xc5; 0xfe; 0x7f; 0xbc; 0x24; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,288))) (%_% ymm7) *) + 0xc5; 0xfd; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xf5; 0x6c; 0xdc; (* VPUNPCKLQDQ (%_% ymm3) (%_% ymm1) (%_% ymm4) *) + 0xc5; 0xf5; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm1) (%_% ymm4) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xfb; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm2) (%_% ymm3) (Imm8 (word 32)) *) + 0xc4; 0x63; 0x6d; 0x46; 0xeb; 0x31; + (* VPERM2I128 (%_% ymm13) (%_% ymm2) (%_% ymm3) (Imm8 (word 49)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xd9; 0x31; + (* VPERM2I128 (%_% ymm3) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0xbc; 0x24; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,352))) (%_% ymm7) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xf9; 0x20; + (* VPERM2I128 (%_% ymm7) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc5; 0xfe; 0x6f; 0x87; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xfe; 0x6f; 0x8f; 0x30; 0x02; 0x00; 0x00; + (* 
VMOVDQU (%_% ymm1) (Memop Word256 (%% (rdi,560))) *) + 0xc5; 0xfe; 0x7f; 0x9c; 0x24; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,416))) (%_% ymm3) *) + 0xc5; 0xfe; 0x6f; 0x9f; 0x68; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rdi,360))) *) + 0xc5; 0xf5; 0x6c; 0xe5; (* VPUNPCKLQDQ (%_% ymm4) (%_% ymm1) (%_% ymm5) *) + 0xc5; 0xf5; 0x6d; 0xcd; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm1) (%_% ymm5) *) + 0xc5; 0xfe; 0x7f; 0xbc; 0x24; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,384))) (%_% ymm7) *) + 0xc5; 0xfd; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc4; 0x63; 0x6d; 0x46; 0xe4; 0x20; + (* VPERM2I128 (%_% ymm12) (%_% ymm2) (%_% ymm4) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xd9; 0x20; + (* VPERM2I128 (%_% ymm3) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xfc; 0x31; + (* VPERM2I128 (%_% ymm7) (%_% ymm2) (%_% ymm4) (Imm8 (word 49)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xe1; 0x31; + (* VPERM2I128 (%_% ymm4) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfa; 0x7e; 0x87; 0x50; 0x02; 0x00; 0x00; + (* VMOVQ (%_% xmm0) (Memop Quadword (%% (rdi,592))) *) + 0xc5; 0xfa; 0x7e; 0x8f; 0xc0; 0x00; 0x00; 0x00; + (* VMOVQ (%_% xmm1) (Memop Quadword (%% (rdi,192))) *) + 0xc5; 0x7e; 0x7f; 0xa4; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,448))) (%_% ymm12) *) + 0xc5; 0xfe; 0x7f; 0xa4; 0x24; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,480))) (%_% ymm4) *) + 0xc4; 0xe3; 0xf9; 0x22; 0x87; 0x18; 0x03; 0x00; 0x00; 0x01; + (* VPINSRQ (%_% xmm0) (%_% xmm0) (Memop Quadword (%% (rdi,792))) (Imm8 (word 1)) *) + 0xc4; 0xe3; 0xf1; 0x22; 0x8f; 0x88; 0x01; 0x00; 0x00; 0x01; + (* VPINSRQ (%_% xmm1) (%_% xmm1) (Memop Quadword (%% (rdi,392))) (Imm8 (word 1)) *) + 0xc4; 0xe3; 0x75; 0x38; 0xd0; 0x01; + (* VINSERTI128 (%_% ymm2) (%_% ymm1) (%_% xmm0) (Imm8 (word 1)) *) + 0x49; 0xc7; 0xc2; 0x00; 
0x00; 0x00; 0x00; + (* MOV (% r10) (Imm32 (word 0)) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,160))) *) + 0xc5; 0xb5; 0xef; 0x84; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm0) (%_% ymm9) (Memop Word256 (%% (rsp,448))) *) + 0xc5; 0x7e; 0x7f; 0x8c; 0x24; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,512))) (%_% ymm9) *) + 0xc4; 0x41; 0x7e; 0x6f; 0xca; + (* VMOVDQU (%_% ymm9) (%_% ymm10) *) + 0xc5; 0x7e; 0x6f; 0x9c; 0x24; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm11) (Memop Word256 (%% (rsp,192))) *) + 0xc5; 0x7e; 0x6f; 0xa4; 0x24; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm12) (Memop Word256 (%% (rsp,352))) *) + 0xc5; 0xfe; 0x7f; 0x9c; 0x24; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,576))) (%_% ymm3) *) + 0xc5; 0xdd; 0xef; 0x8c; 0x24; 0x00; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm1) (%_% ymm4) (Memop Word256 (%% (rsp,256))) *) + 0xc5; 0x7e; 0x6f; 0x54; 0x24; 0x40; + (* VMOVDQU (%_% ymm10) (Memop Word256 (%% (rsp,64))) *) + 0xc5; 0xfe; 0x7f; 0xa4; 0x24; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,544))) (%_% ymm4) *) + 0xc5; 0x1d; 0xef; 0xe3; (* VPXOR (%_% ymm12) (%_% ymm12) (%_% ymm3) *) + 0xc5; 0xfe; 0x6f; 0x74; 0x24; 0x20; + (* VMOVDQU (%_% ymm6) (Memop Word256 (%% (rsp,32))) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,320))) *) + 0xc5; 0x7e; 0x7f; 0xb4; 0x24; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,672))) (%_% ymm14) *) + 0xc5; 0xfd; 0xef; 0xc1; (* VPXOR (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xc1; 0x25; 0xef; 0xc8; + (* VPXOR (%_% ymm1) (%_% ymm11) (%_% ymm8) *) + 0xc5; 0x45; 0xef; 0x9c; 0x24; 0x80; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm11) (%_% ymm7) (Memop Word256 (%% (rsp,384))) *) + 0xc5; 0x7e; 0x7f; 0x94; 0x24; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,640))) (%_% ymm10) *) + 0xc5; 0x1d; 0xef; 0xe1; (* VPXOR (%_% ymm12) (%_% ymm12) 
(%_% ymm1) *) + 0xc4; 0xc1; 0x35; 0xef; 0xcf; + (* VPXOR (%_% ymm1) (%_% ymm9) (%_% ymm15) *) + 0xc5; 0xfe; 0x6f; 0x9c; 0x24; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rsp,224))) *) + 0xc5; 0x7e; 0x7f; 0x84; 0x24; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,608))) (%_% ymm8) *) + 0xc5; 0x25; 0xef; 0xd9; (* VPXOR (%_% ymm11) (%_% ymm11) (%_% ymm1) *) + 0xc5; 0x8d; 0xef; 0x8c; 0x24; 0x20; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm1) (%_% ymm14) (Memop Word256 (%% (rsp,288))) *) + 0xc5; 0x1d; 0xef; 0xe6; (* VPXOR (%_% ymm12) (%_% ymm12) (%_% ymm6) *) + 0xc5; 0x7e; 0x6f; 0x44; 0x24; 0x60; + (* VMOVDQU (%_% ymm8) (Memop Word256 (%% (rsp,96))) *) + 0xc4; 0x41; 0x25; 0xef; 0xda; + (* VPXOR (%_% ymm11) (%_% ymm11) (%_% ymm10) *) + 0xc5; 0x15; 0xef; 0x94; 0x24; 0xe0; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm10) (%_% ymm13) (Memop Word256 (%% (rsp,480))) *) + 0xc5; 0xe5; 0xef; 0xdc; (* VPXOR (%_% ymm3) (%_% ymm3) (%_% ymm4) *) + 0xc5; 0xfe; 0x7f; 0xa4; 0x24; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,704))) (%_% ymm4) *) + 0xc4; 0xc1; 0x5d; 0x73; 0xd4; 0x3f; + (* VPSRLQ (%_% ymm4) (%_% ymm12) (Imm8 (word 63)) *) + 0xc4; 0xc1; 0x55; 0x73; 0xd3; 0x3f; + (* VPSRLQ (%_% ymm5) (%_% ymm11) (Imm8 (word 63)) *) + 0xc5; 0xfd; 0xef; 0x04; 0x24; + (* VPXOR (%_% ymm0) (%_% ymm0) (Memop Word256 (%% (rsp,0))) *) + 0xc5; 0x2d; 0xef; 0xd1; (* VPXOR (%_% ymm10) (%_% ymm10) (%_% ymm1) *) + 0xc5; 0xfe; 0x6f; 0x8c; 0x24; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rsp,128))) *) + 0xc4; 0x41; 0x2d; 0xef; 0xd0; + (* VPXOR (%_% ymm10) (%_% ymm10) (%_% ymm8) *) + 0xc5; 0x7e; 0x6f; 0xf1; (* VMOVDQU (%_% ymm14) (%_% ymm1) *) + 0xc5; 0xed; 0xef; 0x8c; 0x24; 0xa0; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm1) (%_% ymm2) (Memop Word256 (%% (rsp,416))) *) + 0xc5; 0x7e; 0x7f; 0xb4; 0x24; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,736))) (%_% ymm14) *) + 0xc5; 0xf5; 0xef; 0xcb; (* VPXOR (%_% ymm1) (%_% ymm1) (%_% ymm3) 
*) + 0xc4; 0xc1; 0x65; 0x73; 0xf4; 0x01; + (* VPSLLQ (%_% ymm3) (%_% ymm12) (Imm8 (word 1)) *) + 0xc5; 0xe5; 0xeb; 0xdc; (* VPOR (%_% ymm3) (%_% ymm3) (%_% ymm4) *) + 0xc4; 0xc1; 0x5d; 0x73; 0xf3; 0x01; + (* VPSLLQ (%_% ymm4) (%_% ymm11) (Imm8 (word 1)) *) + 0xc4; 0xc1; 0x75; 0xef; 0xce; + (* VPXOR (%_% ymm1) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xdd; 0xeb; 0xe5; (* VPOR (%_% ymm4) (%_% ymm4) (%_% ymm5) *) + 0xc4; 0xc1; 0x0d; 0x73; 0xd2; 0x3f; + (* VPSRLQ (%_% ymm14) (%_% ymm10) (Imm8 (word 63)) *) + 0xc5; 0xe5; 0xef; 0xd9; (* VPXOR (%_% ymm3) (%_% ymm3) (%_% ymm1) *) + 0xc4; 0xc1; 0x55; 0x73; 0xf2; 0x01; + (* VPSLLQ (%_% ymm5) (%_% ymm10) (Imm8 (word 1)) *) + 0xc5; 0xdd; 0xef; 0xe0; (* VPXOR (%_% ymm4) (%_% ymm4) (%_% ymm0) *) + 0xc4; 0xc1; 0x55; 0xeb; 0xee; + (* VPOR (%_% ymm5) (%_% ymm5) (%_% ymm14) *) + 0xc5; 0xdd; 0xef; 0xf6; (* VPXOR (%_% ymm6) (%_% ymm4) (%_% ymm6) *) + 0xc4; 0xc1; 0x55; 0xef; 0xec; + (* VPXOR (%_% ymm5) (%_% ymm5) (%_% ymm12) *) + 0xc5; 0x9d; 0x73; 0xd1; 0x3f; + (* VPSRLQ (%_% ymm12) (%_% ymm1) (Imm8 (word 63)) *) + 0xc5; 0xf5; 0x73; 0xf1; 0x01; + (* VPSLLQ (%_% ymm1) (%_% ymm1) (Imm8 (word 1)) *) + 0xc5; 0xd5; 0xef; 0xff; (* VPXOR (%_% ymm7) (%_% ymm5) (%_% ymm7) *) + 0xc4; 0x41; 0x55; 0xef; 0xc9; + (* VPXOR (%_% ymm9) (%_% ymm5) (%_% ymm9) *) + 0xc4; 0xc1; 0x75; 0xeb; 0xcc; + (* VPOR (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc5; 0x65; 0xef; 0x24; 0x24; + (* VPXOR (%_% ymm12) (%_% ymm3) (Memop Word256 (%% (rsp,0))) *) + 0xc4; 0xc1; 0x75; 0xef; 0xcb; + (* VPXOR (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc5; 0xa5; 0x73; 0xd0; 0x3f; + (* VPSRLQ (%_% ymm11) (%_% ymm0) (Imm8 (word 63)) *) + 0xc5; 0xfd; 0x73; 0xf0; 0x01; + (* VPSLLQ (%_% ymm0) (%_% ymm0) (Imm8 (word 1)) *) + 0xc4; 0x41; 0x75; 0xef; 0xed; + (* VPXOR (%_% ymm13) (%_% ymm1) (%_% ymm13) *) + 0xc4; 0x41; 0x75; 0xef; 0xc0; + (* VPXOR (%_% ymm8) (%_% ymm1) (%_% ymm8) *) + 0xc4; 0xc1; 0x7d; 0xeb; 0xc3; + (* VPOR (%_% ymm0) (%_% ymm0) (%_% ymm11) *) + 0xc4; 0xc1; 0x7d; 0xef; 0xc2; + (* VPXOR 
(%_% ymm0) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xef; 0x94; 0x24; 0xc0; 0x00; 0x00; 0x00; + (* VPXOR (%_% ymm10) (%_% ymm4) (Memop Word256 (%% (rsp,192))) *) + 0xc5; 0xfd; 0xef; 0xd2; (* VPXOR (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc4; 0xc1; 0x25; 0x73; 0xd2; 0x14; + (* VPSRLQ (%_% ymm11) (%_% ymm10) (Imm8 (word 20)) *) + 0xc4; 0xc1; 0x2d; 0x73; 0xf2; 0x2c; + (* VPSLLQ (%_% ymm10) (%_% ymm10) (Imm8 (word 44)) *) + 0xc4; 0x41; 0x2d; 0xeb; 0xd3; + (* VPOR (%_% ymm10) (%_% ymm10) (%_% ymm11) *) + 0xc4; 0x41; 0x55; 0xef; 0xdf; + (* VPXOR (%_% ymm11) (%_% ymm5) (%_% ymm15) *) + 0xc4; 0x62; 0x7d; 0x59; 0x3e; + (* VPBROADCASTQ (%_% ymm15) (Memop Quadword (%% (rsi,0))) *) + 0xc4; 0xc1; 0x0d; 0x73; 0xd3; 0x15; + (* VPSRLQ (%_% ymm14) (%_% ymm11) (Imm8 (word 21)) *) + 0xc4; 0xc1; 0x25; 0x73; 0xf3; 0x2b; + (* VPSLLQ (%_% ymm11) (%_% ymm11) (Imm8 (word 43)) *) + 0xc4; 0x41; 0x25; 0xeb; 0xde; + (* VPOR (%_% ymm11) (%_% ymm11) (%_% ymm14) *) + 0xc4; 0x41; 0x2d; 0xdf; 0xf3; + (* VPANDN (%_% ymm14) (%_% ymm10) (%_% ymm11) *) + 0xc4; 0x41; 0x0d; 0xef; 0xf7; + (* VPXOR (%_% ymm14) (%_% ymm14) (%_% ymm15) *) + 0xc4; 0x41; 0x0d; 0xef; 0xfc; + (* VPXOR (%_% ymm15) (%_% ymm14) (%_% ymm12) *) + 0xc4; 0xc1; 0x0d; 0x73; 0xd5; 0x2b; + (* VPSRLQ (%_% ymm14) (%_% ymm13) (Imm8 (word 43)) *) + 0xc4; 0xc1; 0x15; 0x73; 0xf5; 0x15; + (* VPSLLQ (%_% ymm13) (%_% ymm13) (Imm8 (word 21)) *) + 0xc5; 0x7e; 0x7f; 0x3c; 0x24; + (* VMOVDQU (Memop Word256 (%% (rsp,0))) (%_% ymm15) *) + 0xc4; 0x41; 0x15; 0xeb; 0xee; + (* VPOR (%_% ymm13) (%_% ymm13) (%_% ymm14) *) + 0xc4; 0x41; 0x25; 0xdf; 0xf5; + (* VPANDN (%_% ymm14) (%_% ymm11) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xef; 0xfa; + (* VPXOR (%_% ymm15) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0x8d; 0x73; 0xd2; 0x32; + (* VPSRLQ (%_% ymm14) (%_% ymm2) (Imm8 (word 50)) *) + 0xc5; 0xed; 0x73; 0xf2; 0x0e; + (* VPSLLQ (%_% ymm2) (%_% ymm2) (Imm8 (word 14)) *) + 0xc5; 0x7e; 0x7f; 0x7c; 0x24; 0x20; + (* VMOVDQU (Memop Word256 (%% (rsp,32))) (%_% ymm15) *) + 0xc4; 0xc1; 
0x6d; 0xeb; 0xd6; + (* VPOR (%_% ymm2) (%_% ymm2) (%_% ymm14) *) + 0xc5; 0x15; 0xdf; 0xf2; (* VPANDN (%_% ymm14) (%_% ymm13) (%_% ymm2) *) + 0xc4; 0x41; 0x0d; 0xef; 0xdb; + (* VPXOR (%_% ymm11) (%_% ymm14) (%_% ymm11) *) + 0xc5; 0x7e; 0x7f; 0x5c; 0x24; 0x40; + (* VMOVDQU (Memop Word256 (%% (rsp,64))) (%_% ymm11) *) + 0xc4; 0x41; 0x6d; 0xdf; 0xdc; + (* VPANDN (%_% ymm11) (%_% ymm2) (%_% ymm12) *) + 0xc4; 0x41; 0x1d; 0xdf; 0xe2; + (* VPANDN (%_% ymm12) (%_% ymm12) (%_% ymm10) *) + 0xc4; 0x41; 0x25; 0xef; 0xdd; + (* VPXOR (%_% ymm11) (%_% ymm11) (%_% ymm13) *) + 0xc5; 0x7e; 0x7f; 0x5c; 0x24; 0x60; + (* VMOVDQU (Memop Word256 (%% (rsp,96))) (%_% ymm11) *) + 0xc5; 0x1d; 0xef; 0xda; (* VPXOR (%_% ymm11) (%_% ymm12) (%_% ymm2) *) + 0xc4; 0xc1; 0x6d; 0x73; 0xd0; 0x24; + (* VPSRLQ (%_% ymm2) (%_% ymm8) (Imm8 (word 36)) *) + 0xc4; 0xc1; 0x3d; 0x73; 0xf0; 0x1c; + (* VPSLLQ (%_% ymm8) (%_% ymm8) (Imm8 (word 28)) *) + 0xc5; 0x7e; 0x7f; 0x9c; 0x24; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,128))) (%_% ymm11) *) + 0xc5; 0x3d; 0xeb; 0xc2; (* VPOR (%_% ymm8) (%_% ymm8) (%_% ymm2) *) + 0xc5; 0xfd; 0xef; 0x94; 0x24; 0xe0; 0x00; 0x00; 0x00; + (* VPXOR (%_% ymm2) (%_% ymm0) (Memop Word256 (%% (rsp,224))) *) + 0xc5; 0xad; 0x73; 0xd2; 0x2c; + (* VPSRLQ (%_% ymm10) (%_% ymm2) (Imm8 (word 44)) *) + 0xc5; 0xed; 0x73; 0xf2; 0x14; + (* VPSLLQ (%_% ymm2) (%_% ymm2) (Imm8 (word 20)) *) + 0xc4; 0xc1; 0x6d; 0xeb; 0xd2; + (* VPOR (%_% ymm2) (%_% ymm2) (%_% ymm10) *) + 0xc5; 0x65; 0xef; 0x94; 0x24; 0x00; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm10) (%_% ymm3) (Memop Word256 (%% (rsp,256))) *) + 0xc4; 0xc1; 0x25; 0x73; 0xd2; 0x3d; + (* VPSRLQ (%_% ymm11) (%_% ymm10) (Imm8 (word 61)) *) + 0xc4; 0xc1; 0x2d; 0x73; 0xf2; 0x03; + (* VPSLLQ (%_% ymm10) (%_% ymm10) (Imm8 (word 3)) *) + 0xc4; 0x41; 0x2d; 0xeb; 0xd3; + (* VPOR (%_% ymm10) (%_% ymm10) (%_% ymm11) *) + 0xc4; 0x41; 0x6d; 0xdf; 0xda; + (* VPANDN (%_% ymm11) (%_% ymm2) (%_% ymm10) *) + 0xc4; 0x41; 0x25; 0xef; 0xd8; + (* VPXOR (%_% 
ymm11) (%_% ymm11) (%_% ymm8) *) + 0xc5; 0x7e; 0x7f; 0x9c; 0x24; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,160))) (%_% ymm11) *) + 0xc5; 0x5d; 0xef; 0x9c; 0x24; 0x60; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm11) (%_% ymm4) (Memop Word256 (%% (rsp,352))) *) + 0xc4; 0xc1; 0x1d; 0x73; 0xd3; 0x13; + (* VPSRLQ (%_% ymm12) (%_% ymm11) (Imm8 (word 19)) *) + 0xc4; 0xc1; 0x25; 0x73; 0xf3; 0x2d; + (* VPSLLQ (%_% ymm11) (%_% ymm11) (Imm8 (word 45)) *) + 0xc4; 0x41; 0x25; 0xeb; 0xdc; + (* VPOR (%_% ymm11) (%_% ymm11) (%_% ymm12) *) + 0xc4; 0x41; 0x2d; 0xdf; 0xe3; + (* VPANDN (%_% ymm12) (%_% ymm10) (%_% ymm11) *) + 0xc5; 0x1d; 0xef; 0xe2; (* VPXOR (%_% ymm12) (%_% ymm12) (%_% ymm2) *) + 0xc5; 0x7e; 0x7f; 0xa4; 0x24; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,192))) (%_% ymm12) *) + 0xc5; 0x9d; 0x73; 0xd7; 0x03; + (* VPSRLQ (%_% ymm12) (%_% ymm7) (Imm8 (word 3)) *) + 0xc5; 0xc5; 0x73; 0xf7; 0x3d; + (* VPSLLQ (%_% ymm7) (%_% ymm7) (Imm8 (word 61)) *) + 0xc4; 0xc1; 0x45; 0xeb; 0xfc; + (* VPOR (%_% ymm7) (%_% ymm7) (%_% ymm12) *) + 0xc5; 0x25; 0xdf; 0xe7; (* VPANDN (%_% ymm12) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xef; 0xd2; + (* VPXOR (%_% ymm10) (%_% ymm12) (%_% ymm10) *) + 0xc4; 0x41; 0x45; 0xdf; 0xe0; + (* VPANDN (%_% ymm12) (%_% ymm7) (%_% ymm8) *) + 0xc5; 0x3d; 0xdf; 0xc2; (* VPANDN (%_% ymm8) (%_% ymm8) (%_% ymm2) *) + 0xc5; 0xed; 0x73; 0xd6; 0x3f; + (* VPSRLQ (%_% ymm2) (%_% ymm6) (Imm8 (word 63)) *) + 0xc5; 0xcd; 0x73; 0xf6; 0x01; + (* VPSLLQ (%_% ymm6) (%_% ymm6) (Imm8 (word 1)) *) + 0xc4; 0x41; 0x1d; 0xef; 0xf3; + (* VPXOR (%_% ymm14) (%_% ymm12) (%_% ymm11) *) + 0xc5; 0xcd; 0xeb; 0xf2; (* VPOR (%_% ymm6) (%_% ymm6) (%_% ymm2) *) + 0xc4; 0xc1; 0x6d; 0x73; 0xd1; 0x3a; + (* VPSRLQ (%_% ymm2) (%_% ymm9) (Imm8 (word 58)) *) + 0xc5; 0x3d; 0xef; 0xe7; (* VPXOR (%_% ymm12) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0xc1; 0x35; 0x73; 0xf1; 0x06; + (* VPSLLQ (%_% ymm9) (%_% ymm9) (Imm8 (word 6)) *) + 0xc5; 0x7e; 0x7f; 0xa4; 0x24; 0xe0; 0x00; 0x00; 
0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,224))) (%_% ymm12) *) + 0xc5; 0xfd; 0xef; 0xbc; 0x24; 0xa0; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm7) (%_% ymm0) (Memop Word256 (%% (rsp,416))) *) + 0xc5; 0x35; 0xeb; 0xca; (* VPOR (%_% ymm9) (%_% ymm9) (%_% ymm2) *) + 0xc5; 0xf5; 0xef; 0x94; 0x24; 0x20; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm2) (%_% ymm1) (Memop Word256 (%% (rsp,288))) *) + 0xc4; 0xe2; 0x45; 0x00; 0x3a; + (* VPSHUFB (%_% ymm7) (%_% ymm7) (Memop Word256 (%% (rdx,0))) *) + 0xc5; 0xa5; 0x73; 0xd2; 0x27; + (* VPSRLQ (%_% ymm11) (%_% ymm2) (Imm8 (word 39)) *) + 0xc5; 0xed; 0x73; 0xf2; 0x19; + (* VPSLLQ (%_% ymm2) (%_% ymm2) (Imm8 (word 25)) *) + 0xc5; 0x25; 0xeb; 0xda; (* VPOR (%_% ymm11) (%_% ymm11) (%_% ymm2) *) + 0xc4; 0xc1; 0x35; 0xdf; 0xd3; + (* VPANDN (%_% ymm2) (%_% ymm9) (%_% ymm11) *) + 0xc5; 0x25; 0xdf; 0xc7; (* VPANDN (%_% ymm8) (%_% ymm11) (%_% ymm7) *) + 0xc5; 0x6d; 0xef; 0xe6; (* VPXOR (%_% ymm12) (%_% ymm2) (%_% ymm6) *) + 0xc5; 0xe5; 0xef; 0x94; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm2) (%_% ymm3) (Memop Word256 (%% (rsp,448))) *) + 0xc4; 0x41; 0x3d; 0xef; 0xc1; + (* VPXOR (%_% ymm8) (%_% ymm8) (%_% ymm9) *) + 0xc5; 0x7e; 0x7f; 0xa4; 0x24; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,256))) (%_% ymm12) *) + 0xc5; 0x9d; 0x73; 0xd2; 0x2e; + (* VPSRLQ (%_% ymm12) (%_% ymm2) (Imm8 (word 46)) *) + 0xc5; 0xed; 0x73; 0xf2; 0x12; + (* VPSLLQ (%_% ymm2) (%_% ymm2) (Imm8 (word 18)) *) + 0xc5; 0x9d; 0xeb; 0xd2; (* VPOR (%_% ymm2) (%_% ymm12) (%_% ymm2) *) + 0xc5; 0x45; 0xdf; 0xe2; (* VPANDN (%_% ymm12) (%_% ymm7) (%_% ymm2) *) + 0xc4; 0x41; 0x1d; 0xef; 0xfb; + (* VPXOR (%_% ymm15) (%_% ymm12) (%_% ymm11) *) + 0xc5; 0x6d; 0xdf; 0xde; (* VPANDN (%_% ymm11) (%_% ymm2) (%_% ymm6) *) + 0xc4; 0xc1; 0x4d; 0xdf; 0xf1; + (* VPANDN (%_% ymm6) (%_% ymm6) (%_% ymm9) *) + 0xc5; 0x25; 0xef; 0xe7; (* VPXOR (%_% ymm12) (%_% ymm11) (%_% ymm7) *) + 0xc5; 0x7e; 0x7f; 0xa4; 0x24; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,288))) (%_% 
ymm12) *) + 0xc5; 0x4d; 0xef; 0xe2; (* VPXOR (%_% ymm12) (%_% ymm6) (%_% ymm2) *) + 0xc5; 0xfd; 0xef; 0xb4; 0x24; 0xe0; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm6) (%_% ymm0) (Memop Word256 (%% (rsp,736))) *) + 0xc5; 0xfd; 0xef; 0x84; 0x24; 0xc0; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm0) (%_% ymm0) (Memop Word256 (%% (rsp,704))) *) + 0xc5; 0x7e; 0x7f; 0xa4; 0x24; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,320))) (%_% ymm12) *) + 0xc5; 0xed; 0x73; 0xd6; 0x25; + (* VPSRLQ (%_% ymm2) (%_% ymm6) (Imm8 (word 37)) *) + 0xc5; 0xcd; 0x73; 0xf6; 0x1b; + (* VPSLLQ (%_% ymm6) (%_% ymm6) (Imm8 (word 27)) *) + 0xc5; 0xed; 0xeb; 0xd6; (* VPOR (%_% ymm2) (%_% ymm2) (%_% ymm6) *) + 0xc5; 0xe5; 0xef; 0xb4; 0x24; 0x20; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm6) (%_% ymm3) (Memop Word256 (%% (rsp,544))) *) + 0xc5; 0xe5; 0xef; 0x9c; 0x24; 0x00; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm3) (%_% ymm3) (Memop Word256 (%% (rsp,512))) *) + 0xc5; 0xc5; 0x73; 0xd6; 0x1c; + (* VPSRLQ (%_% ymm7) (%_% ymm6) (Imm8 (word 28)) *) + 0xc5; 0xcd; 0x73; 0xf6; 0x24; + (* VPSLLQ (%_% ymm6) (%_% ymm6) (Imm8 (word 36)) *) + 0xc5; 0xc5; 0xeb; 0xfe; (* VPOR (%_% ymm7) (%_% ymm7) (%_% ymm6) *) + 0xc5; 0xdd; 0xef; 0xb4; 0x24; 0x60; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm6) (%_% ymm4) (Memop Word256 (%% (rsp,608))) *) + 0xc5; 0xdd; 0xef; 0xa4; 0x24; 0x40; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm4) (%_% ymm4) (Memop Word256 (%% (rsp,576))) *) + 0xc5; 0x9d; 0x73; 0xd6; 0x36; + (* VPSRLQ (%_% ymm12) (%_% ymm6) (Imm8 (word 54)) *) + 0xc5; 0xcd; 0x73; 0xf6; 0x0a; + (* VPSLLQ (%_% ymm6) (%_% ymm6) (Imm8 (word 10)) *) + 0xc5; 0x1d; 0xeb; 0xe6; (* VPOR (%_% ymm12) (%_% ymm12) (%_% ymm6) *) + 0xc5; 0xd5; 0xef; 0xb4; 0x24; 0x80; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm6) (%_% ymm5) (Memop Word256 (%% (rsp,384))) *) + 0xc5; 0xd5; 0xef; 0xac; 0x24; 0x80; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm5) (%_% ymm5) (Memop Word256 (%% (rsp,640))) *) + 0xc4; 0x41; 0x45; 0xdf; 0xcc; + (* VPANDN (%_% ymm9) (%_% ymm7) (%_% ymm12) *) + 0xc5; 
0xa5; 0x73; 0xd6; 0x31; + (* VPSRLQ (%_% ymm11) (%_% ymm6) (Imm8 (word 49)) *) + 0xc5; 0xcd; 0x73; 0xf6; 0x0f; + (* VPSLLQ (%_% ymm6) (%_% ymm6) (Imm8 (word 15)) *) + 0xc5; 0x35; 0xef; 0xca; (* VPXOR (%_% ymm9) (%_% ymm9) (%_% ymm2) *) + 0xc5; 0x25; 0xeb; 0xde; (* VPOR (%_% ymm11) (%_% ymm11) (%_% ymm6) *) + 0xc4; 0xc1; 0x1d; 0xdf; 0xf3; + (* VPANDN (%_% ymm6) (%_% ymm12) (%_% ymm11) *) + 0xc5; 0xcd; 0xef; 0xf7; (* VPXOR (%_% ymm6) (%_% ymm6) (%_% ymm7) *) + 0xc5; 0xfe; 0x7f; 0xb4; 0x24; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,352))) (%_% ymm6) *) + 0xc5; 0xf5; 0xef; 0xb4; 0x24; 0xe0; 0x01; 0x00; 0x00; + (* VPXOR (%_% ymm6) (%_% ymm1) (Memop Word256 (%% (rsp,480))) *) + 0xc5; 0xf5; 0xef; 0x8c; 0x24; 0xa0; 0x02; 0x00; 0x00; + (* VPXOR (%_% ymm1) (%_% ymm1) (Memop Word256 (%% (rsp,672))) *) + 0xc4; 0xe2; 0x4d; 0x00; 0x31; + (* VPSHUFB (%_% ymm6) (%_% ymm6) (Memop Word256 (%% (rcx,0))) *) + 0xc5; 0x25; 0xdf; 0xee; (* VPANDN (%_% ymm13) (%_% ymm11) (%_% ymm6) *) + 0xc4; 0x41; 0x15; 0xef; 0xec; + (* VPXOR (%_% ymm13) (%_% ymm13) (%_% ymm12) *) + 0xc5; 0x7e; 0x7f; 0xac; 0x24; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,384))) (%_% ymm13) *) + 0xc5; 0x4d; 0xdf; 0xea; (* VPANDN (%_% ymm13) (%_% ymm6) (%_% ymm2) *) + 0xc5; 0xed; 0xdf; 0xd7; (* VPANDN (%_% ymm2) (%_% ymm2) (%_% ymm7) *) + 0xc5; 0xed; 0xef; 0xd6; (* VPXOR (%_% ymm2) (%_% ymm2) (%_% ymm6) *) + 0xc5; 0xcd; 0x73; 0xd4; 0x3e; + (* VPSRLQ (%_% ymm6) (%_% ymm4) (Imm8 (word 62)) *) + 0xc4; 0x41; 0x15; 0xef; 0xeb; + (* VPXOR (%_% ymm13) (%_% ymm13) (%_% ymm11) *) + 0xc5; 0xfe; 0x7f; 0x94; 0x24; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,416))) (%_% ymm2) *) + 0xc5; 0xed; 0x73; 0xd5; 0x02; + (* VPSRLQ (%_% ymm2) (%_% ymm5) (Imm8 (word 2)) *) + 0xc5; 0xd5; 0x73; 0xf5; 0x3e; + (* VPSLLQ (%_% ymm5) (%_% ymm5) (Imm8 (word 62)) *) + 0xc5; 0xed; 0xeb; 0xd5; (* VPOR (%_% ymm2) (%_% ymm2) (%_% ymm5) *) + 0xc5; 0xd5; 0x73; 0xd1; 0x09; + (* VPSRLQ (%_% ymm5) (%_% ymm1) 
(Imm8 (word 9)) *) + 0xc5; 0xf5; 0x73; 0xf1; 0x37; + (* VPSLLQ (%_% ymm1) (%_% ymm1) (Imm8 (word 55)) *) + 0xc5; 0xdd; 0x73; 0xf4; 0x02; + (* VPSLLQ (%_% ymm4) (%_% ymm4) (Imm8 (word 2)) *) + 0xc5; 0xd5; 0xeb; 0xc9; (* VPOR (%_% ymm1) (%_% ymm5) (%_% ymm1) *) + 0xc5; 0xd5; 0x73; 0xd0; 0x19; + (* VPSRLQ (%_% ymm5) (%_% ymm0) (Imm8 (word 25)) *) + 0xc5; 0xcd; 0xeb; 0xe4; (* VPOR (%_% ymm4) (%_% ymm6) (%_% ymm4) *) + 0xc5; 0xfd; 0x73; 0xf0; 0x27; + (* VPSLLQ (%_% ymm0) (%_% ymm0) (Imm8 (word 39)) *) + 0xc5; 0xd5; 0xeb; 0xe8; (* VPOR (%_% ymm5) (%_% ymm5) (%_% ymm0) *) + 0xc5; 0xf5; 0xdf; 0xc5; (* VPANDN (%_% ymm0) (%_% ymm1) (%_% ymm5) *) + 0xc5; 0xfd; 0xef; 0xc2; (* VPXOR (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfe; 0x7f; 0x84; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,448))) (%_% ymm0) *) + 0xc5; 0xfd; 0x73; 0xd3; 0x17; + (* VPSRLQ (%_% ymm0) (%_% ymm3) (Imm8 (word 23)) *) + 0xc5; 0xe5; 0x73; 0xf3; 0x29; + (* VPSLLQ (%_% ymm3) (%_% ymm3) (Imm8 (word 41)) *) + 0xc5; 0xfd; 0xeb; 0xc3; (* VPOR (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xfd; 0xdf; 0xfc; (* VPANDN (%_% ymm7) (%_% ymm0) (%_% ymm4) *) + 0xc5; 0xd5; 0xdf; 0xd8; (* VPANDN (%_% ymm3) (%_% ymm5) (%_% ymm0) *) + 0xc5; 0xc5; 0xef; 0xfd; (* VPXOR (%_% ymm7) (%_% ymm7) (%_% ymm5) *) + 0xc5; 0xdd; 0xdf; 0xea; (* VPANDN (%_% ymm5) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0xed; 0xdf; 0xd1; (* VPANDN (%_% ymm2) (%_% ymm2) (%_% ymm1) *) + 0xc5; 0xd5; 0xef; 0xe8; (* VPXOR (%_% ymm5) (%_% ymm5) (%_% ymm0) *) + 0xc5; 0xe5; 0xef; 0xd9; (* VPXOR (%_% ymm3) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xef; 0xd4; (* VPXOR (%_% ymm2) (%_% ymm2) (%_% ymm4) *) + 0xc5; 0xfe; 0x7f; 0xac; 0x24; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,480))) (%_% ymm5) *) + 0x48; 0x83; 0xc6; 0x08; (* ADD (% rsi) (Imm8 (word 8)) *) + 0x49; 0x83; 0xc2; 0x01; (* ADD (% r10) (Imm8 (word 1)) *) + 0x49; 0x83; 0xfa; 0x18; (* CMP (% r10) (Imm8 (word 24)) *) + 0x0f; 0x85; 0xfe; 0xfa; 0xff; 0xff; + (* JNE (Imm32 (word 
4294966014)) *) + 0xc5; 0xfe; 0x6f; 0x24; 0x24; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,0))) *) + 0xc5; 0xfe; 0x6f; 0x6c; 0x24; 0x40; + (* VMOVDQU (%_% ymm5) (Memop Word256 (%% (rsp,64))) *) + 0xc5; 0xfe; 0x6f; 0x44; 0x24; 0x20; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rsp,32))) *) + 0xc5; 0xfe; 0x6f; 0x4c; 0x24; 0x60; + (* VMOVDQU (%_% ymm1) (Memop Word256 (%% (rsp,96))) *) + 0xc5; 0x7e; 0x6f; 0xa4; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm12) (Memop Word256 (%% (rsp,448))) *) + 0xc5; 0xfe; 0x7f; 0x94; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rsp,448))) (%_% ymm2) *) + 0xc5; 0xdd; 0x6c; 0xd0; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xdd; 0x6d; 0xc0; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xd5; 0x6c; 0xe1; (* VPUNPCKLQDQ (%_% ymm4) (%_% ymm5) (%_% ymm1) *) + 0xc5; 0xd5; 0x6d; 0xc9; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm5) (%_% ymm1) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xf4; 0x20; + (* VPERM2I128 (%_% ymm6) (%_% ymm2) (%_% ymm4) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xd4; 0x31; + (* VPERM2I128 (%_% ymm2) (%_% ymm2) (%_% ymm4) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,128))) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xe9; 0x20; + (* VPERM2I128 (%_% ymm5) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xc1; 0x31; + (* VPERM2I128 (%_% ymm0) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0x37; (* VMOVDQU (Memop Word256 (%% (rdi,0))) (%_% ymm6) *) + 0xc5; 0xfe; 0x7f; 0xaf; 0xc8; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,200))) (%_% ymm5) *) + 0xc5; 0xfe; 0x7f; 0x97; 0x90; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,400))) (%_% ymm2) *) + 0xc5; 0xfe; 0x7f; 0x87; 0x58; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,600))) (%_% ymm0) *) + 0xc5; 0xfe; 0x6f; 0x84; 0x24; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rsp,160))) *) + 0xc5; 0xdd; 
0x6c; 0xd0; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xdd; 0x6d; 0xc8; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfe; 0x6f; 0x84; 0x24; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rsp,192))) *) + 0xc4; 0xc1; 0x7d; 0x6c; 0xe2; + (* VPUNPCKLQDQ (%_% ymm4) (%_% ymm0) (%_% ymm10) *) + 0xc4; 0xc1; 0x7d; 0x6d; 0xc2; + (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm10) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xf4; 0x20; + (* VPERM2I128 (%_% ymm6) (%_% ymm2) (%_% ymm4) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x75; 0x46; 0xe8; 0x20; + (* VPERM2I128 (%_% ymm5) (%_% ymm1) (%_% ymm0) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xd4; 0x31; + (* VPERM2I128 (%_% ymm2) (%_% ymm2) (%_% ymm4) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,224))) *) + 0xc4; 0xe3; 0x75; 0x46; 0xc8; 0x31; + (* VPERM2I128 (%_% ymm1) (%_% ymm1) (%_% ymm0) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0x84; 0x24; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rsp,256))) *) + 0xc5; 0xfe; 0x7f; 0x97; 0xb0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,432))) (%_% ymm2) *) + 0xc5; 0xfe; 0x7f; 0x8f; 0x78; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,632))) (%_% ymm1) *) + 0xc5; 0x8d; 0x6c; 0xd4; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm14) (%_% ymm4) *) + 0xc5; 0x8d; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm14) (%_% ymm4) *) + 0xc4; 0xc1; 0x7d; 0x6c; 0xe0; + (* VPUNPCKLQDQ (%_% ymm4) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0xc1; 0x7d; 0x6d; 0xc0; + (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm8) *) + 0xc5; 0xfe; 0x7f; 0x77; 0x20; + (* VMOVDQU (Memop Word256 (%% (rdi,32))) (%_% ymm6) *) + 0xc5; 0xfe; 0x7f; 0xaf; 0xe8; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,232))) (%_% ymm5) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xf4; 0x20; + (* VPERM2I128 (%_% ymm6) (%_% ymm2) (%_% ymm4) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x75; 0x46; 0xe8; 0x20; + (* VPERM2I128 (%_% ymm5) (%_% 
ymm1) (%_% ymm0) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xd4; 0x31; + (* VPERM2I128 (%_% ymm2) (%_% ymm2) (%_% ymm4) (Imm8 (word 49)) *) + 0xc4; 0xe3; 0x75; 0x46; 0xc8; 0x31; + (* VPERM2I128 (%_% ymm1) (%_% ymm1) (%_% ymm0) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,288))) *) + 0xc5; 0xfe; 0x6f; 0x84; 0x24; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rsp,320))) *) + 0xc5; 0xfe; 0x7f; 0x97; 0xd0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,464))) (%_% ymm2) *) + 0xc5; 0xfe; 0x7f; 0x8f; 0x98; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,664))) (%_% ymm1) *) + 0xc5; 0x85; 0x6c; 0xd4; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm15) (%_% ymm4) *) + 0xc5; 0x85; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm15) (%_% ymm4) *) + 0xc4; 0xc1; 0x7d; 0x6c; 0xe1; + (* VPUNPCKLQDQ (%_% ymm4) (%_% ymm0) (%_% ymm9) *) + 0xc5; 0xfe; 0x7f; 0xaf; 0x08; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,264))) (%_% ymm5) *) + 0xc4; 0xc1; 0x7d; 0x6d; 0xc1; + (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm0) (%_% ymm9) *) + 0xc5; 0xfe; 0x7f; 0x77; 0x40; + (* VMOVDQU (Memop Word256 (%% (rdi,64))) (%_% ymm6) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xf4; 0x20; + (* VPERM2I128 (%_% ymm6) (%_% ymm2) (%_% ymm4) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xd4; 0x31; + (* VPERM2I128 (%_% ymm2) (%_% ymm2) (%_% ymm4) (Imm8 (word 49)) *) + 0xc4; 0xe3; 0x75; 0x46; 0xe8; 0x20; + (* VPERM2I128 (%_% ymm5) (%_% ymm1) (%_% ymm0) (Imm8 (word 32)) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,352))) *) + 0xc4; 0xe3; 0x75; 0x46; 0xc8; 0x31; + (* VPERM2I128 (%_% ymm1) (%_% ymm1) (%_% ymm0) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0x84; 0x24; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm0) (Memop Word256 (%% (rsp,384))) *) + 0xc5; 0xfe; 0x7f; 0xaf; 0x28; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,296))) (%_% ymm5) *) + 0xc5; 0xfe; 
0x6f; 0xac; 0x24; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm5) (Memop Word256 (%% (rsp,416))) *) + 0xc5; 0xfe; 0x7f; 0x97; 0xf0; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,496))) (%_% ymm2) *) + 0xc5; 0xdd; 0x6c; 0xd0; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xdd; 0x6d; 0xc0; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0x95; 0x6c; 0xe5; (* VPUNPCKLQDQ (%_% ymm4) (%_% ymm13) (%_% ymm5) *) + 0xc5; 0xfe; 0x7f; 0x77; 0x60; + (* VMOVDQU (Memop Word256 (%% (rdi,96))) (%_% ymm6) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xf4; 0x20; + (* VPERM2I128 (%_% ymm6) (%_% ymm2) (%_% ymm4) (Imm8 (word 32)) *) + 0xc5; 0xfe; 0x7f; 0x8f; 0xb8; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,696))) (%_% ymm1) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xd4; 0x31; + (* VPERM2I128 (%_% ymm2) (%_% ymm2) (%_% ymm4) (Imm8 (word 49)) *) + 0xc5; 0x95; 0x6d; 0xcd; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm13) (%_% ymm5) *) + 0xc5; 0xfe; 0x7f; 0xb7; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,128))) (%_% ymm6) *) + 0xc5; 0xfe; 0x6f; 0xa4; 0x24; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm4) (Memop Word256 (%% (rsp,480))) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xe9; 0x20; + (* VPERM2I128 (%_% ymm5) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xc1; 0x31; + (* VPERM2I128 (%_% ymm0) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0x97; 0x10; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,528))) (%_% ymm2) *) + 0xc5; 0x9d; 0x6c; 0xd3; (* VPUNPCKLQDQ (%_% ymm2) (%_% ymm12) (%_% ymm3) *) + 0xc5; 0xfe; 0x7f; 0x87; 0xd8; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,728))) (%_% ymm0) *) + 0xc5; 0x9d; 0x6d; 0xc3; (* VPUNPCKHQDQ (%_% ymm0) (%_% ymm12) (%_% ymm3) *) + 0xc5; 0xc5; 0x6c; 0xdc; (* VPUNPCKLQDQ (%_% ymm3) (%_% ymm7) (%_% ymm4) *) + 0xc5; 0xc5; 0x6d; 0xcc; (* VPUNPCKHQDQ (%_% ymm1) (%_% ymm7) (%_% ymm4) *) + 0xc5; 0xfe; 0x7f; 0xaf; 0x48; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,328))) (%_% ymm5) *) 
+ 0xc4; 0xe3; 0x6d; 0x46; 0xeb; 0x20; + (* VPERM2I128 (%_% ymm5) (%_% ymm2) (%_% ymm3) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x6d; 0x46; 0xd3; 0x31; + (* VPERM2I128 (%_% ymm2) (%_% ymm2) (%_% ymm3) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x6f; 0x9c; 0x24; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% ymm3) (Memop Word256 (%% (rsp,448))) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xe1; 0x20; + (* VPERM2I128 (%_% ymm4) (%_% ymm0) (%_% ymm1) (Imm8 (word 32)) *) + 0xc4; 0xe3; 0x7d; 0x46; 0xc1; 0x31; + (* VPERM2I128 (%_% ymm0) (%_% ymm0) (%_% ymm1) (Imm8 (word 49)) *) + 0xc5; 0xfe; 0x7f; 0xaf; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,160))) (%_% ymm5) *) + 0xc4; 0xc3; 0x7d; 0x39; 0xdf; 0x01; + (* VEXTRACTI128 (%_% xmm15) (%_% ymm3) (Imm8 (word 1)) *) + 0xc5; 0xfe; 0x7f; 0xa7; 0x68; 0x01; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,360))) (%_% ymm4) *) + 0xc5; 0xfe; 0x7f; 0x97; 0x30; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,560))) (%_% ymm2) *) + 0xc5; 0xfe; 0x7f; 0x87; 0xf8; 0x02; 0x00; 0x00; + (* VMOVDQU (Memop Word256 (%% (rdi,760))) (%_% ymm0) *) + 0xc5; 0xf9; 0xd6; 0x9f; 0xc0; 0x00; 0x00; 0x00; + (* VMOVQ (Memop Quadword (%% (rdi,192))) (%_% xmm3) *) + 0xc5; 0xf9; 0x17; 0x9f; 0x88; 0x01; 0x00; 0x00; + (* VMOVHPD (Memop Quadword (%% (rdi,392))) (%_% xmm3) *) + 0xc5; 0x79; 0xd6; 0xbf; 0x50; 0x02; 0x00; 0x00; + (* VMOVQ (Memop Quadword (%% (rdi,592))) (%_% xmm15) *) + 0xc5; 0x79; 0x17; 0xbf; 0x18; 0x03; 0x00; 0x00; + (* VMOVHPD (Memop Quadword (%% (rdi,792))) (%_% xmm15) *) + 0x48; 0x81; 0xc4; 0x00; 0x03; 0x00; 0x00; + (* ADD (% rsp) (Imm32 (word 768)) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + +let keccak_f1600_x4_avx2_tmc = define_trimmed "keccak_f1600_x4_avx2_tmc" keccak_f1600_x4_avx2_mc;; + +let KECCAK_F1600_X4_AVX2_EXEC = X86_MK_CORE_EXEC_RULE keccak_f1600_x4_avx2_tmc;; +let keccak_f1600_x4_avx2_TMC_EXEC = KECCAK_F1600_X4_AVX2_EXEC;; + +let LENGTH_KECCAK_F1600_X4_AVX2_TMC = + REWRITE_CONV[keccak_f1600_x4_avx2_tmc] `LENGTH 
keccak_f1600_x4_avx2_tmc` + |> CONV_RULE(RAND_CONV LENGTH_CONV);; + +(* Preamble: SUB RSP, 768 (7 bytes) *) +let KECCAK_F1600_X4_AVX2_PREAMBLE_LENGTH = new_definition + `KECCAK_F1600_X4_AVX2_PREAMBLE_LENGTH = 7`;; + +(* Postamble: ADD RSP, 768 (7 bytes) + RET (1 byte) *) +let KECCAK_F1600_X4_AVX2_POSTAMBLE_LENGTH = new_definition + `KECCAK_F1600_X4_AVX2_POSTAMBLE_LENGTH = 8`;; + +let KECCAK_F1600_X4_AVX2_CORE_END = new_definition + `KECCAK_F1600_X4_AVX2_CORE_END = LENGTH keccak_f1600_x4_avx2_tmc - KECCAK_F1600_X4_AVX2_POSTAMBLE_LENGTH`;; + +let LENGTH_SIMPLIFY_CONV = + REWRITE_CONV[LENGTH_KECCAK_F1600_X4_AVX2_TMC; + KECCAK_F1600_X4_AVX2_CORE_END; + KECCAK_F1600_X4_AVX2_PREAMBLE_LENGTH; + KECCAK_F1600_X4_AVX2_POSTAMBLE_LENGTH] THENC + NUM_REDUCE_CONV THENC REWRITE_CONV [ADD_0];; + +let KECCAK_F1600_X4_AVX2_CORRECT = prove + (`!rc_pointer:int64 bitstate_in:int64 rho8_ptr:int64 rho56_ptr:int64 A1 A2 A3 A4 pc:num stackpointer:int64. + PAIRWISE nonoverlapping + [(word pc, LENGTH keccak_f1600_x4_avx2_tmc); + (stackpointer, 0x300); + (bitstate_in, 800); + (rc_pointer, 192); + (rho8_ptr, 32); + (rho56_ptr, 32)] + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST keccak_f1600_x4_avx2_tmc) /\ + read RIP s = word (pc + KECCAK_F1600_X4_AVX2_PREAMBLE_LENGTH) /\ + read RSP s = stackpointer /\ + read RDI s = bitstate_in /\ + C_ARGUMENTS [bitstate_in; rc_pointer; rho8_ptr; rho56_ptr] s /\ + wordlist_from_memory(rc_pointer,24) s = round_constants /\ + wordlist_from_memory(rho8_ptr,4) s = rho8_constant /\ + wordlist_from_memory(rho56_ptr,4) s = rho56_constant /\ + wordlist_from_memory(bitstate_in,25) s = A1 /\ + wordlist_from_memory(word_add bitstate_in (word 200),25) s = A2 /\ + wordlist_from_memory(word_add bitstate_in (word 400),25) s = A3 /\ + wordlist_from_memory(word_add bitstate_in (word 600),25) s = A4) + (\s. 
read RIP s = word(pc + KECCAK_F1600_X4_AVX2_CORE_END) /\ + wordlist_from_memory(bitstate_in,25) s = keccak 24 A1 /\ + wordlist_from_memory(word_add bitstate_in (word 200),25) s = keccak 24 A2 /\ + wordlist_from_memory(word_add bitstate_in (word 400),25) s = keccak 24 A3 /\ + wordlist_from_memory(word_add bitstate_in (word 600),25) s = keccak 24 A4) + (MAYCHANGE [RIP; R10; RSI] ,, + MAYCHANGE[ZMM0; ZMM1; ZMM2; ZMM3; ZMM4; ZMM5; ZMM6; ZMM7; ZMM8; ZMM9; ZMM10; ZMM11; ZMM12; ZMM13; ZMM14; ZMM15] ,, + MAYCHANGE SOME_FLAGS ,, MAYCHANGE [events] ,, + MAYCHANGE [memory :> bytes (stackpointer, 0x300)],, + MAYCHANGE [memory :> bytes (bitstate_in, 800)])`, + CONV_TAC LENGTH_SIMPLIFY_CONV THEN + REWRITE_TAC[SOME_FLAGS] THEN + MAP_EVERY X_GEN_TAC [`rc_pointer:int64`;`bitstate_in:int64`;`rho8_ptr:int64`;`rho56_ptr:int64`;`A1:int64 list`;`A2:int64 list`;`A3:int64 list`;`A4:int64 list`] THEN + MAP_EVERY X_GEN_TAC [`pc:num`;`stackpointer:int64`] THEN + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + REWRITE_TAC[PAIRWISE; C_ARGUMENTS; ALL] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + + ASM_CASES_TAC `LENGTH(A1:int64 list) = 25 /\ + LENGTH(A2:int64 list) = 25 /\ + LENGTH(A3:int64 list) = 25 /\ + LENGTH(A4:int64 list) = 25` THENL + [ALL_TAC; + ENSURES_INIT_TAC "s0" THEN + MATCH_MP_TAC(TAUT `F ==> p`) THEN + REPEAT(FIRST_X_ASSUM(MP_TAC o AP_TERM `LENGTH:int64 list->num`)) THEN + CONV_TAC(ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV) THEN + REWRITE_TAC[LENGTH; ARITH] THEN ASM_MESON_TAC[]] THEN + + (*** Set up the loop invariant ***) + + ENSURES_WHILE_PAUP_TAC `0` `24` `pc + 0x268` `pc + 0x764` + `\i s. 
+ (read R10 s = word i /\ + read RDI s = bitstate_in /\ + read RSP s = stackpointer /\ + read RDX s = rho8_ptr /\ + read RCX s = rho56_ptr /\ + read RSI s = word_add rc_pointer (word (8 * i)) /\ + wordlist_from_memory(rho8_ptr,4) s = rho8_constant /\ + wordlist_from_memory(rho56_ptr,4) s = rho56_constant /\ + wordlist_from_memory(rc_pointer,24) s = round_constants /\ + (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND (APPEND + (wordlist_from_memory(word_add stackpointer (word 0x0),7) s) + (CONS (read YMM10 s) [])) + (CONS (read YMM14 s) [])) + (wordlist_from_memory(word_add stackpointer (word 0xe0),2) s)) + (CONS (read YMM8 s) [])) + (CONS (read YMM15 s) [])) + (wordlist_from_memory(word_add stackpointer (word 0x120),2) s)) + (CONS (read YMM9 s) [])) + (wordlist_from_memory(word_add stackpointer (word 0x160),2) s)) + (CONS (read YMM13 s) [])) + (wordlist_from_memory(word_add stackpointer (word 0x1a0),2) s)) + (CONS (read YMM3 s) [])) + (CONS (read YMM7 s) [])) + (wordlist_from_memory(word_add stackpointer (word 0x1e0),1) s)) + (CONS (read YMM2 s) [])) = + (MAP2 word_join ((MAP2 word_join (keccak (i) A4) (keccak (i) A3)):int128 list) + ((MAP2 word_join (keccak (i) A2) (keccak (i) A1)):int128 list)):int256 list) /\ + (read ZF s <=> i = 24)` THEN + REPEAT CONJ_TAC THENL + [ARITH_TAC; + + (*** Initial holding of the invariant ***) + + REWRITE_TAC[rho56_constant; rho8_constant; round_constants; GSYM CONJ_ASSOC] THEN + REWRITE_TAC[WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rc_pointer,24) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(bitstate_in,100) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho8_ptr,4) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho56_ptr,4) s:int64 list`] THEN + CONV_TAC(ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV) THEN + REWRITE_TAC[APPEND] THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "A_" `read (memory :> bytes 
(bitstate_in,8 * 100)) s0` THEN + ASM_REWRITE_TAC[] THEN REPEAT DISCH_TAC THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 0 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 200 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 400 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 600 25 THEN + MEMORY_256_FROM_64_TAC "rho8_ptr" 0 4 THEN + MEMORY_256_FROM_64_TAC "rho56_ptr" 0 4 THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + REPEAT STRIP_TAC THEN + X86_STEPS_TAC KECCAK_F1600_X4_AVX2_EXEC (1--96) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL + [PURE_ONCE_REWRITE_TAC[ARITH_RULE `8 * 0 = 0`] THEN + REWRITE_TAC[WORD_ADD_0]; + ASM_REWRITE_TAC [WORD_SUBWORD_JOIN_EXTRACT_64] THEN + ASM_REWRITE_TAC [WORD_SUBWORD_JOIN_EXTRACT_128] THEN + EXPAND_TAC "A1" THEN + EXPAND_TAC "A2" THEN + EXPAND_TAC "A3" THEN + EXPAND_TAC "A4" THEN + REWRITE_TAC[keccak] THEN + REWRITE_TAC[MAP2] THEN + REWRITE_TAC[CONS_11] THEN + CONV_TAC WORD_BLAST]; + + (*** Preservation of the invariant including end condition code ***) + + X_GEN_TAC `i:num` THEN STRIP_TAC THEN VAL_INT64_TAC `i:num` THEN + CONV_TAC(ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV) THEN + + REWRITE_TAC[rho56_constant; rho8_constant; round_constants; GSYM CONJ_ASSOC] THEN + REWRITE_TAC[WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rc_pointer,24) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(bitstate_in,100) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho8_ptr,1) s:int256 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho56_ptr,1) s:int256 list`] THEN + REWRITE_TAC[APPEND] THEN + MP_TAC(ISPECL [`A1:int64 list`; `i:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A2:int64 list`; `i:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A3:int64 list`; `i:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A4:int64 list`; `i:num`] LENGTH_KECCAK) THEN + ASM_REWRITE_TAC[IMP_IMP] THEN REWRITE_TAC[LENGTH_EQ_25] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN SUBST1_TAC) THEN + REWRITE_TAC[MAP2; CONS_11; 
GSYM CONJ_ASSOC] THEN + ENSURES_INIT_TAC "s0" THEN + + SUBGOAL_THEN + `read (memory :> bytes64(word_add rc_pointer (word(8 * i)))) s0 = + EL i round_constants` + ASSUME_TAC THENL + [UNDISCH_TAC `i < 24` THEN SPEC_TAC(`i:num`,`i:num`) THEN + CONV_TAC EXPAND_CASES_CONV THEN + CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN + ASM_REWRITE_TAC[round_constants; WORD_ADD_0] THEN + CONV_TAC(ONCE_DEPTH_CONV EL_CONV) THEN REWRITE_TAC[]; + ALL_TAC] THEN + + ASM_REWRITE_TAC [WORD_SUBWORD_JOIN_EXTRACT_64] THEN + ASM_REWRITE_TAC [WORD_SUBWORD_JOIN_EXTRACT_128] THEN + BIGNUM_DIGITIZE_TAC "A_" `read (memory :> bytes (bitstate_in,8 * 100)) s0` THEN + ASM_REWRITE_TAC[] THEN REPEAT DISCH_TAC THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 0 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 200 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 400 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 600 25 THEN + MEMORY_256_FROM_64_TAC "rho8_ptr" 0 4 THEN + MEMORY_256_FROM_64_TAC "rho56_ptr" 0 4 THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN REPEAT STRIP_TAC THEN + X86_STEPS_TAC KECCAK_F1600_X4_AVX2_EXEC (1--223) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL + [REWRITE_TAC[WORD_ADD]; + CONV_TAC WORD_BLAST; + REPEAT(CONJ_TAC THENL [CONV_TAC WORD_RULE]) THEN + REWRITE_TAC[keccak; keccak_round] THEN + CONV_TAC(ONCE_DEPTH_CONV EL_CONV) THEN + CONV_TAC(TOP_DEPTH_CONV let_CONV) THEN + REWRITE_TAC[MAP2; CONS_11] THEN + CONV_TAC(ONCE_DEPTH_CONV EL_CONV) THEN + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THEN BITBLAST_TAC; + + REWRITE_TAC [WORD_BLAST `word_add x (word 18446744073709551593):int64 = + word_sub x (word 23)`] THEN + REWRITE_TAC[VAL_WORD_SUB_EQ_0] THEN + REWRITE_TAC[VAL_WORD;DIMINDEX_64] THEN + IMP_REWRITE_TAC[MOD_LT; ARITH_RULE `23 < 2 EXP 64`] THEN + CONJ_TAC THENL + [UNDISCH_TAC `i < 24` THEN ARITH_TAC; + ARITH_TAC]]; + + (*** The trivial loop-back goal ***) + + X_GEN_TAC `i:num` THEN STRIP_TAC THEN VAL_INT64_TAC `i:num` THEN + CONV_TAC(ONCE_DEPTH_CONV 
WORDLIST_FROM_MEMORY_CONV) THEN + + REWRITE_TAC[rho56_constant; rho8_constant; round_constants; GSYM CONJ_ASSOC] THEN + REWRITE_TAC[WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rc_pointer,24) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(bitstate_in,100) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho8_ptr,1) s:int256 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho56_ptr,1) s:int256 list`] THEN + REWRITE_TAC[APPEND] THEN + MP_TAC(ISPECL [`A1:int64 list`; `i:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A2:int64 list`; `i:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A3:int64 list`; `i:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A4:int64 list`; `i:num`] LENGTH_KECCAK) THEN + ASM_REWRITE_TAC[IMP_IMP] THEN + REWRITE_TAC[LENGTH_EQ_25] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN SUBST1_TAC) THEN + REWRITE_TAC[MAP2; CONS_11; GSYM CONJ_ASSOC] THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "A_" `read (memory :> bytes (bitstate_in,8 * 100)) s0` THEN + ASM_REWRITE_TAC[] THEN REPEAT DISCH_TAC THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 0 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 200 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 400 25 THEN + MEMORY_256_FROM_64_TAC "bitstate_in" 600 25 THEN + MEMORY_256_FROM_64_TAC "rho8_ptr" 0 4 THEN + MEMORY_256_FROM_64_TAC "rho56_ptr" 0 4 THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN REPEAT STRIP_TAC THEN + X86_STEPS_TAC KECCAK_F1600_X4_AVX2_EXEC (1--1) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[]; + + (*** The tail of logical not operation and writeback ***) + + REWRITE_TAC[round_constants; CONS_11; GSYM CONJ_ASSOC] THEN + REWRITE_TAC[WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rc_pointer,24) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(bitstate_in,100) s:int64 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho8_ptr,1) s:int256 list`; + WORDLIST_FROM_MEMORY_CONV `wordlist_from_memory(rho56_ptr,1) s:int256 list`] THEN + 
CONV_TAC(ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV) THEN + REWRITE_TAC[APPEND] THEN + MP_TAC(ISPECL [`A1:int64 list`; `24:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A2:int64 list`; `24:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A3:int64 list`; `24:num`] LENGTH_KECCAK) THEN + MP_TAC(ISPECL [`A4:int64 list`; `24:num`] LENGTH_KECCAK) THEN + ASM_REWRITE_TAC[IMP_IMP] THEN REWRITE_TAC[LENGTH_EQ_25] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN SUBST1_TAC) THEN + REWRITE_TAC[MAP2; CONS_11; GSYM CONJ_ASSOC] THEN + CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC [keccak; keccak_round] THEN + ENSURES_INIT_TAC "s0" THEN + X86_STEPS_TAC KECCAK_F1600_X4_AVX2_EXEC (1--96) THEN + REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o + CONV_RULE(READ_MEMORY_SPLIT_CONV 2) o + check (can (term_match [] `read qqq s:int256 = xxx`) o concl))) THEN + CONV_TAC(ONCE_DEPTH_CONV NORMALIZE_RELATIVE_ADDRESS_CONV) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THEN + BITBLAST_TAC]);; + + +let KECCAK_F1600_X4_AVX2_FULL_EXEC = X86_MK_EXEC_RULE keccak_f1600_x4_avx2_tmc;; + +let KECCAK_F1600_X4_AVX2_NOIBT_SUBROUTINE_CORRECT = prove + (`!rc_pointer:int64 bitstate_in:int64 rho8_ptr:int64 rho56_ptr:int64 A1 A2 A3 A4 pc:num stackpointer:int64 returnaddress. + PAIRWISE nonoverlapping + [(word pc, LENGTH keccak_f1600_x4_avx2_tmc); + (word_sub stackpointer (word 0x300), 0x300); + (bitstate_in, 800); + (rc_pointer, 192); + (rho8_ptr, 32); + (rho56_ptr, 32); + (stackpointer, 8)] + ==> ensures x86 + (\s. 
bytes_loaded s (word pc) keccak_f1600_x4_avx2_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [bitstate_in; rc_pointer; rho8_ptr; rho56_ptr] s /\ + wordlist_from_memory(rc_pointer, 24) s = round_constants /\ + wordlist_from_memory(rho8_ptr, 4) s = rho8_constant /\ + wordlist_from_memory(rho56_ptr, 4) s = rho56_constant /\ + wordlist_from_memory(bitstate_in, 25) s = A1 /\ + wordlist_from_memory(word_add bitstate_in (word 200), 25) s = A2 /\ + wordlist_from_memory(word_add bitstate_in (word 400), 25) s = A3 /\ + wordlist_from_memory(word_add bitstate_in (word 600), 25) s = A4) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + wordlist_from_memory(bitstate_in, 25) s = keccak 24 A1 /\ + wordlist_from_memory(word_add bitstate_in (word 200), 25) s = keccak 24 A2 /\ + wordlist_from_memory(word_add bitstate_in (word 400), 25) s = keccak 24 A3 /\ + wordlist_from_memory(word_add bitstate_in (word 600), 25) s = keccak 24 A4) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes (bitstate_in, 800); + memory :> bytes(word_sub stackpointer (word 0x300), 0x300)])`, + let TWEAK_CONV = ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV in + let EXPAND_PAIRWISE_CONV = REWRITE_CONV[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] in + let EXPAND_PAIRWISE = REWRITE_RULE[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] in + CONV_TAC(ONCE_DEPTH_CONV EXPAND_PAIRWISE_CONV) THEN + CONV_TAC TWEAK_CONV THEN + X86_PROMOTE_RETURN_STACK_TAC keccak_f1600_x4_avx2_tmc + (CONV_RULE TWEAK_CONV + (EXPAND_PAIRWISE + (CONV_RULE LENGTH_SIMPLIFY_CONV KECCAK_F1600_X4_AVX2_CORRECT))) + `[]` 768);; + +(* NOTE: This must be kept in sync with the CBMC specification + * in dev/fips202/x86_64/src/fips202_native_x86_64.h *) + +let KECCAK_F1600_X4_AVX2_SUBROUTINE_CORRECT = prove + (`!rc_pointer:int64 bitstate_in:int64 rho8_ptr:int64 rho56_ptr:int64 A1 A2 A3 A4 pc:num 
stackpointer:int64 returnaddress. + PAIRWISE nonoverlapping + [(word pc, LENGTH keccak_f1600_x4_avx2_mc); + (word_sub stackpointer (word 0x300), 0x300); + (bitstate_in, 800); + (rc_pointer, 192); + (rho8_ptr, 32); + (rho56_ptr, 32); + (stackpointer, 8)] + ==> ensures x86 + (\s. bytes_loaded s (word pc) keccak_f1600_x4_avx2_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [bitstate_in; rc_pointer; rho8_ptr; rho56_ptr] s /\ + wordlist_from_memory(rc_pointer, 24) s = round_constants /\ + wordlist_from_memory(rho8_ptr, 4) s = rho8_constant /\ + wordlist_from_memory(rho56_ptr, 4) s = rho56_constant /\ + wordlist_from_memory(bitstate_in, 25) s = A1 /\ + wordlist_from_memory(word_add bitstate_in (word 200), 25) s = A2 /\ + wordlist_from_memory(word_add bitstate_in (word 400), 25) s = A3 /\ + wordlist_from_memory(word_add bitstate_in (word 600), 25) s = A4) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + wordlist_from_memory(bitstate_in, 25) s = keccak 24 A1 /\ + wordlist_from_memory(word_add bitstate_in (word 200), 25) s = keccak 24 A2 /\ + wordlist_from_memory(word_add bitstate_in (word 400), 25) s = keccak 24 A3 /\ + wordlist_from_memory(word_add bitstate_in (word 600), 25) s = keccak 24 A4) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes (bitstate_in, 800); + memory :> bytes(word_sub stackpointer (word 0x300), 0x300)])`, + let TWEAK_CONV = ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV in + let EXPAND_PAIRWISE_CONV = REWRITE_CONV[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] in + CONV_TAC(ONCE_DEPTH_CONV EXPAND_PAIRWISE_CONV) THEN + CONV_TAC TWEAK_CONV THEN + MATCH_ACCEPT_TAC(ADD_IBT_RULE + (CONV_RULE TWEAK_CONV + (REWRITE_RULE[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] + KECCAK_F1600_X4_AVX2_NOIBT_SUBROUTINE_CORRECT))));; + +(* ========================================================================= *) +(* 
Constant-time and memory safety proof. *) +(* ========================================================================= *) + +needs "x86/proofs/consttime.ml";; +needs "x86_64/proofs/subroutine_signatures.ml";; +needs "common/consttime_utils.ml";; + + +let full_spec,public_vars = mk_safety_spec + ~keep_maychanges:true + (assoc "keccak_f1600_x4_avx2" subroutine_signatures) + (CONV_RULE LENGTH_SIMPLIFY_CONV KECCAK_F1600_X4_AVX2_CORRECT) + KECCAK_F1600_X4_AVX2_EXEC;; + +(* Remove duplicates from memaccess_inbounds lists (s2n-bignum#350) *) +let full_spec = ONCE_DEPTH_CONV MEMACCESS_INBOUNDS_DEDUP_CONV full_spec + |> concl |> rhs;; + +let KECCAK_F1600_X4_AVX2_SAFE = time prove + (`exists f_events. + forall e rc_pointer bitstate_in rho8_ptr rho56_ptr pc stackpointer. + PAIRWISE nonoverlapping + [word pc, LENGTH keccak_f1600_x4_avx2_tmc; + stackpointer, 768; bitstate_in, 800; rc_pointer, 192; + rho8_ptr, 32; rho56_ptr, 32] + ==> ensures x86 + (\s. + bytes_loaded s (word pc) + (BUTLAST keccak_f1600_x4_avx2_tmc) /\ + read RIP s = word (pc + KECCAK_F1600_X4_AVX2_PREAMBLE_LENGTH) /\ + read RSP s = stackpointer /\ + C_ARGUMENTS [bitstate_in; rc_pointer; rho8_ptr; rho56_ptr] s /\ + read events s = e) + (\s. + read RIP s = word (pc + KECCAK_F1600_X4_AVX2_CORE_END) /\ + (exists e2. 
+ read events s = APPEND e2 e /\ + e2 = f_events rc_pointer rho8_ptr rho56_ptr bitstate_in pc stackpointer /\ + memaccess_inbounds e2 + [bitstate_in,800; rc_pointer,192; rho8_ptr,32; rho56_ptr,32; + stackpointer,768] + [bitstate_in,800; stackpointer,768])) + (MAYCHANGE [RIP; R10; RSI] ,, + MAYCHANGE + [ZMM0; ZMM1; ZMM2; ZMM3; ZMM4; ZMM5; ZMM6; ZMM7; ZMM8; + ZMM9; ZMM10; ZMM11; ZMM12; ZMM13; ZMM14; ZMM15] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [events] ,, + MAYCHANGE [memory :> bytes (stackpointer,768)] ,, + MAYCHANGE [memory :> bytes (bitstate_in,800)])`, + CONV_TAC(ONCE_DEPTH_CONV LENGTH_SIMPLIFY_CONV) THEN + ASSERT_CONCL_TAC full_spec THEN + REWRITE_TAC[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] THEN + PROVE_SAFETY_SPEC_TAC ~public_vars:public_vars KECCAK_F1600_X4_AVX2_EXEC);; + +(* ========================================================================= *) +(* Workaround for s2n-bignum's GEN_X86_ADD_RETURN_STACK_TAC and *) +(* DISCHARGE_SAFETY_PROPERTY_TAC not supporting empty callee-saved register *) +(* lists with non-zero stack offsets. WORD_FORALL_OFFSET_TAC uses a *) +(* polymorphic type that clashes with META_EXISTS_TAC, and *) +(* SAFE_UNIFY_REFL_TAC rejects `stackpointer` when f_events takes *) +(* `word_add stackpointer (word N)` instead. *) +(* TODO: remove once fixed upstream in s2n-bignum. *) +(* ========================================================================= *) +let WORD_FORALL_OFFSET_64_TAC = + let lemma = prove + (`!(P:int64->bool) a. (!x. P(word_add x (word a))) ==> (!x. 
P x)`, + MESON_TAC[WORD_RULE `word_add (word_sub x a) (a:int64) = x`]) in + fun n -> MATCH_MP_TAC lemma THEN EXISTS_TAC (mk_small_numeral n) THEN + CONV_TAC(ONCE_DEPTH_CONV NORMALIZE_ADD_SUBTRACT_WORD_CONV);; + +let DISCHARGE_SAFETY_PROPERTY_STACKOFFSET_TAC stack_offset = + REWRITE_TAC[APPEND] THEN + ABBREV_TAC (mk_eq(mk_var("rsp_orig",`:int64`), + mk_comb(mk_comb(`word_add:int64->int64->int64`,`stackpointer:int64`), + mk_comb(`word:num->int64`,mk_small_numeral stack_offset)))) THEN + SUBGOAL_THEN + (mk_eq(`stackpointer:int64`, + mk_comb(mk_comb(`word_sub:int64->int64->int64`,mk_var("rsp_orig",`:int64`)), + mk_comb(`word:num->int64`,mk_small_numeral stack_offset)))) + SUBST_ALL_TAC THENL + [EXPAND_TAC "rsp_orig" THEN CONV_TAC WORD_RULE; ALL_TAC] THEN + DISCHARGE_SAFETY_PROPERTY_TAC;; + +let X86_PROMOTE_RETURN_STACK_SAFE_TAC execname coreth reglist stack_offset = + let n0 = length(dest_list(parse_term reglist)) in + let n = n0 + (if stack_offset > 0 then 1 else 0) in + let m = (if stack_offset > 0 then 1 else 0) + n0 + 1 in + let execth = X86_MK_EXEC_RULE execname in + let coreth = X86_CORE_PROMOTE coreth in + ASSUME_CALLEE_SAFETY_TAC coreth "" THEN + META_EXISTS_TAC THEN + check_forallvars_tac THEN + FIRST_X_ASSUM (fun th -> MP_TAC (ONCE_REWRITE_RULE[append_lemma]th)) THEN + REPEAT(CONV_TAC (LAND_CONV (ONCE_REWRITE_CONV[swap_forall])) THEN + MATCH_MP_TAC mono3lemma THEN GEN_TAC) THEN + CONV_TAC (LAND_CONV (ONCE_REWRITE_CONV[swap_forall])) THEN + REWRITE_TAC[fst execth] THEN + REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + WINDOWS_MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + (if stack_offset > 0 then + DISCH_THEN(fun th -> WORD_FORALL_OFFSET_64_TAC stack_offset THEN MP_TAC th) THEN + MATCH_MP_TAC MONO_FORALL THEN GEN_TAC + else + ALL_TAC) THEN + REWRITE_TAC[NONOVERLAPPING_CLAUSES; ALLPAIRS; ALL] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[WINDOWS_C_ARGUMENTS; WINDOWS_C_RETURN] THEN + DISCH_THEN(fun th -> + REPEAT GEN_TAC 
THEN + TRY(DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC)) THEN + MP_TAC th) THEN + ASM_REWRITE_TAC[] THEN + ONCE_REWRITE_TAC[GSYM LEFT_EXISTS_IMP_THM] THEN + META_EXISTS_TAC THEN + DISCH_THEN(fun th -> + ENSURES_INIT_TAC "s0" THEN + X86_STEPS_TAC execth (1--n) THEN + MP_TAC th) THEN + X86_BIGSTEP_TAC execth ("s" ^ string_of_int (n + 1)) THEN + TRY(GEN_REWRITE_TAC LAND_CONV [GSYM(CONJUNCT1 APPEND)] THEN + BINOP_TAC THENL [UNIFY_REFL_TAC; REFL_TAC] THEN NO_TAC) THEN + REWRITE_TAC(!simulation_precanon_thms) THEN + X86_STEPS_TAC execth ((n+2)--(n+1+m)) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[];; + +let KECCAK_F1600_X4_AVX2_NOIBT_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e rc_pointer bitstate_in rho8_ptr rho56_ptr pc stackpointer returnaddress. + PAIRWISE nonoverlapping + [(word pc, LENGTH keccak_f1600_x4_avx2_tmc); + (word_sub stackpointer (word 0x300), 0x300); + (bitstate_in, 800); + (rc_pointer, 192); + (rho8_ptr, 32); + (rho56_ptr, 32); + (stackpointer, 8)] + ==> ensures x86 + (\s. + bytes_loaded s (word pc) keccak_f1600_x4_avx2_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [bitstate_in; rc_pointer; rho8_ptr; rho56_ptr] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events rc_pointer rho8_ptr rho56_ptr bitstate_in pc stackpointer + returnaddress /\ + memaccess_inbounds e2 + [bitstate_in,800; rc_pointer,192; rho8_ptr,32; rho56_ptr,32; + word_sub stackpointer (word 768),768; + stackpointer,8] + [bitstate_in,800; + word_sub stackpointer (word 768),768; + stackpointer,8])) + (\s s'. 
true)`, + let EXPAND_PAIRWISE_CONV = REWRITE_CONV[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] in + let EXPAND_PAIRWISE = REWRITE_RULE[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] in + CONV_TAC(ONCE_DEPTH_CONV EXPAND_PAIRWISE_CONV) THEN + X86_PROMOTE_RETURN_STACK_SAFE_TAC keccak_f1600_x4_avx2_tmc + (EXPAND_PAIRWISE (CONV_RULE LENGTH_SIMPLIFY_CONV KECCAK_F1600_X4_AVX2_SAFE)) + "[]" 768 THEN + DISCHARGE_SAFETY_PROPERTY_STACKOFFSET_TAC 768);; + +let KECCAK_F1600_X4_AVX2_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e rc_pointer bitstate_in rho8_ptr rho56_ptr pc stackpointer returnaddress. + PAIRWISE nonoverlapping + [(word pc, LENGTH keccak_f1600_x4_avx2_mc); + (word_sub stackpointer (word 0x300), 0x300); + (bitstate_in, 800); + (rc_pointer, 192); + (rho8_ptr, 32); + (rho56_ptr, 32); + (stackpointer, 8)] + ==> ensures x86 + (\s. + bytes_loaded s (word pc) keccak_f1600_x4_avx2_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [bitstate_in; rc_pointer; rho8_ptr; rho56_ptr] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events rc_pointer rho8_ptr rho56_ptr bitstate_in pc stackpointer + returnaddress /\ + memaccess_inbounds e2 + [bitstate_in,800; rc_pointer,192; rho8_ptr,32; rho56_ptr,32; + word_sub stackpointer (word 768),768; + stackpointer,8] + [bitstate_in,800; + word_sub stackpointer (word 768),768; + stackpointer,8])) + (\s s'. 
true)`, + let EXPAND_PAIRWISE_CONV = REWRITE_CONV[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] in + CONV_TAC(ONCE_DEPTH_CONV EXPAND_PAIRWISE_CONV) THEN + MATCH_ACCEPT_TAC(ADD_IBT_RULE + (REWRITE_RULE[PAIRWISE; ALL; NONOVERLAPPING_CLAUSES] + KECCAK_F1600_X4_AVX2_NOIBT_SUBROUTINE_SAFE)));; diff --git a/proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2_constants.ml b/proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2_constants.ml new file mode 100644 index 000000000..637d3fe94 --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2_constants.ml @@ -0,0 +1,26 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + *) + +(* Keccak constants for x86_64 AVX2 implementations. *) + +let rho8_constant = define `rho8_constant:int64 list = [ + word 0x0605040302010007; + word 0x0E0D0C0B0A09080F; + word 0x1615141312111017; + word 0x1E1D1C1B1A19181F +]`;; + +let rho56_constant = define `rho56_constant:int64 list = [ + word 0x0007060504030201; + word 0x080F0E0D0C0B0A09; + word 0x1017161514131211; + word 0x181F1E1D1C1B1A19 +]`;; diff --git a/proofs/hol_light/x86_64/proofs/keccak_utils.ml b/proofs/hol_light/x86_64/proofs/keccak_utils.ml new file mode 100644 index 000000000..5dc2edd04 --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/keccak_utils.ml @@ -0,0 +1,125 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Keccak utilities for x86_64 proofs. 
*) +(* ========================================================================= *) + +needs "common/keccak_spec.ml";; +needs "x86_64/proofs/keccak_f1600_x4_avx2_constants.ml";; + +(* ------------------------------------------------------------------------- *) +(* Some custom normalization for logical equivalence and conjunction, which *) +(* is enough to handle the shallow differences in various ways of expressing *) +(* Keccak-related operations, to avoid the overkill of using a SAT solver. *) +(* ------------------------------------------------------------------------- *) + +let KECCAK_BITBLAST_TAC = + let IFF_NOT_CONV = + let pth = TAUT + `((~p <=> q) <=> ~(p <=> q)) /\ + ((p <=> ~q) <=> ~(p <=> q))` in + GEN_REWRITE_CONV REDEPTH_CONV [pth; NOT_CLAUSES; EQ_CLAUSES] in + let [conv_e;conv_l;conv_r; conv_1e;conv_1r; + conv_e1;conv_l1;conv_r1; conv_ee; conv_11; conv_t] = + map (fun tm -> GEN_REWRITE_CONV I [TAUT tm]) + [`((p <=> q1) <=> (p <=> q2)) = (q1 <=> q2)`; + `((p1 <=> q1) <=> (p2 <=> q2)) = (p1 <=> (q1 <=> (p2 <=> q2)))`; + `((p1 <=> q1) <=> (p2 <=> q2)) = (p2 <=> (p1 <=> q1) <=> q2)`; + `(p <=> (p <=> q2)) = q2`; + `(p <=> (p2 <=> q2)) = (p2 <=> (p <=> q2))`; + `((p <=> q1) <=> p) = q1`; + `((p1 <=> q1) <=> p) = (p1 <=> (q1 <=> p))`; + `((p1 <=> q1) <=> p) = (p <=> (p1 <=> q1))`; + `(p <=> p) <=> T`; + `(p <=> q) <=> (q <=> p)`; + `(p <=> T) <=> p`] in + let rec IFF_MERGE_CONV tm = + match tm with + Comb(Comb(e,Comb(Comb(Const("=",_),p1),q1)), + Comb(Comb(Const("=",_),p2),q2)) -> + if p1 = p2 then (conv_e THENC IFF_MERGE_CONV) tm + else if p1 < p2 then (conv_l THENC IFF_RAND_CONV) tm + else (conv_r THENC IFF_RAND_CONV) tm + | Comb(Comb(e,p),Comb(Comb(Const("=",_),p2),q2)) -> + if p = p2 then conv_1e tm + else if p < p2 then REFL tm + else (conv_1r THENC IFF_RAND_CONV) tm + | Comb(Comb(e,Comb(Comb(Const("=",_),p1),q1)),p) -> + if p = p1 then conv_e1 tm + else if p1 < p then (conv_l1 THENC IFF_RAND_CONV) tm + else (conv_r1 THENC IFF_RAND_CONV) tm + | 
Comb(Comb(e,p),q) -> + if p = q then conv_ee tm + else if p < q then REFL tm + else conv_11 tm + | _ -> REFL tm + and IFF_RAND_CONV tm = + let th = RAND_CONV IFF_MERGE_CONV tm in + CONV_RULE(RAND_CONV(TRY_CONV conv_t)) th in + let rec IFF_CANON_CONV tm = + match tm with + Comb(Comb(Const("=",Tyapp("fun",[Tyapp("bool",[]);_])),l),r) -> + (BINOP_CONV IFF_CANON_CONV THENC IFF_MERGE_CONV) tm + | _ -> REFL tm in + let rec IFF_ATOM_CONV conv tm = + match tm with + Comb(Comb(Const("=",Tyapp("fun",[Tyapp("bool",[]);_])),l),r) -> + BINOP_CONV (IFF_ATOM_CONV conv) tm + | _ -> conv tm in + let rec AND_ATOM_CONV conv tm = + match tm with + Comb(Comb(Const("/\\",_),l),r) -> + BINOP_CONV (AND_ATOM_CONV conv) tm + | _ -> conv tm in + let rec IFF_NORM_CONV tm = + match tm with + Comb(Comb(Const("/\\",_),l),r) -> + let th = AND_ATOM_CONV IFF_NORM_CONV tm in + CONV_RULE (RAND_CONV CONJ_CANON_CONV) th + | Comb(Comb(Const("=",Tyapp("fun",[Tyapp("bool",[]);_])),l),r) -> + let th = IFF_ATOM_CONV IFF_NORM_CONV tm in + CONV_RULE (RAND_CONV IFF_CANON_CONV) th + | Comb(Const("~",_),l) -> RAND_CONV IFF_NORM_CONV tm + | _ -> REFL tm in + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[WORD_RULE `word_add x x = word_shl x 1`] THEN + BITBLAST_THEN(K ALL_TAC) THEN + CONV_TAC(AND_ATOM_CONV + (BINOP_CONV(IFF_NOT_CONV THENC IFF_NORM_CONV) THENC + GEN_REWRITE_CONV I [REFL_CLAUSE])) THEN + REWRITE_TAC[] THEN NO_TAC;; + +(* ------------------------------------------------------------------------- *) +(* Additional definitions and tactics used in the proof. 
*) +(* ------------------------------------------------------------------------- *) + +let PC_OFFSET_CONV = + GEN_REWRITE_CONV DEPTH_CONV [ARITH_RULE `(m + a) + b = m + (a + b)`] THENC + NUM_REDUCE_CONV;; + +let MEMORY_256_FROM_64_TAC = + let a_tm = `a:int64` and n_tm = `n:num` and i64_ty = `:int64` + and pat = `read (memory :> bytes256(word_add a (word n))) s0` in + fun v boff n -> + let pat' = subst[mk_var(v,i64_ty),a_tm] pat in + let f i = + let itm = mk_small_numeral(boff + 32*i) in + READ_MEMORY_MERGE_CONV 2 (subst[itm,n_tm] pat') in + MP_TAC(end_itlist CONJ (map f (0--(n-1))));; + +let WORD_SUBWORD_JOIN_EXTRACT_64 = prove + (`!a:int64 b:int64 c:int64 d:int64. ((word_subword (word_join ((word_join a b):int128) ((word_join c d):int128):int256) (0,64)):int64) = d /\ + !a:int64 b:int64 c:int64 d:int64. ((word_subword (word_join ((word_join a b):int128) ((word_join c d):int128):int256) (64,64)):int64) = c /\ + !a:int64 b:int64 c:int64 d:int64. ((word_subword (word_join ((word_join a b):int128) ((word_join c d):int128):int256) (128,64)):int64) = b /\ + !a:int64 b:int64 c:int64 d:int64. ((word_subword (word_join ((word_join a b):int128) ((word_join c d):int128):int256) (192,64)):int64) = a`, + REPEAT GEN_TAC THEN + BITBLAST_TAC);; + +let WORD_SUBWORD_JOIN_EXTRACT_128 = prove + (`!a:int64 b:int64 c:int64 d:int64. ((word_subword (word_join ((word_join a b):int128) ((word_join c d):int128):int256) (0,128)):int128) = ((word_join c d):int128) /\ + !a:int64 b:int64 c:int64 d:int64. ((word_subword (word_join ((word_join a b):int128) ((word_join c d):int128):int256) (128,128)):int128) = ((word_join a b):int128)`, + REPEAT GEN_TAC THEN + BITBLAST_TAC);; diff --git a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml new file mode 100644 index 000000000..3c8d1f43e --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml @@ -0,0 +1,30 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* ML-DSA x86_64 subroutine signatures for constant-time proofs. *) +(* Trimmed version of s2n-bignum's x86/proofs/subroutine_signatures.ml. *) +(* ========================================================================= *) + +let subroutine_signatures = [ +("keccak_f1600_x4_avx2", + ([(*args*) + ("bitstate_in", "uint64_t[static 100]", (*is const?*)"false"); + ("rc_pointer", "const uint64_t[static 24]", (*is const?*)"true"); + ("rho8_ptr", "const uint64_t[static 4]", (*is const?*)"true"); + ("rho56_ptr", "const uint64_t[static 4]", (*is const?*)"true")], + "void", + [(* input buffers *) + ("bitstate_in", "100"(* num elems *), 8(* elem bytesize *)); + ("rc_pointer", "24"(* num elems *), 8(* elem bytesize *)); + ("rho8_ptr", "4"(* num elems *), 8(* elem bytesize *)); + ("rho56_ptr", "4"(* num elems *), 8(* elem bytesize *))], + [(* output buffers *) + ("bitstate_in", "100"(* num elems *), 8(* elem bytesize *))], + [(* temporary buffers *) + ]) +); + +];; diff --git a/scripts/autogen b/scripts/autogen index b341f6c82..2e11fe658 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1169,6 +1169,206 @@ def gen_avx2_rej_uniform_table(): update_file("dev/x86_64/src/rej_uniform_table.c", "\n".join(gen())) +def gen_keccak_round_constants(): + """Generate the 24 Keccak-f[1600] round constants (RC values). + + Yields uint64 values computed via the LFSR-based algorithm + from the Keccak specification. + """ + rc = [0] * 24 + lfsr = 1 + for round_idx in range(24): + rc_val = 0 + for j in range(7): + if lfsr & 1: + rc_val ^= 1 << ((1 << j) - 1) + # x^8 + x^6 + x^5 + x^4 + 1 feedback polynomial + lfsr = (lfsr << 1) ^ (0x71 if lfsr & 0x80 else 0) + lfsr &= 0xFF + rc[round_idx] = rc_val + yield from rc + + +def gen_keccak_rho_shuffle(offset): + """Generate a vpshufb shuffle mask for byte rotation within each qword. 
+
+    The Keccak rho step requires 64-bit rotations. Some rotation amounts
+    (8 and 56 bits) are multiples of 8 and can be implemented as byte
+    shuffles via vpshufb, which is faster than a general-purpose rotation.
+
+    A vpshufb mask for a 256-bit register (YMM) has 32 bytes: 4 groups of 8,
+    one per 64-bit lane. Within each lane, byte[i] of the output is sourced
+    from byte[src] where src = (i + offset) % 8, shifted to the lane's base.
+
+    offset=-1: each output byte[i] reads from byte[i-1], rotating the qword
+    left by 8 bits (equivalently, right by 56 bits).
+    offset=+1: each output byte[i] reads from byte[i+1], rotating the qword
+    left by 56 bits (equivalently, right by 8 bits).
+    """
+    for lane in range(4):
+        base = lane * 8
+        val = 0
+        for byte_pos in range(8):
+            src = (byte_pos + offset) % 8 + base
+            val |= src << (byte_pos * 8)
+        yield val
+
+
+def gen_keccak_rho8():
+    """Generate the vpshufb shuffle mask for 8-bit rotation of each qword."""
+    yield from gen_keccak_rho_shuffle(-1)
+
+
+def gen_keccak_rho56():
+    """Generate the vpshufb shuffle mask for 56-bit rotation of each qword."""
+    yield from gen_keccak_rho_shuffle(+1)
+
+
+def print_hol_light_word64_list(g, entries_per_line=4):
+    """Format a list of uint64 values as a HOL Light int64 word list.
+
+    Analogous to print_hol_light_array but for int64 word lists.
+    Yields lines with leading indent, semicolon-separated.
+    """
+    l = [f"word 0x{v:016X}" for v in g]
+
+    for i in range(0, len(l), entries_per_line):
+        row = l[i : i + entries_per_line]
+        is_last = i + entries_per_line >= len(l)
+        line = "; ".join(row)
+        if not is_last:
+            line += ";"
+        yield "  " + line
+
+
+def print_c_uint64_array(name, values, entries_per_line=1):
+    """Format a C uint64_t array definition with MLD_ALIGN.
+
+    Yields lines for a complete array definition including braces.
+ """ + yield f"MLD_ALIGN const uint64_t {name}[] = {{" + vals = list(values) + for i in range(0, len(vals), entries_per_line): + row = vals[i : i + entries_per_line] + yield " " + ", ".join(f"0x{v:016x}" for v in row) + "," + yield "};" + + +def gen_aarch64_keccak_constants_c_file(): + def gen_c(): + yield from gen_header() + yield '#include "../../../../common.h"' + yield "" + yield "#if (defined(MLD_FIPS202_AARCH64_NEED_X1_SCALAR) || \\" + yield " defined(MLD_FIPS202_AARCH64_NEED_X1_V84A) || \\" + yield " defined(MLD_FIPS202_AARCH64_NEED_X2_V84A) || \\" + yield " defined(MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID) || \\" + yield " defined(MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID)) && \\" + yield " !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)" + yield "" + yield '#include "fips202_native_aarch64.h"' + yield "" + yield from print_c_uint64_array( + "mld_keccakf1600_round_constants", + gen_keccak_round_constants(), + entries_per_line=3, + ) + yield "" + yield "#else /* (MLD_FIPS202_AARCH64_NEED_X1_SCALAR || \\" + yield " MLD_FIPS202_AARCH64_NEED_X1_V84A || MLD_FIPS202_AARCH64_NEED_X2_V84A \\" + yield " || MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID || \\" + yield " MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID) && \\" + yield " !MLD_CONFIG_MULTILEVEL_NO_SHARED */" + yield "" + yield "MLD_EMPTY_CU(fips202_aarch64_round_constants)" + yield "" + yield "#endif /* !((MLD_FIPS202_AARCH64_NEED_X1_SCALAR || \\" + yield " MLD_FIPS202_AARCH64_NEED_X1_V84A || MLD_FIPS202_AARCH64_NEED_X2_V84A \\" + yield " || MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID || \\" + yield " MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID) && \\" + yield " !MLD_CONFIG_MULTILEVEL_NO_SHARED) */" + yield "" + + update_file( + "dev/fips202/aarch64/src/keccakf1600_round_constants.c", + "\n".join(gen_c()), + force_format=True, + ) + + +def gen_avx2_keccak_constants_c_file(): + def gen_c(): + yield from gen_header() + yield '#include "../../../../common.h"' + yield "#if 
defined(MLD_FIPS202_X86_64_NEED_X4_AVX2) && \\" + yield " !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)" + yield "" + yield "#include " + yield "" + yield '#include "fips202_native_x86_64.h"' + yield "" + yield from print_c_uint64_array( + "mld_keccakf1600_round_constants", + gen_keccak_round_constants(), + entries_per_line=3, + ) + yield "" + yield from print_c_uint64_array("mld_keccak_rho8", gen_keccak_rho8()) + yield "" + yield from print_c_uint64_array("mld_keccak_rho56", gen_keccak_rho56()) + yield "" + yield "#else /* MLD_FIPS202_X86_64_NEED_X4_AVX2 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */" + yield "" + yield "MLD_EMPTY_CU(fips202_x86_64_constants)" + yield "" + yield "#endif /* !(MLD_FIPS202_X86_64_NEED_X4_AVX2 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */" + yield "" + + update_file( + "dev/fips202/x86_64/src/keccakf1600_constants.c", + "\n".join(gen_c()), + force_format=True, + ) + + +def gen_hol_light_keccak_constants_file(): + def gen(): + yield from gen_hol_light_header() + yield "(* Keccak-f[1600] round constants RC[i] for i = 0..23. *)" + yield "" + yield "let round_constants = define `round_constants:int64 list = [" + yield from print_hol_light_word64_list( + gen_keccak_round_constants(), entries_per_line=1 + ) + yield "]`;;" + yield "" + + update_file( + "proofs/hol_light/common/keccak_constants.ml", + "\n".join(gen()), + ) + + +def gen_avx2_keccak_hol_light_constants_file(): + def gen(): + yield from gen_hol_light_header() + yield "(* Keccak constants for x86_64 AVX2 implementations. 
*)" + yield "" + yield "let rho8_constant = define `rho8_constant:int64 list = [" + yield from print_hol_light_word64_list(gen_keccak_rho8(), entries_per_line=1) + yield "]`;;" + yield "" + yield "let rho56_constant = define `rho56_constant:int64 list = [" + yield from print_hol_light_word64_list(gen_keccak_rho56(), entries_per_line=1) + yield "]`;;" + yield "" + + update_file( + "proofs/hol_light/x86_64/proofs/keccak_f1600_x4_avx2_constants.ml", + "\n".join(gen()), + ) + + def signed_reduce(a): """Return signed canonical representative of a mod b""" c = a % modulus @@ -2294,6 +2494,13 @@ def gen_hol_light_asm(): f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", "x86_64", ), + ( + "keccak_f1600_x4_avx2.S", + "keccak_f1600_x4_avx2.S", + "dev/fips202/x86_64/src", + f"-Idev/fips202/x86_64/src -Imldsa/src/fips202/native/x86_64/src {x86_64_flags}", + "x86_64", + ), ] joblist = joblist_aarch64 + joblist_x86_64 @@ -2448,6 +2655,31 @@ def synchronize_backends( # comparing the object code before and after simplification. cflags="-Imldsa/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", ) + synchronize_backend( + "dev/fips202/x86_64/src", + "mldsa/src/fips202/native/x86_64/src", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + x86_64_syntax=x86_64_syntax, + # Turn off control-flow protection (CET) explicitly. Newer versions of + # clang turn it on by default and insert endbr64 instructions at every + # global symbol. + # We insert endbr64 instruction manually via the MLD_ASM_FN_SYMBOL + # macro. + # This leads to duplicate endbr64 instructions causing a failure when + # comparing the object code before and after simplification. 
+ cflags="-Idev/fips202/x86_64/src -Imldsa/src/fips202/native/x86_64/src -mavx2 -mbmi2 -msse4 -fcf-protection=none", + ) + synchronize_backend( + "dev/fips202/x86_64", + "mldsa/src/fips202/native/x86_64", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + x86_64_syntax=x86_64_syntax, + cflags="-Idev/fips202/x86_64 -Imldsa/src/fips202/native/x86_64 -mavx2 -mbmi2 -msse4 -fcf-protection=none", + ) synchronize_backend( "dev/fips202/armv81m/src", "mldsa/src/fips202/native/armv81m/src", @@ -3312,6 +3544,10 @@ def _main(): gen_avx2_hol_light_zeta_file() gen_avx2_zeta_file() gen_avx2_rej_uniform_table() + gen_hol_light_keccak_constants_file() + gen_aarch64_keccak_constants_c_file() + gen_avx2_keccak_constants_c_file() + gen_avx2_keccak_hol_light_constants_file() def gen_monolithic(): gen_monolithic_source_file() diff --git a/test/mk/components.mk b/test/mk/components.mk index 88931db96..8bc1dfad7 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -4,7 +4,7 @@ FIPS202_SRCS = $(wildcard mldsa/src/fips202/*.c) ifeq ($(OPT),1) - FIPS202_SRCS += $(wildcard mldsa/src/fips202/native/aarch64/src/*.S) $(wildcard mldsa/src/fips202/native/aarch64/src/*.c) $(wildcard mldsa/src/fips202/native/x86_64/src/*.c) $(wildcard mldsa/src/fips202/native/armv81m/src/*.[csS]) + FIPS202_SRCS += $(wildcard mldsa/src/fips202/native/aarch64/src/*.S) $(wildcard mldsa/src/fips202/native/aarch64/src/*.c) $(wildcard mldsa/src/fips202/native/x86_64/src/*.c) $(wildcard mldsa/src/fips202/native/x86_64/src/*.S) $(wildcard mldsa/src/fips202/native/armv81m/src/*.[csS]) endif