diff --git a/.github/workflows/avx512-sde.yml b/.github/workflows/avx512-sde.yml new file mode 100644 index 0000000000000..4c47ed1aaf451 --- /dev/null +++ b/.github/workflows/avx512-sde.yml @@ -0,0 +1,137 @@ +# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2026 Intel Corporation. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# Run AVX512VL-specific tests under Intel SDE. +# +# GitHub Actions runners currently do not have AVX512 hardware. +# Intel SDE emulates AVX512 instructions and spoofs CPUID, +# so AVX512 code paths are exercised. +# +# To update Intel SDE: find the new mirror ID and file date from +# https://www.intel.com/content/www/us/en/download/684897 +# and update the three env vars below and the pinned SHA256 checksums. + +name: AVX512 tests via Intel SDE + +on: [pull_request, push] + +permissions: + contents: read + +env: + SDE_VERSION: 10.8.0 + SDE_DATE: 2026-03-15 + SDE_MIRROR_ID: 915934 + +jobs: + linux: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: install NASM + run: sudo apt-get update && sudo apt-get install -y nasm + + - name: install Intel SDE + run: | + SDE_URL="https://downloadmirror.intel.com/${SDE_MIRROR_ID}/sde-external-${SDE_VERSION}-${SDE_DATE}-lin.tar.xz" + SDE_SHA256="50b320cd226acef7a491f5b321fc1be3c3c7984f9e27a456e64894b5b0979dd3" + curl -fsSL -o /tmp/sde.tar.xz "$SDE_URL" + echo "$SDE_SHA256 /tmp/sde.tar.xz" | sha256sum -c - + mkdir /tmp/sde + tar -xf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/sde-external-${SDE_VERSION}-${SDE_DATE}-lin /opt/sde + echo "/opt/sde" >> "$GITHUB_PATH" + + - name: config + run: | + ./config --banner=Configured --strict-warnings no-shared enable-fips + + - name: build + run: make -j4 + + - name: show CPU and OpenSSL build
info + run: | + cat /proc/cpuinfo | grep -m1 "model name" + sde64 -skx -- ./apps/openssl version -c + + - name: ml_dsa_internal_test (AVX512VL via SDE) + run: sde64 -skx -- ./test/ml_dsa_internal_test + + - name: sha3_x4_internal_test (AVX512VL via SDE) + run: sde64 -skx -- ./test/sha3_x4_internal_test + + - name: fipsinstall (FIPS KAT via SDE) + run: sde64 -skx -- ./apps/openssl fipsinstall -module ./providers/fips.so -out /tmp/fipsmodule.cnf -provider_name fips + + windows: + runs-on: windows-2022 + env: + VCVARS: C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat + steps: + - uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: install NASM + run: | + choco install nasm + "C:\Program Files\NASM" | Out-File -FilePath "$env:GITHUB_PATH" -Append + + - name: install JOM + run: choco install jom + + - name: install Intel SDE + run: | + $url = "https://downloadmirror.intel.com/$env:SDE_MIRROR_ID/sde-external-$env:SDE_VERSION-$env:SDE_DATE-win.tar.xz" + $expected = "176F87C80EB42BB91B73E1428F4A0FD067DF322F901F9B4359B20B86B92C2BAE" + curl.exe -fsSL -o sde-win.tar.xz $url + $actual = (Get-FileHash sde-win.tar.xz -Algorithm SHA256).Hash + if ($actual -ne $expected) { throw "SDE SHA256 mismatch: got $actual" } + & "C:\Program Files\7-Zip\7z.exe" x sde-win.tar.xz -so | & "C:\Program Files\7-Zip\7z.exe" x -si -ttar -o"C:\sde" + $sdeRoot = "C:\sde\sde-external-$env:SDE_VERSION-$env:SDE_DATE-win" + if (-not (Test-Path "$sdeRoot\sde.exe")) { throw "sde.exe not found in $sdeRoot" } + "$sdeRoot" | Out-File -FilePath $env:GITHUB_PATH -Append + + - name: prepare build directory + run: mkdir _build + + - name: config + working-directory: _build + shell: cmd + run: | + call "%VCVARS%" + perl ..\Configure --banner=Configured --strict-warnings no-shared enable-fips no-makedepend + + - name: build + working-directory: _build + shell: cmd + run: | + call "%VCVARS%" + jom /j4 /S + + - name: show CPU and OpenSSL build info + 
working-directory: _build + run: sde -skx -- apps\openssl.exe version -c + + - name: ml_dsa_internal_test (AVX512VL via SDE) + working-directory: _build + shell: cmd + run: sde -skx -- test\ml_dsa_internal_test.exe + + - name: sha3_x4_internal_test (AVX512VL via SDE) + working-directory: _build + shell: cmd + run: sde -skx -- test\sha3_x4_internal_test.exe + + - name: fipsinstall (FIPS KAT via SDE) + working-directory: _build + shell: cmd + run: sde -skx -- apps\openssl.exe fipsinstall -module providers\fips.dll -out fipsmodule.cnf -provider_name fips diff --git a/CHANGES.md b/CHANGES.md index 049c0e7288710..f7ee59641cf2c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -31,6 +31,10 @@ OpenSSL Releases ### Changes between 4.0 and 4.1 [xx XXX xxxx] + * Added AVX512 optimized SHAKE x4 operations for ML-DSA on x86_64. + + *Marcel Cornu and Tomasz Kantecki* + * Added test framework for testing function memory allocation failures. *Jakub Zelenka* diff --git a/crypto/ml_dsa/ml_dsa_hash.h b/crypto/ml_dsa/ml_dsa_hash.h index 7625d3367d9c3..4280ef67c8897 100644 --- a/crypto/ml_dsa/ml_dsa_hash.h +++ b/crypto/ml_dsa/ml_dsa_hash.h @@ -7,6 +7,9 @@ * https://www.openssl.org/source/license.html */ +#ifndef OSSL_CRYPTO_ML_DSA_HASH_H +#define OSSL_CRYPTO_ML_DSA_HASH_H + #include static ossl_inline ossl_unused int @@ -39,3 +42,5 @@ shake_xof_3(EVP_MD_CTX *ctx, const EVP_MD *md, const uint8_t *in1, size_t in1_le && EVP_DigestUpdate(ctx, in3, in3_len) && EVP_DigestSqueeze(ctx, out, out_len); } + +#endif /* OSSL_CRYPTO_ML_DSA_HASH_H */ diff --git a/crypto/ml_dsa/ml_dsa_key.c b/crypto/ml_dsa/ml_dsa_key.c index 24fa7596e2f77..74488365c31f2 100644 --- a/crypto/ml_dsa/ml_dsa_key.c +++ b/crypto/ml_dsa/ml_dsa_key.c @@ -332,7 +332,7 @@ int ossl_ml_dsa_key_has(const ML_DSA_KEY *key, int selection) * @returns 1 on success, or 0 on failure. 
*/ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx, - VECTOR *t1, VECTOR *t0) + const OSSL_ML_DSA_SAMPLE_OPS *sample_ops, VECTOR *t1, VECTOR *t0) { int ret = 0; const ML_DSA_PARAMS *params = key->params; @@ -351,7 +351,7 @@ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx, matrix_init(&a_ntt, s1_ntt.poly + l, k, l); /* Using rho generate A' = A in NTT form */ - if (!matrix_expand_A(md_ctx, key->shake128_md, key->rho, &a_ntt)) + if (!sample_ops->matrix_expand_A(md_ctx, key->shake128_md, key->rho, &a_ntt)) goto err; /* t = NTT_inv(A' * NTT(s1)) + s2 */ @@ -376,6 +376,7 @@ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx, int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key) { int ret = 0; + const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops(); VECTOR t0; EVP_MD_CTX *md_ctx = NULL; @@ -383,7 +384,7 @@ int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key) return 0; ret = ((md_ctx = EVP_MD_CTX_new()) != NULL) && ossl_ml_dsa_key_pub_alloc(key) /* allocate space for t1 */ - && public_from_private(key, md_ctx, &key->t1, &t0) + && public_from_private(key, md_ctx, sample_ops, &key->t1, &t0) && vector_equal(&t0, &key->t0) /* compare the generated t0 to the expected */ && ossl_ml_dsa_pk_encode(key) && shake_xof(md_ctx, key->shake256_md, @@ -397,6 +398,7 @@ int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key) int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key) { int ret = 0; + const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops(); VECTOR t1, t0; POLY *polys = NULL; uint32_t k = (uint32_t)key->params->k; @@ -414,7 +416,7 @@ int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key) vector_init(&t1, polys, k); vector_init(&t0, polys + k, k); - if (!public_from_private(key, md_ctx, &t1, &t0)) + if (!public_from_private(key, md_ctx, sample_ops, &t1, &t0)) goto err; ret = vector_equal(&t1, &key->t1) && vector_equal(&t0, &key->t0); @@ -435,6 +437,7 @@ int 
ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key) static int keygen_internal(ML_DSA_KEY *out) { int ret = 0; + const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops(); uint8_t augmented_seed[ML_DSA_SEED_BYTES + 2]; uint8_t expanded_seed[ML_DSA_RHO_BYTES + ML_DSA_PRIV_SEED_BYTES + ML_DSA_K_BYTES]; const uint8_t *const rho = expanded_seed; /* p = Public Random Seed */ @@ -461,8 +464,9 @@ static int keygen_internal(ML_DSA_KEY *out) memcpy(out->rho, rho, sizeof(out->rho)); memcpy(out->K, K, sizeof(out->K)); - ret = vector_expand_S(md_ctx, out->shake256_md, params->eta, priv_seed, &out->s1, &out->s2) - && public_from_private(out, md_ctx, &out->t1, &out->t0) + ret = sample_ops->vector_expand_S(md_ctx, out->shake256_md, params->eta, + priv_seed, &out->s1, &out->s2) + && public_from_private(out, md_ctx, sample_ops, &out->t1, &out->t0) && ossl_ml_dsa_pk_encode(out) && shake_xof(md_ctx, out->shake256_md, out->pub_encoding, out->params->pk_len, out->tr, sizeof(out->tr)) diff --git a/crypto/ml_dsa/ml_dsa_local.h b/crypto/ml_dsa/ml_dsa_local.h index bbaa6dafc75a9..34a83f8ffbe0e 100644 --- a/crypto/ml_dsa/ml_dsa_local.h +++ b/crypto/ml_dsa/ml_dsa_local.h @@ -59,10 +59,23 @@ typedef struct vector_st VECTOR; typedef struct matrix_st MATRIX; typedef struct ml_dsa_sig_st ML_DSA_SIG; -int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, +typedef int(ML_DSA_MATRIX_EXPAND_A_FN)(EVP_MD_CTX *g_ctx, const EVP_MD *md, const uint8_t *rho, MATRIX *out); -int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta, - const uint8_t *seed, VECTOR *s1, VECTOR *s2); +typedef int(ML_DSA_VECTOR_EXPAND_S_FN)(EVP_MD_CTX *h_ctx, const EVP_MD *md, + int eta, const uint8_t *seed, VECTOR *s1, VECTOR *s2); +typedef void(ML_DSA_VECTOR_EXPAND_MASK_FN)(VECTOR *out, const uint8_t *rho_prime, + size_t rho_prime_len, uint32_t kappa, uint32_t gamma1, + EVP_MD_CTX *h_ctx, const EVP_MD *md); + +typedef struct ossl_ml_dsa_sample_ops_st { + ML_DSA_MATRIX_EXPAND_A_FN 
*matrix_expand_A; + ML_DSA_VECTOR_EXPAND_S_FN *vector_expand_S; + ML_DSA_VECTOR_EXPAND_MASK_FN *vector_expand_mask; +} OSSL_ML_DSA_SAMPLE_OPS; + +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void); +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void); +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void); void ossl_ml_dsa_matrix_mult_vector(const MATRIX *matrix_kl, const VECTOR *vl, VECTOR *vk); int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len, diff --git a/crypto/ml_dsa/ml_dsa_matrix.h b/crypto/ml_dsa/ml_dsa_matrix.h index 0352ecac7afc0..cd9005fc87177 100644 --- a/crypto/ml_dsa/ml_dsa_matrix.h +++ b/crypto/ml_dsa/ml_dsa_matrix.h @@ -35,10 +35,3 @@ matrix_mult_vector(const MATRIX *a, const VECTOR *s, VECTOR *t) { ossl_ml_dsa_matrix_mult_vector(a, s, t); } - -static ossl_inline ossl_unused int -matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, const uint8_t *rho, - MATRIX *out) -{ - return ossl_ml_dsa_matrix_expand_A(g_ctx, md, rho, out); -} diff --git a/crypto/ml_dsa/ml_dsa_sample.c b/crypto/ml_dsa/ml_dsa_sample.c index 5d9dc84a54fa3..3eef3c0176b1b 100644 --- a/crypto/ml_dsa/ml_dsa_sample.c +++ b/crypto/ml_dsa/ml_dsa_sample.c @@ -8,6 +8,7 @@ */ #include +#include #include "ml_dsa_local.h" #include "ml_dsa_vector.h" #include "ml_dsa_matrix.h" @@ -35,6 +36,10 @@ typedef int(COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out); static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4; static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2; +static ML_DSA_MATRIX_EXPAND_A_FN matrix_expand_A_scalar; +static ML_DSA_VECTOR_EXPAND_S_FN vector_expand_S_scalar; +static ML_DSA_VECTOR_EXPAND_MASK_FN vector_expand_mask_scalar; + /** * @brief Combine 3 bytes to form an coefficient. * See FIPS 204, Algorithm 14, CoeffFromThreeBytes() @@ -198,7 +203,7 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md, * in the range of 0..q-1. * @returns 1 if the matrix was generated, or 0 on error. 
*/ -int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, +static int matrix_expand_A_scalar(EVP_MD_CTX *g_ctx, const EVP_MD *md, const uint8_t *rho, MATRIX *out) { int ret = 0; @@ -208,7 +213,6 @@ int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, /* The seed used for each matrix element is rho + column_index + row_index */ memcpy(derived_seed, rho, ML_DSA_RHO_BYTES); - for (i = 0; i < out->k; i++) { for (j = 0; j < out->l; j++) { derived_seed[ML_DSA_RHO_BYTES + 1] = (uint8_t)i; @@ -241,7 +245,7 @@ int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, * the range (q-eta)..0..eta * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise. */ -int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta, +static int vector_expand_S_scalar(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta, const uint8_t *seed, VECTOR *s1, VECTOR *s2) { int ret = 0; @@ -376,3 +380,57 @@ int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_l } return 1; } + +static void vector_expand_mask_scalar(VECTOR *out, const uint8_t *rho_prime, + size_t rho_prime_len, uint32_t kappa, uint32_t gamma1, + EVP_MD_CTX *h_ctx, const EVP_MD *md) +{ + size_t i; + uint8_t derived_seed[ML_DSA_RHO_PRIME_BYTES + 2]; + + (void)rho_prime_len; + + memcpy(derived_seed, rho_prime, ML_DSA_RHO_PRIME_BYTES); + + for (i = 0; i < out->num_poly; i++) { + size_t index = kappa + i; + + derived_seed[ML_DSA_RHO_PRIME_BYTES] = index & 0xFF; + derived_seed[ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; + poly_expand_mask(out->poly + i, derived_seed, sizeof(derived_seed), + gamma1, h_ctx, md); + } +} + +static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_generic_meth = { + matrix_expand_A_scalar, + vector_expand_S_scalar, + vector_expand_mask_scalar +}; + +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void) +{ +#if defined(KECCAK1600_ASM) \ + && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || 
defined(_M_X64)) \ + && !defined(OPENSSL_NO_ASM) + return ossl_ml_dsa_sample_x86_64_ops(); +#else + return ossl_ml_dsa_sample_generic_ops(); +#endif +} + +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void) +{ + return &ml_dsa_sample_generic_meth; +} + +#if defined(KECCAK1600_ASM) \ + && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \ + && !defined(OPENSSL_NO_ASM) +#include "ml_dsa_sample_hw_x86_64.inc" +#else +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void) +{ + return ossl_ml_dsa_sample_generic_ops(); +} +#endif diff --git a/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc new file mode 100644 index 0000000000000..cc36c489a761f --- /dev/null +++ b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc @@ -0,0 +1,310 @@ +/* + * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved. + * Copyright (c) 2026 Intel Corporation. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#define ML_DSA_SHAKE_X4_BATCH_SIZE 4 +#define ML_DSA_SHAKE_X4_DONE_MASK ((1 << ML_DSA_SHAKE_X4_BATCH_SIZE) - 1) +#define ML_DSA_EXPAND_MASK_BYTES_PER_COEFF 32 +#define ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_19 20 +#define ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_17 18 +#define ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19 \ + (ML_DSA_EXPAND_MASK_BYTES_PER_COEFF * ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_19) +#define ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_17 \ + (ML_DSA_EXPAND_MASK_BYTES_PER_COEFF * ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_17) +#define ML_DSA_EXPAND_MASK_BUF_SIZE(gamma1) \ + ((gamma1) == ML_DSA_GAMMA1_TWO_POWER_19 \ + ? 
ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19 \ + : ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_17) + +static ossl_unused int rej_ntt_poly_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md, + const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t seed_len, + POLY *outs[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t count) +{ + KECCAK1600_X4_CTX ctx; + uint8_t blocks[ML_DSA_SHAKE_X4_BATCH_SIZE][SHAKE128_BLOCKSIZE]; + int coeff_idx[ML_DSA_SHAKE_X4_BATCH_SIZE] = { 0, 0, 0, 0 }; + size_t done_mask = 0; + size_t lane; + + (void)g_ctx; + (void)md; + + for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) + done_mask |= ((size_t)1 << lane); + + ossl_sha3_shake128_x4_inc_init(&ctx); + ossl_sha3_shake128_x4_inc_absorb(&ctx, seeds[0], seeds[1], + seeds[2], seeds[3], seed_len); + ossl_sha3_shake128_x4_inc_finalize(&ctx); + + while (done_mask != ML_DSA_SHAKE_X4_DONE_MASK) { + ossl_sha3_shake128_x4_inc_squeeze(blocks[0], blocks[1], + blocks[2], blocks[3], SHAKE128_BLOCKSIZE, &ctx); + + for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) { + if (done_mask & ((size_t)1 << lane)) + continue; + + const uint8_t *b = blocks[lane]; + const uint8_t *end = b + SHAKE128_BLOCKSIZE; + + for (; b < end && coeff_idx[lane] < ML_DSA_NUM_POLY_COEFFICIENTS; b += 3) { + uint32_t *coeff_ptr = &(outs[lane]->coeff[coeff_idx[lane]]); + + if (coeff_from_three_bytes(b, coeff_ptr)) + coeff_idx[lane]++; + } + + if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) + done_mask |= ((size_t)1 << lane); + } + } + + return 1; +} + +static void vector_expand_mask_mb(VECTOR *out, const uint8_t *rho_prime, + const size_t rho_prime_len, const uint32_t kappa, const uint32_t gamma1, + EVP_MD_CTX *h_ctx, const EVP_MD *md) +{ + size_t i; + const size_t num_polys = out->num_poly; + uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_RHO_PRIME_BYTES + 2]; + const size_t seed_len = sizeof(derived_seeds[0]); + const size_t buf_size = ML_DSA_EXPAND_MASK_BUF_SIZE(gamma1); + uint8_t 
buffers[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19]; + + (void)rho_prime_len; + (void)h_ctx; + (void)md; + + for (i = 0; i < ML_DSA_SHAKE_X4_BATCH_SIZE; i++) + memcpy(derived_seeds[i], rho_prime, ML_DSA_RHO_PRIME_BYTES); + + for (i = 0; i + (ML_DSA_SHAKE_X4_BATCH_SIZE - 1) < num_polys; i += ML_DSA_SHAKE_X4_BATCH_SIZE) { + size_t b; + + for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) { + const size_t index = kappa + i + b; + + derived_seeds[b][ML_DSA_RHO_PRIME_BYTES] = index & 0xFF; + derived_seeds[b][ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; + } + + ossl_sha3_shake256_x4(buffers[0], buffers[1], buffers[2], buffers[3], buf_size, + derived_seeds[0], derived_seeds[1], derived_seeds[2], derived_seeds[3], seed_len); + + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 0], buffers[0], buf_size, gamma1); + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 1], buffers[1], buf_size, gamma1); + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 2], buffers[2], buf_size, gamma1); + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 3], buffers[3], buf_size, gamma1); + } + + if (i < num_polys) { + /* Seed all four lanes: idle ones must not hash stale/unset bytes. */ + size_t b; + + for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) { + const size_t index = kappa + i + b; + + derived_seeds[b][ML_DSA_RHO_PRIME_BYTES] = (uint8_t)index; + derived_seeds[b][ML_DSA_RHO_PRIME_BYTES + 1] = (uint8_t)(index >> 8); + } + + ossl_sha3_shake256_x4(buffers[0], buffers[1], buffers[2], buffers[3], buf_size, + derived_seeds[0], derived_seeds[1], derived_seeds[2], derived_seeds[3], seed_len); + + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 0], buffers[0], buf_size, gamma1); + + if ((i + 1) < num_polys) + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 1], buffers[1], buf_size, gamma1); + + if ((i + 2) < num_polys) + ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 2], buffers[2], buf_size, gamma1); + } +} + +static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md, +
COEFF_FROM_NIBBLE_FUNC *coef_from_nibble, + const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t seed_len, + POLY *outs[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t count) +{ + KECCAK1600_X4_CTX ctx; + uint8_t blocks[ML_DSA_SHAKE_X4_BATCH_SIZE][SHAKE256_BLOCKSIZE]; + int coeff_idx[ML_DSA_SHAKE_X4_BATCH_SIZE] = { 0, 0, 0, 0 }; + size_t done_mask = 0; + size_t lane; + + (void)h_ctx; + (void)md; + + for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) + done_mask |= ((size_t)1 << lane); + + ossl_sha3_shake256_x4_inc_init(&ctx); + ossl_sha3_shake256_x4_inc_absorb(&ctx, seeds[0], seeds[1], + seeds[2], seeds[3], seed_len); + ossl_sha3_shake256_x4_inc_finalize(&ctx); + + while (done_mask != ML_DSA_SHAKE_X4_DONE_MASK) { + ossl_sha3_shake256_x4_inc_squeeze(blocks[0], blocks[1], + blocks[2], blocks[3], SHAKE256_BLOCKSIZE, &ctx); + + for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) { + if (done_mask & ((size_t)1 << lane)) + continue; + + const uint8_t *b = blocks[lane]; + const uint8_t *end = b + SHAKE256_BLOCKSIZE; + + for (; b < end && coeff_idx[lane] < ML_DSA_NUM_POLY_COEFFICIENTS; b++) { + uint32_t z0 = *b & 0x0F; + uint32_t z1 = *b >> 4; + + if (coef_from_nibble(z0, &outs[lane]->coeff[coeff_idx[lane]])) + coeff_idx[lane]++; + + if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) { + done_mask |= ((size_t)1 << lane); + break; + } + + if (coef_from_nibble(z1, &outs[lane]->coeff[coeff_idx[lane]])) + coeff_idx[lane]++; + + if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) { + done_mask |= ((size_t)1 << lane); + break; + } + } + } + } + + return 1; +} + +static int matrix_expand_A_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md, + const uint8_t *rho, MATRIX *out) +{ + size_t b, idx; + uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_RHO_BYTES + 2]; + const size_t seed_len = sizeof(derived_seeds[0]); + const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE]; + POLY *polys[ML_DSA_SHAKE_X4_BATCH_SIZE]; + POLY *poly = out->m_poly; + + for (b = 0; b < 
ML_DSA_SHAKE_X4_BATCH_SIZE; b++) { + memcpy(derived_seeds[b], rho, ML_DSA_RHO_BYTES); + seeds[b] = derived_seeds[b]; + } + + for (idx = 0; (idx + ML_DSA_SHAKE_X4_BATCH_SIZE - 1) < (out->k * out->l); + idx += ML_DSA_SHAKE_X4_BATCH_SIZE) { + for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) { + const size_t row = (idx + b) / out->l; + const size_t col = (idx + b) % out->l; + + derived_seeds[b][ML_DSA_RHO_BYTES] = (uint8_t)col; + derived_seeds[b][ML_DSA_RHO_BYTES + 1] = (uint8_t)row; + polys[b] = &poly[idx + b]; + } + + if (!rej_ntt_poly_mb(g_ctx, md, seeds, seed_len, polys, ML_DSA_SHAKE_X4_BATCH_SIZE)) + return 0; + } + + if (idx < (out->k * out->l)) { /* partial final batch */ + const size_t left = (out->k * out->l) - idx; + + for (b = 0; b < left; b++) { + const size_t row = (idx + b) / out->l; + const size_t col = (idx + b) % out->l; + + derived_seeds[b][ML_DSA_RHO_BYTES] = (uint8_t)col; + derived_seeds[b][ML_DSA_RHO_BYTES + 1] = (uint8_t)row; + polys[b] = &poly[idx + b]; + } + + if (!rej_ntt_poly_mb(g_ctx, md, seeds, seed_len, polys, left)) + return 0; + } + + return 1; +} + +static int vector_expand_S_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md, const int eta, + const uint8_t *seed, VECTOR *s1, VECTOR *s2) +{ + size_t b, idx; + const size_t l = s1->num_poly; + const size_t total = l + s2->num_poly; + uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_PRIV_SEED_BYTES + 2]; + const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE]; + const size_t seed_len = sizeof(derived_seeds[0]); + POLY *polys[ML_DSA_SHAKE_X4_BATCH_SIZE]; + COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ?
coeff_from_nibble_4 : coeff_from_nibble_2; + + for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) { + memcpy(derived_seeds[b], seed, ML_DSA_PRIV_SEED_BYTES); + seeds[b] = derived_seeds[b]; + } + + for (idx = 0; (idx + ML_DSA_SHAKE_X4_BATCH_SIZE - 1) < total; idx += ML_DSA_SHAKE_X4_BATCH_SIZE) { + for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) { + const size_t poly_idx = idx + b; + + derived_seeds[b][ML_DSA_PRIV_SEED_BYTES] = (uint8_t)(poly_idx); + derived_seeds[b][ML_DSA_PRIV_SEED_BYTES + 1] = (uint8_t)(poly_idx >> 8); + + if (poly_idx < l) + polys[b] = &s1->poly[poly_idx]; + else + polys[b] = &s2->poly[poly_idx - l]; + } + + if (!rej_bounded_poly_mb(h_ctx, md, coef_from_nibble_fn, + seeds, seed_len, polys, ML_DSA_SHAKE_X4_BATCH_SIZE)) + return 0; + } + + if (idx < total) { + const size_t batch_count = total - idx; + + for (b = 0; b < batch_count; b++) { + const size_t poly_idx = idx + b; + + derived_seeds[b][ML_DSA_PRIV_SEED_BYTES] = (uint8_t)(poly_idx); + derived_seeds[b][ML_DSA_PRIV_SEED_BYTES + 1] = (uint8_t)(poly_idx >> 8); + + if (poly_idx < l) + polys[b] = &s1->poly[poly_idx]; + else + polys[b] = &s2->poly[poly_idx - l]; + } + + if (!rej_bounded_poly_mb(h_ctx, md, coef_from_nibble_fn, + seeds, seed_len, polys, batch_count)) + return 0; + } + + return 1; +} + +static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_x86_64 = { + matrix_expand_A_mb, + vector_expand_S_mb, + vector_expand_mask_mb +}; + +const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void) +{ + if (SHA3_avx512vl_capable()) + return &ml_dsa_sample_x86_64; + return ossl_ml_dsa_sample_generic_ops(); +} diff --git a/crypto/ml_dsa/ml_dsa_sign.c b/crypto/ml_dsa/ml_dsa_sign.c index 51c2709ddbaf9..b42323266aad4 100644 --- a/crypto/ml_dsa/ml_dsa_sign.c +++ b/crypto/ml_dsa/ml_dsa_sign.c @@ -164,6 +164,7 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv, uint8_t *out_sig) { int ret = 0; + const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops(); const ML_DSA_PARAMS *params = 
priv->params; EVP_MD_CTX *md_ctx = NULL; uint32_t k = (uint32_t)params->k, l = (uint32_t)params->l; @@ -232,7 +233,7 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv, CONSTTIME_SECRET_VECTOR(priv->s2); CONSTTIME_SECRET_VECTOR(priv->t0); - if (!matrix_expand_A(md_ctx, priv->shake128_md, priv->rho, &a_ntt)) + if (!sample_ops->matrix_expand_A(md_ctx, priv->shake128_md, priv->rho, &a_ntt)) goto err; /* @@ -263,8 +264,8 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv, VECTOR *ct0 = &w1; uint32_t z_max, r0_max, ct0_max, h_ones; - vector_expand_mask(&y, rho_prime, sizeof(rho_prime), (uint32_t)kappa, - gamma1, md_ctx, priv->shake256_md); + sample_ops->vector_expand_mask(&y, rho_prime, sizeof(rho_prime), + (uint32_t)kappa, gamma1, md_ctx, priv->shake256_md); vector_copy(y_ntt, &y); vector_ntt(y_ntt); @@ -380,6 +381,7 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub, const uint8_t *sig_enc, size_t sig_enc_len) { int ret = 0; + const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops(); uint8_t *alloc = NULL, *w1_encoded; POLY *p, *c_ntt; MATRIX a_ntt; @@ -428,7 +430,7 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub, vector_init(&ct1_ntt, p + k, k); if (!ossl_ml_dsa_sig_decode(&sig, sig_enc, sig_enc_len, pub->params) - || !matrix_expand_A(md_ctx, pub->shake128_md, pub->rho, &a_ntt)) + || !sample_ops->matrix_expand_A(md_ctx, pub->shake128_md, pub->rho, &a_ntt)) goto err; /* Compute verifiers challenge c_ntt = NTT(SampleInBall(c_tilde)) */ diff --git a/crypto/ml_dsa/ml_dsa_vector.h b/crypto/ml_dsa/ml_dsa_vector.h index 0693eb6e3c30c..389c0ed045338 100644 --- a/crypto/ml_dsa/ml_dsa_vector.h +++ b/crypto/ml_dsa/ml_dsa_vector.h @@ -149,33 +149,6 @@ vector_mult_scalar(const VECTOR *lhs, const POLY *rhs, VECTOR *out) ossl_ml_dsa_poly_ntt_mult(lhs->poly + i, rhs, out->poly + i); } -static ossl_inline ossl_unused int -vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta, - const uint8_t *seed, VECTOR *s1, VECTOR *s2) -{ - return 
ossl_ml_dsa_vector_expand_S(h_ctx, md, eta, seed, s1, s2); -} - -static ossl_inline ossl_unused void -vector_expand_mask(VECTOR *out, const uint8_t *rho_prime, size_t rho_prime_len, - uint32_t kappa, uint32_t gamma1, - EVP_MD_CTX *h_ctx, const EVP_MD *md) -{ - size_t i; - uint8_t derived_seed[ML_DSA_RHO_PRIME_BYTES + 2]; - - memcpy(derived_seed, rho_prime, ML_DSA_RHO_PRIME_BYTES); - - for (i = 0; i < out->num_poly; i++) { - size_t index = kappa + i; - - derived_seed[ML_DSA_RHO_PRIME_BYTES] = index & 0xFF; - derived_seed[ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; - poly_expand_mask(out->poly + i, derived_seed, sizeof(derived_seed), - gamma1, h_ctx, md); - } -} - /* Scale back previously rounded value */ static ossl_inline ossl_unused void vector_scale_power2_round_ntt(const VECTOR *in, VECTOR *out) diff --git a/crypto/sha/asm/keccak1600x4-avx512vl.pl b/crypto/sha/asm/keccak1600x4-avx512vl.pl new file mode 100755 index 0000000000000..cf52b190407e8 --- /dev/null +++ b/crypto/sha/asm/keccak1600x4-avx512vl.pl @@ -0,0 +1,2343 @@ +#!/usr/bin/env perl +# +# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2026 Intel Corporation. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +############################################################################### +# Keccak x4 AVX512VL SHA3/SHAKE Assembly Routines +# +# Description: +# This file emits x86_64 assembly for AVX512VL accelerated Keccak-f[1600] +# processing of 4 independent states in parallel ("x4"). +# +# It provides the core 24-round Keccak permutation and x4 helper routines +# used by SHA3 and SHAKE absorb/finalize/squeeze paths. Data from four +# input/output lanes is packed across YMM registers so lane-local operations +# execute in SIMD. 
+# +############################################################################### + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$avx512vl = 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# Check for AVX512VL support in assembler +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version (\d+)\.(\d+)/) { + my ($gas_major, $gas_minor) = ($1, $2); + $avx512vl = ($gas_major > 2 || ($gas_major == 2 && $gas_minor >= 26)); +} + +if (!$avx512vl + && $win64 + && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) + && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) +{ + $avx512vl = ($1 >= 2.12); +} + +if (!$avx512vl && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { + $avx512vl = ($2>=3.9); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$arg1="%rdi"; +$arg2="%rsi"; +$arg3="%rdx"; +$arg4="%rcx"; +$arg5="%r8"; +$arg6="%r9"; +$roundn="%r13d"; +$tblptr="%r14"; + +# Define SHAKE rates +$SHAKE128_RATE="\$168"; +$SHAKE256_RATE="\$136"; + +# Stack frame offsets for SHAKE x4 wrapper functions +$STATE_SIZE="808"; # (25 * 8 * 4) + 8 = 808 bytes +$sf_arg1="0"; +$sf_arg2="8"; +$sf_arg3="16"; +$sf_arg4="24"; +$sf_arg5="32"; +$sf_state_ptr="40"; +$sf_state_x4="48"; +$sf_size="856"; # 48 + 808 = 856 bytes + +# Emit an internal helper call used by one-shot wrappers. 
+# - Win64: call the provided *_internal shim and bracket it with 32-byte +# shadow space so shim entry can use xlate-compatible [rsp+8]/[rsp+16]. +# - non-Win64: call the public API symbol (same base name without _internal). +# The argument must be the shim/internal symbol name, e.g. +# SHA3_shake128_x4_inc_squeeze_avx512vl_internal +sub call_internal { + my ($shim_name) = @_; + my $external_name = $shim_name; + + $external_name =~ s/_internal$//; + + return <<___ if ($win64); + sub \$32, %rsp + call $shim_name + add \$32, %rsp +___ + + return <<___; + call $external_name +___ +} + +if ($avx512vl>0) {{{ + +# AVX512VL feature bit (bit 31 in OPENSSL_ia32cap_P+8) +my $avx512vl_mask = (1<<31); + +$code .= <<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl SHA3_avx512vl_capable +.type SHA3_avx512vl_capable,\@abi-omnipotent +.align 32 +SHA3_avx512vl_capable: + mov OPENSSL_ia32cap_P+8(%rip), %rcx + xor %eax, %eax + and \$$avx512vl_mask, %ecx + cmovnz %ecx, %eax + ret +.size SHA3_avx512vl_capable, .-SHA3_avx512vl_capable +___ + +$code.=<<___; +.text + +# Perform Keccak permutation +# +# YMM registers 0 to 24 are used as Keccak state registers. +# This function, as is, can work on 1 to 4 independent states at the same time. +# +# There is no clear boundary between Theta, Rho, Pi, Chi and Iota steps. +# Instructions corresponding to these steps overlap for better efficiency. 
+#
+# Arguments:
+# ymm0-ymm24 [in/out] Keccak state registers (one SIMD per one state register)
+# ymm25-ymm31 [clobbered] temporary SIMD registers
+# $roundn [clobbered] used for round tracking
+# $tblptr [clobbered] used for access to SHA3 constant table
+.type keccak_1600_permute,\@abi-omnipotent
+.align 32
+keccak_1600_permute:
+.cfi_startproc
+    mov \$24, $roundn # 24 rounds
+    lea iotas(%rip), $tblptr # Load the address of the SHA3 round constants
+
+.align 32
+.Lkeccak_rnd_loop:
+    # Theta step
+
+    # Compute column parities
+    # C[5] = [0, 0, 0, 0, 0]
+    # for x in 0 to 4:
+    #     C[x] = state[x][0] XOR state[x][1] XOR state[x][2] XOR state[x][3] XOR state[x][4]
+    # (ternary logic 0x96 is a three-input XOR; parities go to ymm25-ymm29)
+
+    vmovdqa64 %ymm0, %ymm25
+    vpternlogq \$0x96, %ymm5, %ymm10, %ymm25
+    vmovdqa64 %ymm1, %ymm26
+    vpternlogq \$0x96, %ymm11, %ymm6, %ymm26
+    vmovdqa64 %ymm2, %ymm27
+    vpternlogq \$0x96, %ymm12, %ymm7, %ymm27
+
+    vmovdqa64 %ymm3, %ymm28
+    vpternlogq \$0x96, %ymm13, %ymm8, %ymm28
+    vmovdqa64 %ymm4, %ymm29
+    vpternlogq \$0x96, %ymm14, %ymm9, %ymm29
+    vpternlogq \$0x96, %ymm20, %ymm15, %ymm25
+
+    vpternlogq \$0x96, %ymm21, %ymm16, %ymm26
+    vpternlogq \$0x96, %ymm22, %ymm17, %ymm27
+    vpternlogq \$0x96, %ymm23, %ymm18, %ymm28
+
+    # Start computing D values and keep computing column parity
+    # D[5] = [0, 0, 0, 0, 0]
+    # for x in 0 to 4:
+    #     D[x] = C[(x+4) mod 5] XOR ROTATE_LEFT(C[(x+1) mod 5], 1)
+
+    vprolq \$1, %ymm26, %ymm30
+    vprolq \$1, %ymm27, %ymm31
+    vpternlogq \$0x96, %ymm24, %ymm19, %ymm29
+
+    # Continue computing D values and apply Theta
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = state[x][y] XOR D[x]
+
+    vpternlogq \$0x96, %ymm30, %ymm29, %ymm0
+    vpternlogq \$0x96, %ymm30, %ymm29, %ymm10
+    vpternlogq \$0x96, %ymm30, %ymm29, %ymm20
+
+    vpternlogq \$0x96, %ymm30, %ymm29, %ymm5
+    vpternlogq \$0x96, %ymm30, %ymm29, %ymm15
+    vprolq \$1, %ymm28, %ymm30
+
+    vpternlogq \$0x96, %ymm31, %ymm25, %ymm6
+    vpternlogq \$0x96, %ymm31, %ymm25, %ymm16
+    vpternlogq \$0x96, %ymm31, %ymm25, %ymm1
+
+    vpternlogq \$0x96, %ymm31, %ymm25, %ymm11
+    vpternlogq \$0x96, %ymm31, %ymm25, %ymm21
+    vprolq \$1, %ymm29, %ymm31
+
+    vpbroadcastq ($tblptr), %ymm29 # Load the round constant into ymm29 (Iota)
+    add \$8, $tblptr # Increment the pointer to the next round constant
+
+    vpternlogq \$0x96, %ymm30, %ymm26, %ymm12
+    vpternlogq \$0x96, %ymm30, %ymm26, %ymm7
+    vpternlogq \$0x96, %ymm30, %ymm26, %ymm22
+
+    vpternlogq \$0x96, %ymm30, %ymm26, %ymm17
+    vpternlogq \$0x96, %ymm30, %ymm26, %ymm2
+    vprolq \$1, %ymm25, %ymm30
+
+    # Rho step
+    # Keep applying Theta and start Rho step
+    #
+    # ROTATION_OFFSETS[5][5] = [
+    #     [0, 1, 62, 28, 27],
+    #     [36, 44, 6, 55, 20],
+    #     [3, 10, 43, 25, 39],
+    #     [41, 45, 15, 21, 8],
+    #     [18, 2, 61, 56, 14] ]
+    #
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = ROTATE_LEFT(state[x][y], ROTATION_OFFSETS[x][y])
+
+    vpternlogq \$0x96, %ymm31, %ymm27, %ymm3
+    vpternlogq \$0x96, %ymm31, %ymm27, %ymm13
+    vpternlogq \$0x96, %ymm31, %ymm27, %ymm23
+
+    vprolq \$44, %ymm6, %ymm6
+    vpternlogq \$0x96, %ymm31, %ymm27, %ymm18
+    vpternlogq \$0x96, %ymm31, %ymm27, %ymm8
+
+    vprolq \$43, %ymm12, %ymm12
+    vprolq \$21, %ymm18, %ymm18
+    vpternlogq \$0x96, %ymm30, %ymm28, %ymm24
+
+    vprolq \$14, %ymm24, %ymm24
+    vprolq \$28, %ymm3, %ymm3
+    vpternlogq \$0x96, %ymm30, %ymm28, %ymm9
+
+    vprolq \$20, %ymm9, %ymm9
+    vprolq \$3, %ymm10, %ymm10
+    vpternlogq \$0x96, %ymm30, %ymm28, %ymm19
+
+    vprolq \$45, %ymm16, %ymm16
+    vprolq \$61, %ymm22, %ymm22
+    vpternlogq \$0x96, %ymm30, %ymm28, %ymm4
+
+    vprolq \$1, %ymm1, %ymm1
+    vprolq \$6, %ymm7, %ymm7
+    vpternlogq \$0x96, %ymm30, %ymm28, %ymm14
+
+    # Continue with Rho and start Pi and Chi steps at the same time
+    # Ternary logic 0xD2 is used for Chi step
+    #
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = state[x][y] XOR ((NOT state[(x+1) mod 5][y]) AND state[(x+2) mod 5][y])
+
+    vprolq \$25, %ymm13, %ymm13
+    vprolq \$8, %ymm19, %ymm19
+    vmovdqa64 %ymm0, %ymm30
+    vpternlogq \$0xD2, %ymm12, %ymm6, %ymm30
+
+    vprolq \$18, %ymm20, %ymm20
+    vprolq \$27, %ymm4, %ymm4
+    vpxorq %ymm29, %ymm30, %ymm30 # Iota step
+
+    vprolq \$36, %ymm5, %ymm5
+    vprolq \$10, %ymm11, %ymm11
+    vmovdqa64 %ymm6, %ymm31
+    vpternlogq \$0xD2, %ymm18, %ymm12, %ymm31
+
+    vprolq \$15, %ymm17, %ymm17
+    vprolq \$56, %ymm23, %ymm23
+    vpternlogq \$0xD2, %ymm24, %ymm18, %ymm12
+
+    vprolq \$62, %ymm2, %ymm2
+    vprolq \$55, %ymm8, %ymm8
+    vpternlogq \$0xD2, %ymm0, %ymm24, %ymm18
+
+    vprolq \$39, %ymm14, %ymm14
+    vprolq \$41, %ymm15, %ymm15
+    vpternlogq \$0xD2, %ymm6, %ymm0, %ymm24
+    vmovdqa64 %ymm30, %ymm0
+    vmovdqa64 %ymm31, %ymm6
+
+    vprolq \$2, %ymm21, %ymm21
+    vmovdqa64 %ymm3, %ymm30
+    vpternlogq \$0xD2, %ymm10, %ymm9, %ymm30
+    vmovdqa64 %ymm9, %ymm31
+    vpternlogq \$0xD2, %ymm16, %ymm10, %ymm31
+
+    vpternlogq \$0xD2, %ymm22, %ymm16, %ymm10
+    vpternlogq \$0xD2, %ymm3, %ymm22, %ymm16
+    vpternlogq \$0xD2, %ymm9, %ymm3, %ymm22
+    vmovdqa64 %ymm30, %ymm3
+    vmovdqa64 %ymm31, %ymm9
+
+    vmovdqa64 %ymm1, %ymm30
+    vpternlogq \$0xD2, %ymm13, %ymm7, %ymm30
+    vmovdqa64 %ymm7, %ymm31
+    vpternlogq \$0xD2, %ymm19, %ymm13, %ymm31
+    vpternlogq \$0xD2, %ymm20, %ymm19, %ymm13
+
+    vpternlogq \$0xD2, %ymm1, %ymm20, %ymm19
+    vpternlogq \$0xD2, %ymm7, %ymm1, %ymm20
+    vmovdqa64 %ymm30, %ymm1
+    vmovdqa64 %ymm31, %ymm7
+    vmovdqa64 %ymm4, %ymm30
+    vpternlogq \$0xD2, %ymm11, %ymm5, %ymm30
+
+    vmovdqa64 %ymm5, %ymm31
+    vpternlogq \$0xD2, %ymm17, %ymm11, %ymm31
+    vpternlogq \$0xD2, %ymm23, %ymm17, %ymm11
+    vpternlogq \$0xD2, %ymm4, %ymm23, %ymm17
+
+    vpternlogq \$0xD2, %ymm5, %ymm4, %ymm23
+    vmovdqa64 %ymm30, %ymm4
+    vmovdqa64 %ymm31, %ymm5
+    vmovdqa64 %ymm2, %ymm30
+    vpternlogq \$0xD2, %ymm14, %ymm8, %ymm30
+    vmovdqa64 %ymm8, %ymm31
+    vpternlogq \$0xD2, %ymm15, %ymm14, %ymm31
+
+    vpternlogq \$0xD2, %ymm21, %ymm15, %ymm14
+    vpternlogq \$0xD2, %ymm2, %ymm21, %ymm15
+    vpternlogq \$0xD2, %ymm8, %ymm2, %ymm21
+    vmovdqa64 %ymm30, %ymm2
+    vmovdqa64 %ymm31, %ymm8
+
+    # Complete the steps and get updated state registers in ymm0 to ymm24
+    # (one chain of register-to-register moves; ymm30 holds the value of
+    # ymm3 until it is finally placed into ymm5)
+    vmovdqa64 %ymm3, %ymm30
+    vmovdqa64 %ymm18, %ymm3
+    vmovdqa64 %ymm17, %ymm18
+    vmovdqa64 %ymm11, %ymm17
+    vmovdqa64 %ymm7, %ymm11
+    vmovdqa64 %ymm10, %ymm7
+    vmovdqa64 %ymm1, %ymm10
+    vmovdqa64 %ymm6, %ymm1
+    vmovdqa64 %ymm9, %ymm6
+    vmovdqa64 %ymm22, %ymm9
+    vmovdqa64 %ymm14, %ymm22
+    vmovdqa64 %ymm20, %ymm14
+    vmovdqa64 %ymm2, %ymm20
+    vmovdqa64 %ymm12, %ymm2
+    vmovdqa64 %ymm13, %ymm12
+    vmovdqa64 %ymm19, %ymm13
+    vmovdqa64 %ymm23, %ymm19
+    vmovdqa64 %ymm15, %ymm23
+    vmovdqa64 %ymm4, %ymm15
+    vmovdqa64 %ymm24, %ymm4
+    vmovdqa64 %ymm21, %ymm24
+    vmovdqa64 %ymm8, %ymm21
+    vmovdqa64 %ymm16, %ymm8
+    vmovdqa64 %ymm5, %ymm16
+    vmovdqa64 %ymm30, %ymm5
+
+    dec $roundn # Decrement the round counter
+    jnz .Lkeccak_rnd_loop # Jump to the start of the loop if r13d is not zero
+    ret
+.cfi_endproc
+.size keccak_1600_permute,.-keccak_1600_permute
+
+# Initialize YMM registers 0-24 to zero
+# Takes no arguments and touches no memory.
+.globl keccak_1600_init_state
+.type keccak_1600_init_state,\@abi-omnipotent
+.align 32
+keccak_1600_init_state:
+.cfi_startproc
+    vpxorq %ymm0, %ymm0, %ymm0
+    vmovdqa64 %ymm0, %ymm1
+    vmovdqa64 %ymm0, %ymm2
+    vmovdqa64 %ymm0, %ymm3
+    vmovdqa64 %ymm0, %ymm4
+    vmovdqa64 %ymm0, %ymm5
+    vmovdqa64 %ymm0, %ymm6
+    vmovdqa64 %ymm0, %ymm7
+    vmovdqa64 %ymm0, %ymm8
+    vmovdqa64 %ymm0, %ymm9
+    vmovdqa64 %ymm0, %ymm10
+    vmovdqa64 %ymm0, %ymm11
+    vmovdqa64 %ymm0, %ymm12
+    vmovdqa64 %ymm0, %ymm13
+    vmovdqa64 %ymm0, %ymm14
+    vmovdqa64 %ymm0, %ymm15
+    vmovdqa64 %ymm0, %ymm16
+    vmovdqa64 %ymm0, %ymm17
+    vmovdqa64 %ymm0, %ymm18
+    vmovdqa64 %ymm0, %ymm19
+    vmovdqa64 %ymm0, %ymm20
+    vmovdqa64 %ymm0, %ymm21
+    vmovdqa64 %ymm0, %ymm22
+    vmovdqa64 %ymm0, %ymm23
+    vmovdqa64 %ymm0, %ymm24
+    ret
+.cfi_endproc
+.size keccak_1600_init_state,.-keccak_1600_init_state
+
+# Load ymm0-ymm24 from 25 consecutive 32-byte slots starting at the address
+# in the first argument register (unaligned loads).
+.globl keccak_1600_load_state_x4
+.type keccak_1600_load_state_x4,\@abi-omnipotent
+.align 32
+keccak_1600_load_state_x4:
+.cfi_startproc
+    vmovdqu64 32*0($arg1), %ymm0
+    vmovdqu64 32*1($arg1), %ymm1
+    vmovdqu64 32*2($arg1), %ymm2
+    vmovdqu64 32*3($arg1),
%ymm3
+    vmovdqu64 32*4($arg1), %ymm4
+    vmovdqu64 32*5($arg1), %ymm5
+    vmovdqu64 32*6($arg1), %ymm6
+    vmovdqu64 32*7($arg1), %ymm7
+    vmovdqu64 32*8($arg1), %ymm8
+    vmovdqu64 32*9($arg1), %ymm9
+    vmovdqu64 32*10($arg1), %ymm10
+    vmovdqu64 32*11($arg1), %ymm11
+    vmovdqu64 32*12($arg1), %ymm12
+    vmovdqu64 32*13($arg1), %ymm13
+    vmovdqu64 32*14($arg1), %ymm14
+    vmovdqu64 32*15($arg1), %ymm15
+    vmovdqu64 32*16($arg1), %ymm16
+    vmovdqu64 32*17($arg1), %ymm17
+    vmovdqu64 32*18($arg1), %ymm18
+    vmovdqu64 32*19($arg1), %ymm19
+    vmovdqu64 32*20($arg1), %ymm20
+    vmovdqu64 32*21($arg1), %ymm21
+    vmovdqu64 32*22($arg1), %ymm22
+    vmovdqu64 32*23($arg1), %ymm23
+    vmovdqu64 32*24($arg1), %ymm24
+    ret
+.cfi_endproc
+.size keccak_1600_load_state_x4,.-keccak_1600_load_state_x4
+
+
+# Store ymm0-ymm24 into 25 consecutive 32-byte slots starting at the address
+# in the first argument register (unaligned stores). The qword at offset
+# 8*100 that follows the 800 state bytes is not written here.
+.globl keccak_1600_save_state_x4
+.type keccak_1600_save_state_x4,\@abi-omnipotent
+.align 32
+keccak_1600_save_state_x4:
+.cfi_startproc
+    vmovdqu64 %ymm0, 32*0($arg1)
+    vmovdqu64 %ymm1, 32*1($arg1)
+    vmovdqu64 %ymm2, 32*2($arg1)
+    vmovdqu64 %ymm3, 32*3($arg1)
+    vmovdqu64 %ymm4, 32*4($arg1)
+    vmovdqu64 %ymm5, 32*5($arg1)
+    vmovdqu64 %ymm6, 32*6($arg1)
+    vmovdqu64 %ymm7, 32*7($arg1)
+    vmovdqu64 %ymm8, 32*8($arg1)
+    vmovdqu64 %ymm9, 32*9($arg1)
+    vmovdqu64 %ymm10, 32*10($arg1)
+    vmovdqu64 %ymm11, 32*11($arg1)
+    vmovdqu64 %ymm12, 32*12($arg1)
+    vmovdqu64 %ymm13, 32*13($arg1)
+    vmovdqu64 %ymm14, 32*14($arg1)
+    vmovdqu64 %ymm15, 32*15($arg1)
+    vmovdqu64 %ymm16, 32*16($arg1)
+    vmovdqu64 %ymm17, 32*17($arg1)
+    vmovdqu64 %ymm18, 32*18($arg1)
+    vmovdqu64 %ymm19, 32*19($arg1)
+    vmovdqu64 %ymm20, 32*20($arg1)
+    vmovdqu64 %ymm21, 32*21($arg1)
+    vmovdqu64 %ymm22, 32*22($arg1)
+    vmovdqu64 %ymm23, 32*23($arg1)
+    vmovdqu64 %ymm24, 32*24($arg1)
+    ret
+.cfi_endproc
+.size keccak_1600_save_state_x4,.-keccak_1600_save_state_x4
+
+
+# Add input data to state when message length is less than rate
+# The data is XORed into the in-memory state starting at the byte offset
+# read from s[100] (the qword at 8*100 from the state pointer). s[100]
+# itself is not modified here; the caller keeps it up to date.
+# Arguments:
+# r10: state pointer to absorb into (clobbered)
+# arg2 (rsi): message pointer lane 0 (updated on output)
+# arg3 (rdx): message pointer lane 1 (updated on output)
+# arg4 (rcx): message pointer lane 2 (updated on output)
+# arg5 (r8): message pointer lane 3 (updated on output)
+# r12: length in bytes (clobbered on output)
+# Clobbers: r9, rbx, r15, k1, ymm31-ymm29
+.globl keccak_1600_partial_add_x4
+.type keccak_1600_partial_add_x4,\@abi-omnipotent
+.align 32
+keccak_1600_partial_add_x4:
+.cfi_startproc
+    mov 8*100(%r10), %r9
+    test \$7, %r9d
+    jz .Lstart_aligned_to_4x8
+
+    # Start offset is not aligned to register size
+    mov %r9, %r15 # %r15 = s[100]
+
+    and \$7, %r9d
+    neg %r9d
+    add \$8, %r9d # register capacity = 8 - (offset % 8)
+    cmp %r9d, %r12d
+    cmovnae %r12d, %r9d # %r9d = min(register capacity, length)
+
+    lea byte_kmask_0_to_7(%rip), %rbx
+    kmovb (%rbx,%r9), %k1 # message load mask
+
+    # Offset rounded down to a multiple of 8, scaled by 4 because each
+    # 8-byte state word occupies 32 bytes (one qword per lane).
+    mov %r15, %rbx
+    and \$~7, %ebx
+    lea (%r10,%rbx,4), %r10 # get to state starting register
+
+    mov %r15, %rbx
+    and \$7, %ebx
+
+    vmovdqu8 (%r10), %ymm31 # load & store / allocate SB for the register
+    vmovdqu8 %ymm31, (%r10)
+
+    vmovdqu8 ($arg2), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 0
+    vmovdqu8 8*0(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 0
+    vpxorq %xmm30, %xmm31, %xmm31
+    vmovdqu8 %xmm31, 8*0(%r10,%rbx){%k1} # Write 1 to 7 bytes to state reg lane 0
+
+    vmovdqu8 ($arg3), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 1
+    vmovdqu8 8*1(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 1
+    vpxorq %xmm30, %xmm31, %xmm31
+    vmovdqu8 %xmm31, 8*1(%r10,%rbx){%k1} # Write 1 to 7 bytes to state reg lane 1
+
+    vmovdqu8 ($arg4), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 2
+    vmovdqu8 8*2(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 2
+    vpxorq %xmm30, %xmm31, %xmm31
+    vmovdqu8 %xmm31, 8*2(%r10,%rbx){%k1} # Write 1 to 7 bytes to state reg lane 2
+
+    vmovdqu8 ($arg5), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 3
+    vmovdqu8 8*3(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 3
+    vpxorq %xmm30, %xmm31, %xmm31
+    vmovdqu8 %xmm31, 8*3(%r10,%rbx){%k1} # Write 1 to 7 bytes to state reg lane 3
+
+    sub %r9, %r12
+    jz .Lzero_bytes
+
+    # More input left: step past the consumed bytes and continue with the
+    # next 32-byte state row, using r9 as the running message offset.
+    add %r9, $arg2
+    add %r9, $arg3
+    add %r9, $arg4
+    add %r9, $arg5
+    add \$32, %r10
+    xor %r9, %r9
+    jmp .Lymm_loop
+
+.Lstart_aligned_to_4x8:
+    lea (%r10,%r9,4), %r10
+    xor %r9, %r9
+
+.align 32
+.Lymm_loop:
+    # Absorb 8 bytes per lane per iteration
+    cmp \$8, %r12d
+    jb .Llt_8_bytes
+
+    vmovq ($arg2,%r9), %xmm31 # Read 8 bytes from lane 0
+    vpinsrq \$1, ($arg3,%r9), %xmm31, %xmm31 # Read 8 bytes from lane 1
+    vmovq ($arg4,%r9), %xmm30 # Read 8 bytes from lane 2
+    vpinsrq \$1, ($arg5,%r9),%xmm30, %xmm30 # Read 8 bytes from lane 3
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+    vpxorq (%r10,%r9,4), %ymm31, %ymm31 # Add data with the state
+    vmovdqu64 %ymm31, (%r10,%r9,4)
+    add \$8, %r9
+    sub \$8, %r12
+    jz .Lzero_bytes
+
+    jmp .Lymm_loop
+
+.align 32
+.Lzero_bytes:
+    # Nothing left: advance the message pointers by the bytes consumed
+    add %r9, $arg2
+    add %r9, $arg3
+    add %r9, $arg4
+    add %r9, $arg5
+    ret
+
+.align 32
+.Llt_8_bytes:
+    # Fewer than 8 bytes left per lane: masked loads of the tail
+    add %r9, $arg2
+    add %r9, $arg3
+    add %r9, $arg4
+    add %r9, $arg5
+    lea (%r10,%r9,4), %r10
+
+    lea byte_kmask_0_to_7(%rip), %rbx
+    kmovb (%rbx,%r12), %k1 # message load mask
+
+    vmovdqu8 ($arg2), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 0
+    vmovdqu8 ($arg3), %xmm30{%k1}{z} # Read 1 to 7 bytes from lane 1
+    vpunpcklqdq %xmm30, %xmm31, %xmm31 # Interleave data from lane 0 and lane 1
+    vmovdqu8 ($arg4), %xmm30{%k1}{z} # Read 1 to 7 bytes from lane 2
+    vmovdqu8 ($arg5), %xmm29{%k1}{z} # Read 1 to 7 bytes from lane 3
+    vpunpcklqdq %xmm29, %xmm30, %xmm30 # Interleave data from lane 2 and lane 3
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+
+    vpxorq (%r10), %ymm31, %ymm31 # Add data to the state
+    vmovdqu64 %ymm31, (%r10) # Update state in memory
+
+    add %r12, $arg2 # increment message pointer lane 0
+    add %r12, $arg3 # increment message pointer lane 1
+    add %r12, $arg4 # increment message pointer lane 2
+    add %r12, $arg5 # increment message pointer lane 3
+    ret
+.cfi_endproc
+.size
keccak_1600_partial_add_x4,.-keccak_1600_partial_add_x4
+
+
+# Extract bytes from state and write to outputs
+# Arguments:
+# r10: state pointer to start extracting from (clobbered)
+# arg1 (rdi): output pointer lane 0 (updated on output)
+# arg2 (rsi): output pointer lane 1 (updated on output)
+# arg3 (rdx): output pointer lane 2 (updated on output)
+# arg4 (rcx): output pointer lane 3 (updated on output)
+# r12: length in bytes (clobbered on output)
+# r11: state offset to start extract from
+# Clobbers: r9, rbx, k1, xmm30, xmm31
+.globl keccak_1600_extract_bytes_x4
+.type keccak_1600_extract_bytes_x4,\@abi-omnipotent
+.align 32
+keccak_1600_extract_bytes_x4:
+.cfi_startproc
+    or %r12, %r12
+    jz .Lextract_zero_bytes
+
+    test \$7, %r11d
+    jz .Lextract_start_aligned_to_4x8
+
+    # Extract offset is not aligned to the register size (8 bytes)
+    mov %r11, %r9
+
+    and \$7, %r9d
+    neg %r9d
+    add \$8, %r9d # register capacity = 8 - (offset % 8)
+    cmp %r9d, %r12d
+    cmovnae %r12d, %r9d # %r9d = min(register capacity, length)
+
+    lea byte_kmask_0_to_7(%rip), %rbx
+    kmovb (%rbx,%r9), %k1 # message store mask
+
+    # Offset rounded down to a multiple of 8, scaled by 4 because each
+    # 8-byte state word occupies 32 bytes (one qword per lane).
+    mov %r11, %rbx
+    and \$~7, %ebx
+    lea (%r10,%rbx,4), %r10 # get to state starting register
+
+    mov %r11, %rbx
+    and \$7, %ebx
+
+    vmovdqu8 8*0(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 0
+    vmovdqu8 %xmm31, ($arg1){%k1} # Write 1-7 bytes to lane 0 output
+
+    vmovdqu8 8*1(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 1
+    vmovdqu8 %xmm31, ($arg2){%k1} # Write 1-7 bytes to lane 1 output
+
+    vmovdqu8 8*2(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 2
+    vmovdqu8 %xmm31, ($arg3){%k1} # Write 1-7 bytes to lane 2 output
+
+    vmovdqu8 8*3(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 3
+    vmovdqu8 %xmm31, ($arg4){%k1} # Write 1-7 bytes to lane 3 output
+
+    # Increment output registers
+    add %r9, $arg1
+    add %r9, $arg2
+    add %r9, $arg3
+    add %r9, $arg4
+
+    # Decrement length to extract
+    sub %r9, %r12
+    jz .Lextract_zero_bytes
+
+    # More data to extract, update state register pointer
+    add \$32, %r10
+    xor %r9, %r9
+    jmp .Lextract_ymm_loop
+
+.Lextract_start_aligned_to_4x8:
+    lea (%r10,%r11,4), %r10
+    xor %r9, %r9
+
+.align 32
+.Lextract_ymm_loop:
+    # Emit 8 bytes per lane per iteration; r9 is the running output offset
+    cmp \$8, %r12
+    jb .Lextract_lt_8_bytes
+
+    vmovdqu64 (%r10), %xmm31
+    vmovdqu64 16(%r10), %xmm30
+    vmovq %xmm31, ($arg1,%r9)
+    vpextrq \$1, %xmm31, ($arg2,%r9)
+    vmovq %xmm30, ($arg3,%r9)
+    vpextrq \$1, %xmm30, ($arg4,%r9)
+    add \$8, %r9
+    sub \$8, %r12
+    jz .Lzero_bytes_left
+
+    add \$32, %r10
+    jmp .Lextract_ymm_loop
+
+.align 32
+.Lzero_bytes_left:
+    # Increment output pointers
+    add %r9, $arg1
+    add %r9, $arg2
+    add %r9, $arg3
+    add %r9, $arg4
+.Lextract_zero_bytes:
+    ret
+
+.align 32
+.Lextract_lt_8_bytes:
+    add %r9, $arg1
+    add %r9, $arg2
+    add %r9, $arg3
+    add %r9, $arg4
+
+    lea byte_kmask_0_to_7(%rip), %r9
+    kmovb (%r9,%r12), %k1 # k1 is the mask of output bytes to write
+
+    vmovq 0*8(%r10), %xmm31 # Read 8 bytes from state lane 0
+    vmovdqu8 %xmm31, ($arg1){%k1} # Extract 1-7 bytes into output 0
+    vmovq 1*8(%r10), %xmm31 # Read 8 bytes from state lane 1
+    vmovdqu8 %xmm31, ($arg2){%k1} # Extract 1-7 bytes into output 1
+    vmovq 2*8(%r10), %xmm31 # Read 8 bytes from state lane 2
+    vmovdqu8 %xmm31, ($arg3){%k1} # Extract 1-7 bytes into output 2
+    vmovq 3*8(%r10), %xmm31 # Read 8 bytes from state lane 3
+    vmovdqu8 %xmm31, ($arg4){%k1} # Extract 1-7 bytes into output 3
+
+    # Increment output pointers
+    add %r12, $arg1
+    add %r12, $arg2
+    add %r12, $arg3
+    add %r12, $arg4
+    ret
+.cfi_endproc
+.size keccak_1600_extract_bytes_x4,.-keccak_1600_extract_bytes_x4
+
+
+# SHAKE128 x4 multi-buffer functions
+# These functions process 4 independent SHAKE128 streams in parallel using AVX-512VL
+# State layout: 25 ymm-sized rows of 32 bytes each (one qword per lane, i.e.
+# four interleaved 200-byte states) + 1 qword = 808 bytes per context
+# Rate: 168 bytes for SHAKE128
+
+# SHA3_shake128_x4_avx512vl
+# One-shot SHAKE-128 x4 function: init + absorb + finalize + squeeze
+# Arguments:
+# arg1 (rdi): pointer to output lane 0
+#
arg2 (rsi): pointer to output lane 1
+# arg3 (rdx): pointer to output lane 2
+# arg4 (rcx): pointer to output lane 3
+# arg5 (r8): output length in bytes (must be same for all lanes)
+# arg6 (r9): pointer to input lane 0
+# [stack+0]: pointer to input lane 1
+# [stack+8]: pointer to input lane 2
+# [stack+16]: pointer to input lane 3
+# [stack+24]: input length in bytes (must be same for all lanes)
+# Returns: void
+# Frame: rbp is the frame pointer used to reach the stack arguments, rbx is
+# the base of the local frame (saved arguments, state pointer, x4 state).
+# The local x4 state is wiped before returning.
+.globl SHA3_shake128_x4_avx512vl
+.type SHA3_shake128_x4_avx512vl,\@function,10
+.align 32
+SHA3_shake128_x4_avx512vl:
+.cfi_startproc
+    push %rbp
+.cfi_push %rbp
+    mov %rsp, %rbp
+    push %rbx
+.cfi_push %rbx
+___
+$code .= <<___ if ($win64);
+    sub \$160, %rsp
+    vmovups %xmm6, 0(%rsp)
+    vmovups %xmm7, 16(%rsp)
+    vmovups %xmm8, 32(%rsp)
+    vmovups %xmm9, 48(%rsp)
+    vmovups %xmm10, 64(%rsp)
+    vmovups %xmm11, 80(%rsp)
+    vmovups %xmm12, 96(%rsp)
+    vmovups %xmm13, 112(%rsp)
+    vmovups %xmm14, 128(%rsp)
+    vmovups %xmm15, 144(%rsp)
+___
+$code.=<<___;
+
+    sub \$$sf_size, %rsp
+.cfi_adjust_cfa_offset $sf_size
+    mov %rsp, %rbx
+
+.Lshake128_x4_body:
+    # Save the output pointers and output length for the squeeze call
+    mov $arg1, $sf_arg1(%rbx)
+    mov $arg2, $sf_arg2(%rbx)
+    mov $arg3, $sf_arg3(%rbx)
+    mov $arg4, $sf_arg4(%rbx)
+    mov $arg5, $sf_arg5(%rbx)
+
+    lea $sf_state_x4(%rbx), $arg1 # start of x4 state on the stack frame
+    mov $arg1, $sf_state_ptr(%rbx)
+
+    # Initialize the state array to zero
+    call keccak_1600_init_state
+
+    call keccak_1600_save_state_x4
+
+    movq \$0, 8*100($arg1) # clear s[100]
+
+    # absorb: state pointer, four input pointers, input length
+    mov $sf_state_ptr(%rbx), $arg1
+    mov $arg6, $arg2
+___
+$code .= <<___ if ($win64);
+    # xlate prologue handles up to six arguments. For one-shot x4 wrappers
+    # (10 args), the remaining four stay in Win64 stack slots.
+    mov 64(%rbp), $arg3 # arg7 from stack
+    mov 72(%rbp), $arg4 # arg8 from stack
+    mov 80(%rbp), $arg5 # arg9 from stack
+    mov 88(%rbp), $arg6 # arg10 from stack
+___
+$code .= <<___ if (!$win64);
+    mov 16(%rbp), $arg3 # arg7 from stack
+    mov 24(%rbp), $arg4 # arg8 from stack
+    mov 32(%rbp), $arg5 # arg9 from stack
+    mov 40(%rbp), $arg6 # arg10 from stack
+___
+$code.=<<___;
+    # Internal entry avoids Win64 xlate prologue argument remapping.
+___
+$code .= call_internal("SHA3_shake128_x4_inc_absorb_avx512vl_internal");
+$code.=<<___;
+
+    mov $sf_state_ptr(%rbx), $arg1
+    call .L_SHA3_shake128_x4_inc_finalize_avx512vl
+
+    # squeeze
+    mov $sf_arg1(%rbx), $arg1
+    mov $sf_arg2(%rbx), $arg2
+    mov $sf_arg3(%rbx), $arg3
+    mov $sf_arg4(%rbx), $arg4
+    mov $sf_arg5(%rbx), $arg5
+    mov $sf_state_ptr(%rbx), $arg6
+___
+$code .= call_internal("SHA3_shake128_x4_inc_squeeze_avx512vl_internal");
+$code.=<<___;
+
+    # Clear the temporary buffer
+    # (25 rows of 32 bytes plus the trailing s[100] qword)
+    lea $sf_state_x4(%rbx), %r9
+    vpxorq %ymm31, %ymm31, %ymm31
+    vmovdqu64 %ymm31, 32*0(%r9)
+    vmovdqu64 %ymm31, 32*1(%r9)
+    vmovdqu64 %ymm31, 32*2(%r9)
+    vmovdqu64 %ymm31, 32*3(%r9)
+    vmovdqu64 %ymm31, 32*4(%r9)
+    vmovdqu64 %ymm31, 32*5(%r9)
+    vmovdqu64 %ymm31, 32*6(%r9)
+    vmovdqu64 %ymm31, 32*7(%r9)
+    vmovdqu64 %ymm31, 32*8(%r9)
+    vmovdqu64 %ymm31, 32*9(%r9)
+    vmovdqu64 %ymm31, 32*10(%r9)
+    vmovdqu64 %ymm31, 32*11(%r9)
+    vmovdqu64 %ymm31, 32*12(%r9)
+    vmovdqu64 %ymm31, 32*13(%r9)
+    vmovdqu64 %ymm31, 32*14(%r9)
+    vmovdqu64 %ymm31, 32*15(%r9)
+    vmovdqu64 %ymm31, 32*16(%r9)
+    vmovdqu64 %ymm31, 32*17(%r9)
+    vmovdqu64 %ymm31, 32*18(%r9)
+    vmovdqu64 %ymm31, 32*19(%r9)
+    vmovdqu64 %ymm31, 32*20(%r9)
+    vmovdqu64 %ymm31, 32*21(%r9)
+    vmovdqu64 %ymm31, 32*22(%r9)
+    vmovdqu64 %ymm31, 32*23(%r9)
+    vmovdqu64 %ymm31, 32*24(%r9)
+    vmovq %xmm31, 32*25(%r9)
+
+.Lshake128_x4_epilogue:
+___
+$code .= <<___ if ($win64);
+    vmovups $sf_size+0(%rsp), %xmm6
+    vmovups $sf_size+16(%rsp), %xmm7
+    vmovups $sf_size+32(%rsp), %xmm8
+    vmovups $sf_size+48(%rsp), %xmm9
+    vmovups $sf_size+64(%rsp), %xmm10
+    vmovups $sf_size+80(%rsp), %xmm11
+    vmovups $sf_size+96(%rsp), %xmm12
+    vmovups $sf_size+112(%rsp), %xmm13
+    vmovups $sf_size+128(%rsp), %xmm14
+    vmovups $sf_size+144(%rsp), %xmm15
+    add \$160, %rsp
+___
+$code.=<<___;
+    add \$$sf_size, %rsp
+.cfi_adjust_cfa_offset -$sf_size
+    pop %rbx
+.cfi_pop %rbx
+    pop %rbp
+.cfi_pop %rbp
+    ret
+.cfi_endproc
+.size SHA3_shake128_x4_avx512vl,.-SHA3_shake128_x4_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for absorb entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+.type SHA3_shake128_x4_inc_absorb_avx512vl_internal,\@abi-omnipotent
+.align 32
+.LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+    mov %rsp, %rax
+    mov $arg1, 8(%rsp)
+    mov $arg2, 16(%rsp)
+    jmp .L_SHA3_shake128_x4_inc_absorb_avx512vl
+.LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+.size SHA3_shake128_x4_inc_absorb_avx512vl_internal,.-SHA3_shake128_x4_inc_absorb_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake128_x4_inc_absorb_avx512vl
+# Absorb input data into 4 parallel SHAKE128 states
+# Arguments:
+# arg1 (rdi): pointer to state context (808 bytes)
+# arg2 (rsi): pointer to lane 0 input data
+# arg3 (rdx): pointer to lane 1 input data
+# arg4 (rcx): pointer to lane 2 input data
+# arg5 (r8): pointer to lane 3 input data
+# arg6 (r9): input length in bytes (must be same for all lanes)
+# Returns: void
+# Note: Input is XORed into state and Keccak permutation is applied for each rate-sized block
+.globl SHA3_shake128_x4_inc_absorb_avx512vl
+.type SHA3_shake128_x4_inc_absorb_avx512vl,\@function,6
+.align 32
+SHA3_shake128_x4_inc_absorb_avx512vl:
+.L_SHA3_shake128_x4_inc_absorb_avx512vl:
+.cfi_startproc
+    push %rbp
+.cfi_push %rbp
+    push %rbx
+.cfi_push %rbx
+    push %r12
+.cfi_push %r12
+    push
%r13
+.cfi_push %r13
+    push %r14
+.cfi_push %r14
+    push %r15
+.cfi_push %r15
+___
+$code .= <<___ if ($win64);
+    sub \$160, %rsp
+    vmovups %xmm6, 0(%rsp)
+    vmovups %xmm7, 16(%rsp)
+    vmovups %xmm8, 32(%rsp)
+    vmovups %xmm9, 48(%rsp)
+    vmovups %xmm10, 64(%rsp)
+    vmovups %xmm11, 80(%rsp)
+    vmovups %xmm12, 96(%rsp)
+    vmovups %xmm13, 112(%rsp)
+    vmovups %xmm14, 128(%rsp)
+    vmovups %xmm15, 144(%rsp)
+___
+$code.=<<___;
+
+.Lshake128_absorb_body:
+    # check for partially processed block
+    mov 8*100($arg1), %r14
+    or %r14, %r14 # s[100] == 0?
+    je .Lshake128_absorb_main_loop_start
+
+    # process remaining bytes if message long enough
+    mov \$168, %r12 # SHAKE128_RATE = 168
+    sub %r14, %r12 # %r12 = capacity
+
+    cmp %r12, $arg6 # if mlen <= capacity then no permute
+    jbe .Lshake128_absorb_skip_permute
+
+    # The input fills the pending block: top it up, permute, then go on
+    # with the whole-block loop for what is left.
+    sub %r12, $arg6
+    mov $arg6, %r11 # preserve remaining length across helper calls
+
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov $arg1, %r10 # %r10 = state
+    call keccak_1600_partial_add_x4 # arg2-arg5 are updated
+
+    call keccak_1600_load_state_x4
+
+    call keccak_1600_permute
+
+    movq \$0, 8*100($arg1) # clear s[100]
+    jmp .Lshake128_absorb_partial_block_done
+
+.Lshake128_absorb_skip_permute:
+    # The whole input fits into the pending block
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov $arg1, %r10
+    mov $arg6, %r12
+    mov $arg6, %r11 # preserve input length across helper call
+    call keccak_1600_partial_add_x4
+
+    lea (%r11,%r14), %r15
+    mov %r15, 8*100($arg1) # s[100] += inlen
+
+    cmp \$168, %r15 # check s[100] below SHAKE128_RATE
+    jb .Lshake128_absorb_exit
+
+    # The block became exactly full: permute it in place
+    call keccak_1600_load_state_x4
+
+    call keccak_1600_permute
+
+    call keccak_1600_save_state_x4
+
+    movq \$0, 8*100($arg1) # clear s[100]
+    jmp .Lshake128_absorb_exit
+
+.Lshake128_absorb_main_loop_start:
+    call keccak_1600_load_state_x4
+    mov $arg6, %r11 # full input length when no prior partial block
+
+.Lshake128_absorb_partial_block_done:
+    xor %r12, %r12 # zero message offset
+
+    # Process the input message in blocks
+    # (state lives in ymm0-ymm24, r11 = bytes left, r12 = message offset)
+.align 32
+.Lshake128_absorb_while_loop:
+    cmp \$168, %r11 # compare mlen to SHAKE128_RATE
+    jb .Lshake128_absorb_while_loop_done
+
+    # Inline absorb_bytes_x4 for SHAKE128_RATE (168 bytes = 21 ymm registers)
+___
+
+# Generate absorb code for SHAKE128 rate (168 bytes)
+# Each iteration gathers one qword from each of the four lanes into a ymm
+# register and XORs it into state register number $i.
+for (my $i = 0; $i < 21; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+    vmovq $offset($arg2,%r12), %xmm31
+    vpinsrq \$1, $offset($arg3,%r12), %xmm31, %xmm31
+    vmovq $offset($arg4,%r12), %xmm30
+    vpinsrq \$1, $offset($arg5,%r12), %xmm30, %xmm30
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+    vpxorq %ymm31, %ymm$i, %ymm$i
+___
+}
+
+$code.=<<___;
+    sub \$168, %r11 # Subtract the rate from the remaining length
+    add \$168, %r12 # Adjust offset to next block
+    call keccak_1600_permute # Perform the Keccak permutation
+
+    jmp .Lshake128_absorb_while_loop
+
+.align 32
+.Lshake128_absorb_while_loop_done:
+    call keccak_1600_save_state_x4
+
+    mov %r11, 8*100($arg1) # update s[100]
+    or %r11, %r11
+    jz .Lshake128_absorb_exit
+
+    # Trailing bytes: the helper reads its start offset from s[100], so it
+    # is zeroed for the call and set to the tail length afterwards.
+    movq \$0, 8*100($arg1) # clear s[100]
+
+    # r10/state, arg2-arg5/input, r12/length
+    mov $arg1, %r10
+    add %r12, $arg2
+    add %r12, $arg3
+    add %r12, $arg4
+    add %r12, $arg5
+    mov %r11, %r12
+    call keccak_1600_partial_add_x4
+
+    mov %r11, 8*100($arg1) # update s[100]
+
+.Lshake128_absorb_exit:
+    # Clear sensitive registers
+    vpxorq %xmm16, %xmm16, %xmm16
+    vmovdqa64 %ymm16, %ymm17
+    vmovdqa64 %ymm16, %ymm18
+    vmovdqa64 %ymm16, %ymm19
+    vmovdqa64 %ymm16, %ymm20
+    vmovdqa64 %ymm16, %ymm21
+    vmovdqa64 %ymm16, %ymm22
+    vmovdqa64 %ymm16, %ymm23
+    vmovdqa64 %ymm16, %ymm24
+    vmovdqa64 %ymm16, %ymm25
+    vmovdqa64 %ymm16, %ymm26
+    vmovdqa64 %ymm16, %ymm27
+    vmovdqa64 %ymm16, %ymm28
+    vmovdqa64 %ymm16, %ymm29
+    vmovdqa64 %ymm16, %ymm30
+    vmovdqa64 %ymm16, %ymm31
+.Lshake128_absorb_epilogue:
+    vzeroall
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp), %xmm6
+    vmovups 16(%rsp), %xmm7
+    vmovups 32(%rsp), %xmm8
+    vmovups 48(%rsp), %xmm9
+    vmovups 64(%rsp), %xmm10
+    vmovups 80(%rsp), %xmm11
+    vmovups 96(%rsp), %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop %r15
+    pop %r14
+.cfi_pop %r14
+    pop %r13
+.cfi_pop %r13
+    pop %r12
+.cfi_pop %r12
+    pop %rbx
+.cfi_pop %rbx
+    pop %rbp
+.cfi_pop %rbp
+    ret
+.cfi_endproc
+.size SHA3_shake128_x4_inc_absorb_avx512vl,.-SHA3_shake128_x4_inc_absorb_avx512vl
+
+
+# SHA3_shake128_x4_inc_finalize_avx512vl
+# Finalize absorption phase for 4 parallel SHAKE-128 states
+# Adds padding and terminator bytes and clears the absorb offset
+# Arguments:
+# arg1 (rdi): pointer to state context (808 bytes)
+# Returns: void
+# Note: After this call, state is ready for squeezing output
+# Uses r9, r10, r11, ymm30 and ymm31 as scratch; ymm31 is zeroed on exit.
+.globl SHA3_shake128_x4_inc_finalize_avx512vl
+.type SHA3_shake128_x4_inc_finalize_avx512vl,\@function,1
+.align 32
+SHA3_shake128_x4_inc_finalize_avx512vl:
+.L_SHA3_shake128_x4_inc_finalize_avx512vl:
+.cfi_startproc
+    mov 8*100($arg1), %r11 # load state offset from s[100]
+    mov %r11, %r10
+    and \$~7, %r10d # offset to the state register
+    and \$7, %r11d # offset within the register
+
+    # add EOM byte right after the message
+    # (the pad table is read at a negative byte displacement equal to the
+    # offset within the register)
+    vmovdqu32 ($arg1,%r10,4), %ymm31
+    lea shake_msg_pad_x4(%rip), %r9
+    sub %r11, %r9
+    vmovdqu32 (%r9), %ymm30
+    vpxorq %ymm30, %ymm31, %ymm31
+    vmovdqu32 %ymm31, ($arg1,%r10,4)
+
+    # add terminating byte at offset equal to rate - 1 (SHAKE128_RATE = 168)
+    vmovdqu32 640($arg1), %ymm31 # 168*4 - 32 = 672 - 32 = 640
+    vmovdqa32 shake_terminator_byte_x4(%rip), %ymm30
+    vpxorq %ymm30, %ymm31, %ymm31
+    vmovdqu32 %ymm31, 640($arg1)
+
+    movq \$0, 8*100($arg1) # clear s[100]
+    vpxorq %ymm31, %ymm31, %ymm31
+    ret
+.cfi_endproc
+.size SHA3_shake128_x4_inc_finalize_avx512vl,.-SHA3_shake128_x4_inc_finalize_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for squeeze entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+.type SHA3_shake128_x4_inc_squeeze_avx512vl_internal,\@abi-omnipotent
+.align 32
+.LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+    mov %rsp, %rax
+    mov $arg1, 8(%rsp)
+    mov $arg2, 16(%rsp)
+    jmp .L_SHA3_shake128_x4_inc_squeeze_avx512vl
+.LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+.size SHA3_shake128_x4_inc_squeeze_avx512vl_internal,.-SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake128_x4_inc_squeeze_avx512vl
+# Squeeze output from 4 parallel SHAKE128 states
+# Arguments:
+# arg1 (rdi): pointer to lane 0 output buffer
+# arg2 (rsi): pointer to lane 1 output buffer
+# arg3 (rdx): pointer to lane 2 output buffer
+# arg4 (rcx): pointer to lane 3 output buffer
+# arg5 (r8): output length in bytes (must be same for all lanes)
+# arg6 (r9): pointer to state context (808 bytes)
+# Returns: void
+# Note: Can be called multiple times to generate arbitrary-length output
+# While squeezing, s[100] holds the number of bytes of the current rate
+# block that have not been output yet (zero means a permutation is needed
+# before more output can be produced).
+.globl SHA3_shake128_x4_inc_squeeze_avx512vl
+.type SHA3_shake128_x4_inc_squeeze_avx512vl,\@function,6
+.align 32
+SHA3_shake128_x4_inc_squeeze_avx512vl:
+.L_SHA3_shake128_x4_inc_squeeze_avx512vl:
+.cfi_startproc
+    push %rbp
+.cfi_push %rbp
+    push %rbx
+.cfi_push %rbx
+    push %r12
+.cfi_push %r12
+    push %r13
+.cfi_push %r13
+    push %r14
+.cfi_push %r14
+    push %r15
+.cfi_push %r15
+___
+$code .= <<___ if ($win64);
+    sub \$160, %rsp
+    vmovups %xmm6, 0(%rsp)
+    vmovups %xmm7, 16(%rsp)
+    vmovups %xmm8, 32(%rsp)
+    vmovups %xmm9, 48(%rsp)
+    vmovups %xmm10, 64(%rsp)
+    vmovups %xmm11, 80(%rsp)
+    vmovups %xmm12, 96(%rsp)
+    vmovups %xmm13, 112(%rsp)
+    vmovups %xmm14, 128(%rsp)
+    vmovups %xmm15, 144(%rsp)
+___
+$code.=<<___;
+
+.Lshake128_squeeze_body:
+    or $arg5, $arg5
+    jz .Lshake128_squeeze_done
+
+    # check for partially processed block
+    mov 8*100($arg6), %r15 # s[100] = bytes left in the current block
+    or %r15, %r15
+    jnz .Lshake128_squeeze_no_init_permute
+
+    # Nothing pending: load the state and go straight to the block loop
+    mov $arg1, %r14
+    mov $arg6, $arg1
+    call keccak_1600_load_state_x4
+
+    mov %r14, $arg1
+
+    xor %rbp, %rbp
+    jmp .Lshake128_squeeze_loop
+
+.align 32
+.Lshake128_squeeze_no_init_permute:
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length = min(capacity, outlen), r11 - offset
+    mov $arg6, %r10
+    mov $arg6, %r14 # preserve state pointer across extract helper
+
+    mov %r15, %r12
+    cmp %r15, $arg5
+    cmovnae $arg5, %r12 # %r12 = min(capacity, outlen)
+
+    sub %r12, $arg5 # outlen -= length
+
+    mov \$168, %r11d # SHAKE128_RATE
+    sub %r15, %r11 # state offset
+
+    sub %r12, %r15 # capacity -= length
+    mov %r15, 8*100($arg6) # update s[100]
+
+    call keccak_1600_extract_bytes_x4
+    mov %r14, $arg6 # restore state pointer after helper clobbers
+
+    or %r15, %r15
+    jnz .Lshake128_squeeze_done # check s[100] not zero
+
+    mov $arg1, %r13 # preserve arg1
+    mov %r14, $arg1
+    call keccak_1600_load_state_x4
+
+    mov %r13, $arg1
+    xor %rbp, %rbp
+
+    # Whole-block loop: state in ymm0-ymm24, rbp = running output offset
+.align 32
+.Lshake128_squeeze_loop:
+    cmp \$168, $arg5 # outlen >= SHAKE128_RATE
+    jb .Lshake128_squeeze_final_extract
+
+    call keccak_1600_permute
+
+    # Extract SHAKE128 rate bytes (168 bytes = 21 x 8 bytes) inline
+___
+
+# Generate extract code for SHAKE128 rate (168 bytes = 21 ymm registers)
+# Each iteration scatters the four qwords of state register number $i to
+# the four output lanes.
+for (my $i = 0; $i < 21; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+    vextracti64x2 \$1, %ymm$i, %xmm31
+    vmovq %xmm$i, $offset($arg1,%rbp)
+    vpextrq \$1, %xmm$i, $offset($arg2,%rbp)
+    vmovq %xmm31, $offset($arg3,%rbp)
+    vpextrq \$1, %xmm31, $offset($arg4,%rbp)
+___
+}
+
+$code.=<<___;
+    add \$168, %rbp # dst offset += SHAKE128_RATE
+    sub \$168, $arg5 # outlen -= SHAKE128_RATE
+    jmp .Lshake128_squeeze_loop
+
+.align 32
+.Lshake128_squeeze_final_extract:
+    or $arg5, $arg5
+    jz .Lshake128_squeeze_no_end_permute
+
+    # update output pointers
+    add %rbp, $arg1
+    add %rbp, $arg2
+    add %rbp, $arg3
+    add %rbp, $arg4
+
+    mov \$168, %r15d # SHAKE128_RATE
+    sub $arg5, %r15
+    mov %r15, 8*100($arg6) # s[100] = capacity
+
+    call keccak_1600_permute
+
+    # Store the permuted state so the helper below and later squeeze calls
+    # can read it from memory
+    mov $arg1, %r14
+    mov $arg6, $arg1
+    call keccak_1600_save_state_x4
+
+    mov %r14, $arg1
+
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length, r11 - offset = 0
+    mov $arg6, %r10
+    mov $arg5, %r12
+    xor %r11, %r11
+    call keccak_1600_extract_bytes_x4
+
+    jmp .Lshake128_squeeze_done
+
+.Lshake128_squeeze_no_end_permute:
+    movq \$0, 8*100($arg6) # s[100] = 0
+    mov $arg6, $arg1
+    call keccak_1600_save_state_x4
+
+.Lshake128_squeeze_done:
+    # Clear sensitive registers
+    vpxorq %xmm16, %xmm16, %xmm16
+    vmovdqa64 %ymm16, %ymm17
+    vmovdqa64 %ymm16, %ymm18
+    vmovdqa64 %ymm16, %ymm19
+    vmovdqa64 %ymm16, %ymm20
+    vmovdqa64 %ymm16, %ymm21
+    vmovdqa64 %ymm16, %ymm22
+    vmovdqa64 %ymm16, %ymm23
+    vmovdqa64 %ymm16, %ymm24
+    vmovdqa64 %ymm16, %ymm25
+    vmovdqa64 %ymm16, %ymm26
+    vmovdqa64 %ymm16, %ymm27
+    vmovdqa64 %ymm16, %ymm28
+    vmovdqa64 %ymm16, %ymm29
+    vmovdqa64 %ymm16, %ymm30
+    vmovdqa64 %ymm16, %ymm31
+.Lshake128_squeeze_epilogue:
+    vzeroall
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp), %xmm6
+    vmovups 16(%rsp), %xmm7
+    vmovups 32(%rsp), %xmm8
+    vmovups 48(%rsp), %xmm9
+    vmovups 64(%rsp), %xmm10
+    vmovups 80(%rsp), %xmm11
+    vmovups 96(%rsp), %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop %r15
+    pop %r14
+.cfi_pop %r14
+    pop %r13
+.cfi_pop %r13
+    pop %r12
+.cfi_pop %r12
+    pop %rbx
+.cfi_pop %rbx
+    pop %rbp
+.cfi_pop %rbp
+    ret
+.cfi_endproc
+.size SHA3_shake128_x4_inc_squeeze_avx512vl,.-SHA3_shake128_x4_inc_squeeze_avx512vl
+
+
+# SHAKE256 x4 multi-buffer functions
+# These functions process 4 independent SHAKE256 streams in parallel using AVX-512VL
+# State layout: 25 ymm-sized rows of 32 bytes each (one qword per lane, i.e.
+# four interleaved 200-byte states) + 1 qword = 808 bytes per context
+# Rate: 136 bytes for SHAKE256
+
+# SHA3_shake256_x4_avx512vl
+# One-shot SHAKE-256 x4 function: init + absorb + finalize + squeeze
+# Arguments:
+#
arg1 (rdi): pointer to output lane 0 +# arg2 (rsi): pointer to output lane 1 +# arg3 (rdx): pointer to output lane 2 +# arg4 (rcx): pointer to output lane 3 +# arg5 (r8): output length in bytes (must be same for all lanes) +# arg6 (r9): pointer to input lane 0 +# [stack+0]: pointer to input lane 1 +# [stack+8]: pointer to input lane 2 +# [stack+16]: pointer to input lane 3 +# [stack+24]: input length in bytes (must be same for all lanes) +# Returns: void +.globl SHA3_shake256_x4_avx512vl +.type SHA3_shake256_x4_avx512vl,\@function,10 +.align 32 +SHA3_shake256_x4_avx512vl: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp, %rbp + push %rbx +.cfi_push %rbx +___ +$code .= <<___ if ($win64); + sub \$160, %rsp + vmovups %xmm6, 0(%rsp) + vmovups %xmm7, 16(%rsp) + vmovups %xmm8, 32(%rsp) + vmovups %xmm9, 48(%rsp) + vmovups %xmm10, 64(%rsp) + vmovups %xmm11, 80(%rsp) + vmovups %xmm12, 96(%rsp) + vmovups %xmm13, 112(%rsp) + vmovups %xmm14, 128(%rsp) + vmovups %xmm15, 144(%rsp) +___ +$code.=<<___; + + sub \$$sf_size, %rsp + mov %rsp, %rbx + +.Lshake256_x4_body: + mov $arg1, $sf_arg1(%rbx) + mov $arg2, $sf_arg2(%rbx) + mov $arg3, $sf_arg3(%rbx) + mov $arg4, $sf_arg4(%rbx) + mov $arg5, $sf_arg5(%rbx) + + lea $sf_state_x4(%rbx), $arg1 # start of x4 state on the stack frame + mov $arg1, $sf_state_ptr(%rbx) + + # Initialize the state array to zero + call keccak_1600_init_state + + call keccak_1600_save_state_x4 + + movq \$0, 8*100($arg1) # clear s[100] + + mov $sf_state_ptr(%rbx), $arg1 + mov $arg6, $arg2 +___ +$code .= <<___ if ($win64); + # xlate prologue handles up to six arguments. For one-shot x4 wrappers + # (10 args), the remaining four stay in Win64 stack slots. 
+ mov 64(%rbp), $arg3 # arg7 from stack + mov 72(%rbp), $arg4 # arg8 from stack + mov 80(%rbp), $arg5 # arg9 from stack + mov 88(%rbp), $arg6 # arg10 from stack +___ +$code .= <<___ if (!$win64); + mov 16(%rbp), $arg3 # arg7 from stack + mov 24(%rbp), $arg4 # arg8 from stack + mov 32(%rbp), $arg5 # arg9 from stack + mov 40(%rbp), $arg6 # arg10 from stack +___ +$code.=<<___; + # Internal entry avoids Win64 xlate prologue argument remapping. +___ +$code .= call_internal("SHA3_shake256_x4_inc_absorb_avx512vl_internal"); +$code.=<<___; + + mov $sf_state_ptr(%rbx), $arg1 + call .L_SHA3_shake256_x4_inc_finalize_avx512vl + + # squeeze + mov $sf_arg1(%rbx), $arg1 + mov $sf_arg2(%rbx), $arg2 + mov $sf_arg3(%rbx), $arg3 + mov $sf_arg4(%rbx), $arg4 + mov $sf_arg5(%rbx), $arg5 + mov $sf_state_ptr(%rbx), $arg6 +___ +$code .= call_internal("SHA3_shake256_x4_inc_squeeze_avx512vl_internal"); +$code.=<<___; + + # Clear the temporary buffer + lea $sf_state_x4(%rbx), %r9 + vpxorq %ymm31, %ymm31, %ymm31 + vmovdqu64 %ymm31, 32*0(%r9) + vmovdqu64 %ymm31, 32*1(%r9) + vmovdqu64 %ymm31, 32*2(%r9) + vmovdqu64 %ymm31, 32*3(%r9) + vmovdqu64 %ymm31, 32*4(%r9) + vmovdqu64 %ymm31, 32*5(%r9) + vmovdqu64 %ymm31, 32*6(%r9) + vmovdqu64 %ymm31, 32*7(%r9) + vmovdqu64 %ymm31, 32*8(%r9) + vmovdqu64 %ymm31, 32*9(%r9) + vmovdqu64 %ymm31, 32*10(%r9) + vmovdqu64 %ymm31, 32*11(%r9) + vmovdqu64 %ymm31, 32*12(%r9) + vmovdqu64 %ymm31, 32*13(%r9) + vmovdqu64 %ymm31, 32*14(%r9) + vmovdqu64 %ymm31, 32*15(%r9) + vmovdqu64 %ymm31, 32*16(%r9) + vmovdqu64 %ymm31, 32*17(%r9) + vmovdqu64 %ymm31, 32*18(%r9) + vmovdqu64 %ymm31, 32*19(%r9) + vmovdqu64 %ymm31, 32*20(%r9) + vmovdqu64 %ymm31, 32*21(%r9) + vmovdqu64 %ymm31, 32*22(%r9) + vmovdqu64 %ymm31, 32*23(%r9) + vmovdqu64 %ymm31, 32*24(%r9) + vmovq %xmm31, 32*25(%r9) + +.Lshake256_x4_epilogue: +___ +$code .= <<___ if ($win64); + vmovups $sf_size+0(%rsp), %xmm6 + vmovups $sf_size+16(%rsp), %xmm7 + vmovups $sf_size+32(%rsp), %xmm8 + vmovups $sf_size+48(%rsp), %xmm9 + 
vmovups $sf_size+64(%rsp), %xmm10 + vmovups $sf_size+80(%rsp), %xmm11 + vmovups $sf_size+96(%rsp), %xmm12 + vmovups $sf_size+112(%rsp), %xmm13 + vmovups $sf_size+128(%rsp), %xmm14 + vmovups $sf_size+144(%rsp), %xmm15 + add \$160, %rsp +___ +$code.=<<___; + add \$$sf_size, %rsp + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp + ret +.cfi_endproc +.size SHA3_shake256_x4_avx512vl,.-SHA3_shake256_x4_avx512vl + +___ + +$code .= <<___ if ($win64); +# Internal Win64 shim for absorb entry. It establishes xlate-compatible +# unwind state and then jumps to the function entry after the prologue. +# This is required for internal calls since the xlate ABI conversion +# is already done in the caller function. +.type SHA3_shake256_x4_inc_absorb_avx512vl_internal,\@abi-omnipotent +.align 32 +.LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl_internal: +SHA3_shake256_x4_inc_absorb_avx512vl_internal: + mov %rsp, %rax + mov $arg1, 8(%rsp) + mov $arg2, 16(%rsp) + jmp .L_SHA3_shake256_x4_inc_absorb_avx512vl +.LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl_internal: +.size SHA3_shake256_x4_inc_absorb_avx512vl_internal,.-SHA3_shake256_x4_inc_absorb_avx512vl_internal +___ +$code.=<<___; + +# SHA3_shake256_x4_inc_absorb_avx512vl +# Absorb input data into 4 parallel SHAKE256 states +# Arguments: +# arg1 (rdi): pointer to state context (808 bytes) +# arg2 (rsi): pointer to lane 0 input data +# arg3 (rdx): pointer to lane 1 input data +# arg4 (rcx): pointer to lane 2 input data +# arg5 (r8): pointer to lane 3 input data +# arg6 (r9): input length in bytes (must be same for all lanes) +# Returns: void +# Note: Input is XORed into state and Keccak permutation is applied for each rate-sized block +.globl SHA3_shake256_x4_inc_absorb_avx512vl +.type SHA3_shake256_x4_inc_absorb_avx512vl,\@function,6 +.align 32 +SHA3_shake256_x4_inc_absorb_avx512vl: +.L_SHA3_shake256_x4_inc_absorb_avx512vl: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push 
%r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +___ +$code .= <<___ if ($win64); + sub \$160, %rsp + vmovups %xmm6, 0(%rsp) + vmovups %xmm7, 16(%rsp) + vmovups %xmm8, 32(%rsp) + vmovups %xmm9, 48(%rsp) + vmovups %xmm10, 64(%rsp) + vmovups %xmm11, 80(%rsp) + vmovups %xmm12, 96(%rsp) + vmovups %xmm13, 112(%rsp) + vmovups %xmm14, 128(%rsp) + vmovups %xmm15, 144(%rsp) +___ +$code.=<<___; + +.Lshake256_absorb_body: + # check for partially processed block + mov 8*100($arg1), %r14 + or %r14, %r14 # s[100] == 0? + je .Lshake256_absorb_main_loop_start + + # process remaining bytes if message long enough + mov \$136, %r12 # SHAKE256_RATE = 136 + sub %r14, %r12 # %r12 = capacity + + cmp %r12, $arg6 # if mlen <= capacity then no permute + jbe .Lshake256_absorb_skip_permute + + sub %r12, $arg6 + mov $arg6, %r11 # preserve remaining length across helper calls + + # r10/state, arg2-arg5/inputs, r12/length + mov $arg1, %r10 # %r10 = state + call keccak_1600_partial_add_x4 # arg2-arg5 are updated + + call keccak_1600_load_state_x4 + + call keccak_1600_permute + + movq \$0, 8*100($arg1) # clear s[100] + jmp .Lshake256_absorb_partial_block_done + +.Lshake256_absorb_skip_permute: + # r10/state, arg2-arg5/inputs, r12/length + mov $arg1, %r10 + mov $arg6, %r12 + mov $arg6, %r11 # preserve input length across helper call + call keccak_1600_partial_add_x4 + + lea (%r11,%r14), %r15 + mov %r15, 8*100($arg1) # s[100] += inlen + + cmp \$136, %r15 # check s[100] below SHAKE256_RATE + jb .Lshake256_absorb_exit + + call keccak_1600_load_state_x4 + + call keccak_1600_permute + + call keccak_1600_save_state_x4 + + movq \$0, 8*100($arg1) # clear s[100] + jmp .Lshake256_absorb_exit + +.Lshake256_absorb_main_loop_start: + call keccak_1600_load_state_x4 + mov $arg6, %r11 # full input length when no prior partial block + +.Lshake256_absorb_partial_block_done: + xor %r12, %r12 # zero message offset + + # Process the input message in blocks +.align 32 
+.Lshake256_absorb_while_loop:
+    cmp \$136, %r11                     # compare mlen to SHAKE256_RATE
+    jb .Lshake256_absorb_while_loop_done
+
+    # Inline absorb_bytes_x4 for SHAKE256_RATE (136 bytes = 17 ymm registers)
+___
+
+# Generate absorb code for SHAKE256 rate (136 bytes)
+for (my $i = 0; $i < 17; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+    vmovq $offset($arg2,%r12), %xmm31
+    vpinsrq \$1, $offset($arg3,%r12), %xmm31, %xmm31
+    vmovq $offset($arg4,%r12), %xmm30
+    vpinsrq \$1, $offset($arg5,%r12), %xmm30, %xmm30
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+    vpxorq %ymm31, %ymm$i, %ymm$i
+___
+}
+
+$code.=<<___;
+    sub \$136, %r11                     # Subtract the rate from the remaining length
+    add \$136, %r12                     # Adjust offset to next block
+    call keccak_1600_permute            # Perform the Keccak permutation
+
+    jmp .Lshake256_absorb_while_loop
+
+.align 32
+.Lshake256_absorb_while_loop_done:
+    call keccak_1600_save_state_x4
+
+    mov %r11, 8*100($arg1)              # update s[100]
+    or %r11, %r11
+    jz .Lshake256_absorb_exit
+
+    movq \$0, 8*100($arg1)              # clear s[100]
+
+    # r10/state, arg2-arg5/input, r12/length
+    mov $arg1, %r10
+    add %r12, $arg2
+    add %r12, $arg3
+    add %r12, $arg4
+    add %r12, $arg5
+    mov %r11, %r12
+    call keccak_1600_partial_add_x4
+
+    mov %r11, 8*100($arg1)              # update s[100]
+
+.Lshake256_absorb_exit:
+    # Clear sensitive registers
+    vpxorq %xmm16, %xmm16, %xmm16
+    vmovdqa64 %ymm16, %ymm17
+    vmovdqa64 %ymm16, %ymm18
+    vmovdqa64 %ymm16, %ymm19
+    vmovdqa64 %ymm16, %ymm20
+    vmovdqa64 %ymm16, %ymm21
+    vmovdqa64 %ymm16, %ymm22
+    vmovdqa64 %ymm16, %ymm23
+    vmovdqa64 %ymm16, %ymm24
+    vmovdqa64 %ymm16, %ymm25
+    vmovdqa64 %ymm16, %ymm26
+    vmovdqa64 %ymm16, %ymm27
+    vmovdqa64 %ymm16, %ymm28
+    vmovdqa64 %ymm16, %ymm29
+    vmovdqa64 %ymm16, %ymm30
+    vmovdqa64 %ymm16, %ymm31
+.Lshake256_absorb_epilogue:
+    # Wipe ymm0-ymm15 now, before anything is restored. vzeroall also zeroes
+    # xmm6-xmm15, which are callee-saved on Win64, so it has to run before
+    # they are reloaded from the save area below (the squeeze routines use
+    # the same order). Running it after the pops would return zeroed
+    # xmm6-xmm15 to the caller. %rsp is not changed by this instruction.
+    vzeroall
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp), %xmm6
+    vmovups 16(%rsp), %xmm7
+    vmovups 32(%rsp), %xmm8
+    vmovups 48(%rsp), %xmm9
+    vmovups 64(%rsp), %xmm10
+    vmovups 80(%rsp), %xmm11
+    vmovups 96(%rsp), %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop %r15
+    pop %r14
+.cfi_pop %r14
+    pop %r13
+.cfi_pop %r13
+    pop %r12
+.cfi_pop %r12
+    pop %rbx
+.cfi_pop %rbx
+    pop %rbp
+.cfi_pop %rbp
+    ret
+.cfi_endproc
+.size SHA3_shake256_x4_inc_absorb_avx512vl,.-SHA3_shake256_x4_inc_absorb_avx512vl
+
+
+# SHA3_shake256_x4_inc_finalize_avx512vl
+# Finalize absorption phase for 4 parallel SHAKE-256 states
+# Adds padding and terminator bytes and clears the absorb offset
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+# Returns: void
+# Note: After this call, state is ready for squeezing output
+.globl SHA3_shake256_x4_inc_finalize_avx512vl
+.type SHA3_shake256_x4_inc_finalize_avx512vl,\@function,1
+.align 32
+SHA3_shake256_x4_inc_finalize_avx512vl:
+.L_SHA3_shake256_x4_inc_finalize_avx512vl:
+.cfi_startproc
+    mov 8*100($arg1), %r11              # load state offset from s[100]
+    mov %r11, %r10
+    and \$~7, %r10d                     # offset to the state register
+    and \$7, %r11d                      # offset within the register
+
+    # add EOM byte right after the message
+    vmovdqu32 ($arg1,%r10,4), %ymm31
+    lea shake_msg_pad_x4(%rip), %r9
+    sub %r11, %r9
+    vmovdqu32 (%r9), %ymm30
+    vpxorq %ymm30, %ymm31, %ymm31
+    vmovdqu32 %ymm31, ($arg1,%r10,4)
+
+    # add terminating byte at offset equal to rate - 1 (SHAKE256_RATE = 136)
+    vmovdqu32 512($arg1), %ymm31        # 136*4 - 32 = 544 - 32 = 512
+    vmovdqa32 shake_terminator_byte_x4(%rip), %ymm30
+    vpxorq %ymm30, %ymm31, %ymm31
+    vmovdqu32 %ymm31, 512($arg1)
+
+    movq \$0, 8*100($arg1)              # clear s[100]
+    vpxorq %ymm31, %ymm31, %ymm31
+    ret
+.cfi_endproc
+.size SHA3_shake256_x4_inc_finalize_avx512vl,.-SHA3_shake256_x4_inc_finalize_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for squeeze entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion +# is already done in the caller function. +.type SHA3_shake256_x4_inc_squeeze_avx512vl_internal,\@abi-omnipotent +.align 32 +.LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl_internal: +SHA3_shake256_x4_inc_squeeze_avx512vl_internal: + mov %rsp, %rax + mov $arg1, 8(%rsp) + mov $arg2, 16(%rsp) + jmp .L_SHA3_shake256_x4_inc_squeeze_avx512vl +.LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl_internal: +.size SHA3_shake256_x4_inc_squeeze_avx512vl_internal,.-SHA3_shake256_x4_inc_squeeze_avx512vl_internal +___ +$code.=<<___; + +# SHA3_shake256_x4_inc_squeeze_avx512vl +# Squeeze output from 4 parallel SHAKE256 states +# Arguments: +# arg1 (rdi): pointer to lane 0 output buffer +# arg2 (rsi): pointer to lane 1 output buffer +# arg3 (rdx): pointer to lane 2 output buffer +# arg4 (rcx): pointer to lane 3 output buffer +# arg5 (r8): output length in bytes (must be same for all lanes) +# arg6 (r9): pointer to state context (808 bytes) +# Returns: void +# Note: Can be called multiple times to generate arbitrary-length output +.globl SHA3_shake256_x4_inc_squeeze_avx512vl +.type SHA3_shake256_x4_inc_squeeze_avx512vl,\@function,6 +.align 32 +SHA3_shake256_x4_inc_squeeze_avx512vl: +.L_SHA3_shake256_x4_inc_squeeze_avx512vl: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +___ +$code .= <<___ if ($win64); + sub \$160, %rsp + vmovups %xmm6, 0(%rsp) + vmovups %xmm7, 16(%rsp) + vmovups %xmm8, 32(%rsp) + vmovups %xmm9, 48(%rsp) + vmovups %xmm10, 64(%rsp) + vmovups %xmm11, 80(%rsp) + vmovups %xmm12, 96(%rsp) + vmovups %xmm13, 112(%rsp) + vmovups %xmm14, 128(%rsp) + vmovups %xmm15, 144(%rsp) +___ +$code.=<<___; + +.Lshake256_squeeze_body: + or $arg5, $arg5 + jz .Lshake256_squeeze_done + + # check for partially processed block + mov 8*100($arg6), %r15 # s[100] - capacity + or %r15, %r15 + 
jnz .Lshake256_squeeze_no_init_permute + + mov $arg1, %r14 + mov $arg6, $arg1 + call keccak_1600_load_state_x4 + + mov %r14, $arg1 + + xor %rbp, %rbp + jmp .Lshake256_squeeze_loop + +.align 32 +.Lshake256_squeeze_no_init_permute: + # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length = min(capacity, outlen), r11 - offset + mov $arg6, %r10 + mov $arg6, %r14 # preserve state pointer across extract helper + + mov %r15, %r12 + cmp %r15, $arg5 + cmovnae $arg5, %r12 # %r12 = min(capacity, outlen) + + sub %r12, $arg5 # outlen -= length + + mov \$136, %r11d # SHAKE256_RATE + sub %r15, %r11 # state offset + + sub %r12, %r15 # capacity -= length + mov %r15, 8*100($arg6) # update s[100] + + call keccak_1600_extract_bytes_x4 + mov %r14, $arg6 # restore state pointer after helper clobbers + + or %r15, %r15 + jnz .Lshake256_squeeze_done # check s[100] not zero + + mov $arg1, %r13 # preserve arg1 + mov %r14, $arg1 + call keccak_1600_load_state_x4 + + mov %r13, $arg1 + xor %rbp, %rbp + +.align 32 +.Lshake256_squeeze_loop: + cmp \$136, $arg5 # outlen > SHAKE256_RATE + jb .Lshake256_squeeze_final_extract + + call keccak_1600_permute + + # Extract SHAKE256 rate bytes (136 bytes = 17 x 8 bytes) inline +___ + +# Generate extract code for SHAKE256 rate (136 bytes = 17 ymm registers) +for (my $i = 0; $i < 17; $i++) { + my $offset = $i * 8; + $code.=<<___; + vextracti64x2 \$1, %ymm$i, %xmm31 + vmovq %xmm$i, $offset($arg1,%rbp) + vpextrq \$1, %xmm$i, $offset($arg2,%rbp) + vmovq %xmm31, $offset($arg3,%rbp) + vpextrq \$1, %xmm31, $offset($arg4,%rbp) +___ +} + +$code.=<<___; + add \$136, %rbp # dst offset += SHAKE256_RATE + sub \$136, $arg5 # outlen -= SHAKE256_RATE + jmp .Lshake256_squeeze_loop + +.align 32 +.Lshake256_squeeze_final_extract: + or $arg5, $arg5 + jz .Lshake256_squeeze_no_end_permute + + # update output pointers + add %rbp, $arg1 + add %rbp, $arg2 + add %rbp, $arg3 + add %rbp, $arg4 + + mov \$136, %r15d # SHAKE256_RATE + sub $arg5, %r15 + mov %r15, 
8*100($arg6) # s[100] = capacity + + call keccak_1600_permute + + mov $arg1, %r14 + mov $arg6, $arg1 + call keccak_1600_save_state_x4 + + mov %r14, $arg1 + + # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length, r11 - offset = 0 + mov $arg6, %r10 + mov $arg5, %r12 + xor %r11, %r11 + call keccak_1600_extract_bytes_x4 + + jmp .Lshake256_squeeze_done + +.Lshake256_squeeze_no_end_permute: + movq \$0, 8*100($arg6) # s[100] = 0 + mov $arg6, $arg1 + call keccak_1600_save_state_x4 + +.Lshake256_squeeze_done: + # Clear sensitive registers + vpxorq %xmm16, %xmm16, %xmm16 + vmovdqa64 %ymm16, %ymm17 + vmovdqa64 %ymm16, %ymm18 + vmovdqa64 %ymm16, %ymm19 + vmovdqa64 %ymm16, %ymm20 + vmovdqa64 %ymm16, %ymm21 + vmovdqa64 %ymm16, %ymm22 + vmovdqa64 %ymm16, %ymm23 + vmovdqa64 %ymm16, %ymm24 + vmovdqa64 %ymm16, %ymm25 + vmovdqa64 %ymm16, %ymm26 + vmovdqa64 %ymm16, %ymm27 + vmovdqa64 %ymm16, %ymm28 + vmovdqa64 %ymm16, %ymm29 + vmovdqa64 %ymm16, %ymm30 + vmovdqa64 %ymm16, %ymm31 +.Lshake256_squeeze_epilogue: + vzeroall +___ +$code .= <<___ if ($win64); + vmovups 0(%rsp), %xmm6 + vmovups 16(%rsp), %xmm7 + vmovups 32(%rsp), %xmm8 + vmovups 48(%rsp), %xmm9 + vmovups 64(%rsp), %xmm10 + vmovups 80(%rsp), %xmm11 + vmovups 96(%rsp), %xmm12 + vmovups 112(%rsp), %xmm13 + vmovups 128(%rsp), %xmm14 + vmovups 144(%rsp), %xmm15 + add \$160, %rsp +___ +$code.=<<___; + + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp + ret +.cfi_endproc +.size SHA3_shake256_x4_inc_squeeze_avx512vl,.-SHA3_shake256_x4_inc_squeeze_avx512vl +___ + +if ($win64) { +my $context = "%r8"; +my $disp = "%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type keccak_se_handler,\@abi-omnipotent +.align 16 +keccak_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64, %rsp + + mov 120($context), %rax # context->Rax = original 
%rsp from xlate prologue + mov 248($context), %rbx # context->Rip + + mov 8($disp), %rsi # disp->ImageBase + mov 56($disp), %r11 # disp->HandlerData + + mov 0(%r11), %r10d # HandlerData[0]: body label (rva) + lea (%rsi,%r10), %r10 + cmp %r10, %rbx # Rip < body? + jb .Lkeccak_in_prologue + + mov 4(%r11), %r10d # HandlerData[1]: epilogue label (rva) + lea (%rsi,%r10), %r10 + cmp %r10, %rbx # Rip >= epilogue? + jae .Lkeccak_in_epilogue + + # In function body: + # HandlerData[2]: delta from context->Rsp(body) to original %rsp + # HandlerData[3]: offset of XMM6 save area from context->Rsp(body), -1 if none + # HandlerData[4]: number of saved non-volatiles in stack frame layout (2 or 6) + # HandlerData[5]: delta from context->Rsp(epilogue) to original %rsp + mov 152($context), %rdx # body rsp + mov 8(%r11), %r10d + lea (%rdx,%r10), %rax # original rsp + jmp .Lkeccak_restore_body_or_epilogue + +.Lkeccak_in_epilogue: + mov 152($context), %rdx # epilogue rsp + mov 20(%r11), %r10d + lea (%rdx,%r10), %rax # original rsp + +.Lkeccak_restore_body_or_epilogue: + mov 8(%rax), %rcx # xlate shadow save of original rdi + mov 16(%rax), %rsi # xlate shadow save of original rsi + mov %rax, 152($context) # context->Rsp = original rsp + mov %rsi, 168($context) # context->Rsi + mov %rcx, 176($context) # context->Rdi + + mov 16(%r11), %r10d # gpr save count + cmp \$6, %r10d + jne .Lkeccak_restore_two + + mov -24(%rax), %r12 + mov -32(%rax), %r13 + mov -40(%rax), %r14 + mov -48(%rax), %r15 + mov %r12, 216($context) # context->R12 + mov %r13, 224($context) # context->R13 + mov %r14, 232($context) # context->R14 + mov %r15, 240($context) # context->R15 + +.Lkeccak_restore_two: + mov -8(%rax), %rbp + mov -16(%rax), %rbx + mov %rbp, 160($context) # context->Rbp + mov %rbx, 144($context) # context->Rbx + + mov 12(%r11), %r10d # xmm save offset from body rsp + cmp \$-1, %r10d + je .Lkeccak_in_prologue + + lea (%rdx,%r10), %rsi # source = xmm save area + lea 512($context), %rdi # &context->Xmm6 + 
mov \$20, %ecx # 10 XMM * 2 qwords + .long 0xa548f3fc # cld; rep movsq + +.Lkeccak_in_prologue: + mov 8(%rax), %rcx + mov 16(%rax), %rdx + mov %rcx, 176($context) # context->Rdi + mov %rdx, 168($context) # context->Rsi + mov %rax, 152($context) # context->Rsp = original rsp + + mov 40($disp), %rdi # disp->ContextRecord + mov $context, %rsi + mov \$154, %ecx # sizeof(CONTEXT)/8 + .long 0xa548f3fc # cld; rep movsq + + mov $disp, %rsi + xor %rcx, %rcx # UNW_FLAG_NHANDLER + mov 8(%rsi), %rdx # disp->ImageBase + mov 0(%rsi), %r8 # disp->ControlPc + mov 16(%rsi), %r9 # disp->FunctionEntry + mov 40(%rsi), %r10 # disp->ContextRecord + lea 56(%rsi), %r11 # &disp->HandlerData + lea 24(%rsi), %r12 # &disp->EstablisherFrame + mov %r10, 32(%rsp) + mov %r11, 40(%rsp) + mov %r12, 48(%rsp) + mov %rcx, 56(%rsp) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1, %eax # ExceptionContinueSearch + add \$64, %rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size keccak_se_handler,.-keccak_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_SHA3_shake128_x4_avx512vl + .rva .LSEH_end_SHA3_shake128_x4_avx512vl + .rva .LSEH_info_SHA3_shake128_x4_avx512vl + .rva .LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl_internal + .rva .LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl_internal + .rva .LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl_internal + .rva .LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl + .rva .LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl + .rva .LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl + .rva .LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl_internal + .rva .LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl_internal + .rva .LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl_internal + .rva .LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl + .rva .LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl + .rva .LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl + .rva .LSEH_begin_SHA3_shake256_x4_avx512vl + .rva 
.LSEH_end_SHA3_shake256_x4_avx512vl + .rva .LSEH_info_SHA3_shake256_x4_avx512vl + .rva .LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl_internal + .rva .LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl_internal + .rva .LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl_internal + .rva .LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl + .rva .LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl + .rva .LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl + .rva .LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl_internal + .rva .LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl_internal + .rva .LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl_internal + .rva .LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl + .rva .LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl + .rva .LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl + +.section .xdata +.align 8 +.LSEH_info_SHA3_shake128_x4_avx512vl: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake128_x4_body,.Lshake128_x4_epilogue + .long 1032,856,2,1032 +.LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake128_absorb_body,.Lshake128_absorb_epilogue + .long 208,0,6,208 +.LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl_internal: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake128_absorb_body,.Lshake128_absorb_epilogue + .long 208,0,6,208 +.LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake128_squeeze_body,.Lshake128_squeeze_epilogue + .long 208,0,6,208 +.LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl_internal: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake128_squeeze_body,.Lshake128_squeeze_epilogue + .long 208,0,6,208 +.LSEH_info_SHA3_shake256_x4_avx512vl: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake256_x4_body,.Lshake256_x4_epilogue + .long 1032,856,2,1032 +.LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake256_absorb_body,.Lshake256_absorb_epilogue + .long 208,0,6,208 
+.LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl_internal: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake256_absorb_body,.Lshake256_absorb_epilogue + .long 208,0,6,208 +.LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake256_squeeze_body,.Lshake256_squeeze_epilogue + .long 208,0,6,208 +.LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl_internal: + .byte 9,0,0,0 + .rva keccak_se_handler + .rva .Lshake256_squeeze_body,.Lshake256_squeeze_epilogue + .long 208,0,6,208 +___ +} + +$code.=<<___; + +.section .rodata align=128 +.align 128 +.type iotas,\@object +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.size iotas,.-iotas + +.align 8 +byte_kmask_0_to_7: + .byte 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f + +.align 32 +shake_terminator_byte_x4: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + +.align 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0 +shake_msg_pad_x4: + .byte 0x1F, 0, 0, 0, 0, 0, 0, 0 + .byte 0x1F, 0, 0, 0, 0, 0, 0, 0 + .byte 0x1F, 0, 0, 0, 0, 0, 0, 0 + .byte 0x1F, 0, 0, 0, 0, 0, 0, 0 + +.asciz "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by " +___ + +}}} else {{{ + +# When AVX512VL is not available, output stub functions +# The capable function returns 0, and the operation functions are not defined (will use C 
fallback) + +$code .= <<___; +.text + +.globl SHA3_avx512vl_capable +.type SHA3_avx512vl_capable,\@abi-omnipotent +SHA3_avx512vl_capable: + xor %eax, %eax + ret +.size SHA3_avx512vl_capable, .-SHA3_avx512vl_capable + +.globl SHA3_shake128_x4_inc_absorb_avx512vl +.globl SHA3_shake256_x4_inc_absorb_avx512vl +.globl SHA3_shake128_x4_inc_finalize_avx512vl +.globl SHA3_shake256_x4_inc_finalize_avx512vl +.globl SHA3_shake128_x4_inc_squeeze_avx512vl +.globl SHA3_shake256_x4_inc_squeeze_avx512vl +.globl SHA3_shake128_x4_avx512vl +.globl SHA3_shake256_x4_avx512vl +.type SHA3_shake128_x4_inc_absorb_avx512vl,\@abi-omnipotent +SHA3_shake128_x4_inc_absorb_avx512vl: +SHA3_shake256_x4_inc_absorb_avx512vl: +SHA3_shake128_x4_inc_finalize_avx512vl: +SHA3_shake256_x4_inc_finalize_avx512vl: +SHA3_shake128_x4_inc_squeeze_avx512vl: +SHA3_shake256_x4_inc_squeeze_avx512vl: +SHA3_shake128_x4_avx512vl: +SHA3_shake256_x4_avx512vl: + .byte 0x0f,0x0b # ud2 + ret +.size SHA3_shake128_x4_inc_absorb_avx512vl, .-SHA3_shake128_x4_inc_absorb_avx512vl +___ +}}} + +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sha/build.info b/crypto/sha/build.info index 457ac8d06ab7b..fd192a66dda4c 100644 --- a/crypto/sha/build.info +++ b/crypto/sha/build.info @@ -65,7 +65,7 @@ ENDIF $KECCAK1600ASM=keccak1600.c IF[{- !$disabled{asm} -}] $KECCAK1600ASM_x86= - $KECCAK1600ASM_x86_64=keccak1600-x86_64.s + $KECCAK1600ASM_x86_64=keccak1600-x86_64.s keccak1600x4-avx512vl.s $KECCAK1600ASM_s390x=keccak1600-s390x.S @@ -83,8 +83,8 @@ IF[{- !$disabled{asm} -}] ENDIF $COMMON=sha1dgst.c sha256.c sha512.c sha3.c sha3_encode.c $SHA1ASM $KECCAK1600ASM -SOURCE[../../libcrypto]=$COMMON sha1_one.c -SOURCE[../../providers/libfips.a]= $COMMON +SOURCE[../../libcrypto]=$COMMON sha1_one.c sha3_x4.c +SOURCE[../../providers/libfips.a]= $COMMON sha3_x4.c # Implementations are now spread across several libraries, so the defines # need to be applied to all affected libraries and modules. 
@@ -198,4 +198,8 @@ GENERATE[keccak1600-avx512vl.S]=asm/keccak1600-avx512vl.pl GENERATE[keccak1600-mmx.S]=asm/keccak1600-mmx.pl GENERATE[keccak1600p8-ppc.S]=asm/keccak1600p8-ppc.pl +# keccak1600x4-avx512vl.s supports multi-squeeze +# Currently only used in ML-DSA on x86_64 with AVX-512VL support +GENERATE[keccak1600x4-avx512vl.s]=asm/keccak1600x4-avx512vl.pl + GENERATE[sha1-thumb.S]=asm/sha1-thumb.pl diff --git a/crypto/sha/sha3_x4.c b/crypto/sha/sha3_x4.c new file mode 100644 index 0000000000000..1d993c326c0a5 --- /dev/null +++ b/crypto/sha/sha3_x4.c @@ -0,0 +1,202 @@ +/* + * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved. + * Copyright (c) 2026 Intel Corporation. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * SHAKE x4 multi-buffer implementation for AVX-512VL + * + * This file provides incremental API wrappers around the AVX-512VL + * assembly implementations for processing 4 SHAKE instances in parallel. + * + * Callers should check SHA3_avx512vl_capable() before calling. 
+ */ + +#include "internal/sha3.h" +#include <string.h> + +#if defined(KECCAK1600_ASM) \ + && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \ + && !defined(OPENSSL_NO_ASM) + +/* External assembly function declarations */ +extern void SHA3_shake128_x4_inc_absorb_avx512vl( + uint64_t *state, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +extern void SHA3_shake256_x4_inc_absorb_avx512vl( + uint64_t *state, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +extern void SHA3_shake128_x4_inc_finalize_avx512vl(uint64_t *state); +extern void SHA3_shake256_x4_inc_finalize_avx512vl(uint64_t *state); + +extern void SHA3_shake128_x4_inc_squeeze_avx512vl( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + uint64_t *state); + +extern void SHA3_shake256_x4_inc_squeeze_avx512vl( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + uint64_t *state); + +/* One-shot assembly function declarations */ +extern void SHA3_shake128_x4_avx512vl( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +extern void SHA3_shake256_x4_avx512vl( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +/* + * SHAKE-128 x4 Implementation + */ + +void ossl_sha3_shake128_x4_inc_init(KECCAK1600_X4_CTX *ctx) +{ + memset(ctx->A, 0, sizeof(ctx->A)); + ctx->rate = 168; /* SHAKE-128 rate in bytes */ + ctx->finalized = 0; +} + +void ossl_sha3_shake128_x4_inc_absorb( + KECCAK1600_X4_CTX *ctx, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen) +{ + if (ctx->finalized) { + /* Error: cannot absorb after finalize */ + return; + } + + SHA3_shake128_x4_inc_absorb_avx512vl( + ctx->A, in0, in1, in2, in3, inlen); +} + +void
ossl_sha3_shake128_x4_inc_finalize(KECCAK1600_X4_CTX *ctx) +{ + if (ctx->finalized) { + return; /* Already finalized */ + } + + SHA3_shake128_x4_inc_finalize_avx512vl(ctx->A); + ctx->finalized = 1; +} + +void ossl_sha3_shake128_x4_inc_squeeze( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + KECCAK1600_X4_CTX *ctx) +{ + if (!ctx->finalized) { + /* Auto-finalize on first squeeze */ + ossl_sha3_shake128_x4_inc_finalize(ctx); + } + + SHA3_shake128_x4_inc_squeeze_avx512vl( + out0, out1, out2, out3, outlen, ctx->A); +} + +/* + * SHAKE-256 x4 Implementation + */ + +void ossl_sha3_shake256_x4_inc_init(KECCAK1600_X4_CTX *ctx) +{ + memset(ctx->A, 0, sizeof(ctx->A)); + ctx->rate = 136; /* SHAKE-256 rate in bytes */ + ctx->finalized = 0; +} + +void ossl_sha3_shake256_x4_inc_absorb( + KECCAK1600_X4_CTX *ctx, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen) +{ + if (ctx->finalized) { + /* Error: cannot absorb after finalize */ + return; + } + + SHA3_shake256_x4_inc_absorb_avx512vl( + ctx->A, in0, in1, in2, in3, inlen); +} + +void ossl_sha3_shake256_x4_inc_finalize(KECCAK1600_X4_CTX *ctx) +{ + if (ctx->finalized) { + return; /* Already finalized */ + } + + SHA3_shake256_x4_inc_finalize_avx512vl(ctx->A); + ctx->finalized = 1; +} + +void ossl_sha3_shake256_x4_inc_squeeze( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + KECCAK1600_X4_CTX *ctx) +{ + if (!ctx->finalized) { + /* Auto-finalize on first squeeze */ + ossl_sha3_shake256_x4_inc_finalize(ctx); + } + + SHA3_shake256_x4_inc_squeeze_avx512vl( + out0, out1, out2, out3, outlen, ctx->A); +} + +/* + * Single-call wrapper APIs + */ + +void ossl_sha3_shake128_x4( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen) +{ + SHA3_shake128_x4_avx512vl(out0, out1, out2, out3, outlen, + in0, in1, in2, in3, inlen); +} + +void ossl_sha3_shake256_x4( + void 
*out0, void *out1, + void *out2, void *out3, + size_t outlen, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen) +{ + SHA3_shake256_x4_avx512vl(out0, out1, out2, out3, outlen, + in0, in1, in2, in3, inlen); +} + +#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */ diff --git a/include/internal/sha3.h b/include/internal/sha3.h index f91d00a74f838..82a7ec158b1a7 100644 --- a/include/internal/sha3.h +++ b/include/internal/sha3.h @@ -65,4 +65,75 @@ int ossl_shake_squeeze_default(KECCAK1600_CTX *ctx, unsigned char *out, size_t o size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len, size_t r); +/* Multi-buffer (x4) Keccak-f[1600] context and API */ +#if defined(KECCAK1600_ASM) \ + && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \ + && !defined(OPENSSL_NO_ASM) + +/* Runtime capability check for AVX512VL */ +int SHA3_avx512vl_capable(void); + +/* Context for 4-way parallel SHAKE operations */ +typedef struct { + /* 4 interleaved Keccak states (800 bytes) + plus 8 bytes to store the number of + already absorbed or not yet squeezed bytes */ + uint64_t A[(25 * 4) + 1]; + size_t rate; /* Rate in bytes: 168 (SHAKE-128) or 136 (SHAKE-256) */ + unsigned finalized; /* Has finalize been called? 
0=no, 1=yes */ +} KECCAK1600_X4_CTX; + +/* SHAKE-128 x4 incremental API */ +void ossl_sha3_shake128_x4_inc_init(KECCAK1600_X4_CTX *ctx); + +void ossl_sha3_shake128_x4_inc_absorb( + KECCAK1600_X4_CTX *ctx, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +void ossl_sha3_shake128_x4_inc_finalize(KECCAK1600_X4_CTX *ctx); + +void ossl_sha3_shake128_x4_inc_squeeze( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + KECCAK1600_X4_CTX *ctx); + +/* SHAKE-256 x4 incremental API */ +void ossl_sha3_shake256_x4_inc_init(KECCAK1600_X4_CTX *ctx); + +void ossl_sha3_shake256_x4_inc_absorb( + KECCAK1600_X4_CTX *ctx, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +void ossl_sha3_shake256_x4_inc_finalize(KECCAK1600_X4_CTX *ctx); + +void ossl_sha3_shake256_x4_inc_squeeze( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + KECCAK1600_X4_CTX *ctx); + +/* Single-call SHAKE x4 APIs (wrapper functions) */ +void ossl_sha3_shake128_x4( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +void ossl_sha3_shake256_x4( + void *out0, void *out1, + void *out2, void *out3, + size_t outlen, + const void *in0, const void *in1, + const void *in2, const void *in3, + size_t inlen); + +#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */ + #endif /* OSSL_INTERNAL_SHA3_H */ diff --git a/test/build.info b/test/build.info index f599b3aff8c61..d6a36ba9ba035 100644 --- a/test/build.info +++ b/test/build.info @@ -915,6 +915,8 @@ IF[{- !$disabled{tests} -}] PROGRAMS{noinst}=cmactest ENDIF + PROGRAMS{noinst}=sha3_x4_internal_test + SOURCE[poly1305_internal_test]=poly1305_internal_test.c INCLUDE[poly1305_internal_test]=.. 
../include ../apps/include DEPEND[poly1305_internal_test]=../libcrypto.a libtestutil.a @@ -923,6 +925,10 @@ IF[{- !$disabled{tests} -}] INCLUDE[chacha_internal_test]=.. ../include ../apps/include DEPEND[chacha_internal_test]=../libcrypto.a libtestutil.a + SOURCE[sha3_x4_internal_test]=sha3_x4_internal_test.c + INCLUDE[sha3_x4_internal_test]=.. ../include ../apps/include + DEPEND[sha3_x4_internal_test]=../libcrypto.a libtestutil.a + SOURCE[asn1_internal_test]=asn1_internal_test.c INCLUDE[asn1_internal_test]=.. ../include ../apps/include DEPEND[asn1_internal_test]=../libcrypto.a libtestutil.a diff --git a/test/recipes/03-test_sha3_x4_internal.t b/test/recipes/03-test_sha3_x4_internal.t new file mode 100644 index 0000000000000..9e5793aaf3cd3 --- /dev/null +++ b/test/recipes/03-test_sha3_x4_internal.t @@ -0,0 +1,16 @@ +#! /usr/bin/env perl +# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2026 Intel Corporation. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +use strict; +use OpenSSL::Test; +use OpenSSL::Test::Simple; + +setup("test_sha3_x4_internal"); + +simple_test("test_sha3_x4_internal", "sha3_x4_internal_test"); diff --git a/test/sha3_x4_internal_test.c b/test/sha3_x4_internal_test.c new file mode 100644 index 0000000000000..e387b6f51ae46 --- /dev/null +++ b/test/sha3_x4_internal_test.c @@ -0,0 +1,432 @@ +/* + * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved. + * Copyright (c) 2026 Intel Corporation. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * Internal cross-validation tests for the SHAKE x4 multi-buffer API. + * + * Each test computes SHAKE-128 or SHAKE-256 on four independent inputs + * using the x4 (AVX-512VL) path and compares every lane's output to the + * equivalent result produced by the scalar ossl_sha3_* API. + * + * Tests cover: + * - Single-call (ossl_sha3_shake{128,256}_x4) for many (inlen, outlen) pairs + * - Incremental init/absorb/squeeze for the same (inlen, outlen) pairs + * - Multi-absorb: input absorbed in two calls, split at each tested input size + * - Multi-squeeze: output produced in two successive squeeze calls + */ + +#include +#include "testutil.h" + +/* + * KECCAK1600_ASM is only added to the library compilation flags by the build + * system, not to test binaries. Since the x4 declarations in internal/sha3.h + * are guarded by that macro, we define it here before the include so that the + * KECCAK1600_X4_CTX type and function prototypes are visible. The symbols + * themselves live in libcrypto.a which is always compiled with the flag set. + * We additionally gate all x4 code on x86_64 (GCC/Clang: __x86_64__, + * MSVC: _M_AMD64/_M_X64) and !OPENSSL_NO_ASM so that the test still + * compiles on other platforms or in no-asm builds. + */ +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \ + && !defined(OPENSSL_NO_ASM) +#ifndef KECCAK1600_ASM +#define KECCAK1600_ASM +#endif +#endif +#include "internal/sha3.h" + +/* + * A single deterministic 1024-byte message. Each of the four lanes receives + * a different slice of this buffer, with lane base pointers spaced 64 bytes + * apart, so their inputs are distinct yet entirely self-contained.
+ */ +#define MSG_BUF_SIZE 1024 +#define LANE_STRIDE 64 /* byte offset between lane base pointers */ +#define NUM_LANES 4 + +static unsigned char msg[MSG_BUF_SIZE]; + +/* Maximum output length used in this file – must fit chunk1 + chunk2. */ +#define MAX_OUT 640 + +#if defined(KECCAK1600_ASM) \ + && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \ + && !defined(OPENSSL_NO_ASM) + +/* + * Input lengths exercising: empty, tiny, sub-block, block boundary ±1, + * multiple blocks and a longer message for SHAKE-128 (rate=168) and + * SHAKE-256 (rate=136). + */ +static const size_t input_sizes[] = { + 0, 1, 17, 100, 135, 136, 137, 168, 169, 200, 400 +}; +#define NUM_INPUT_SIZES (sizeof(input_sizes) / sizeof(input_sizes[0])) + +/* Output lengths chosen to straddle rate boundaries for both variants. */ +static const size_t output_sizes[] = { + 16, 32, 64, 136, 168, 256, 512 +}; +#define NUM_OUTPUT_SIZES (sizeof(output_sizes) / sizeof(output_sizes[0])) + +/* Helper functions */ + +/* + * Compute a scalar SHAKE-128 or SHAKE-256 digest. + * bitlen: 128 or 256. Returns 1 on success, 0 on failure. + */ +static int scalar_shake(const unsigned int bitlen, + const unsigned char *in, const size_t inlen, + unsigned char *out, const size_t outlen) +{ + KECCAK1600_CTX ctx; + + if (!ossl_sha3_init(&ctx, 0x1f, bitlen)) + return 0; + /* ossl_sha3_init does not populate the method vtable; do it here. */ + ctx.meth.absorb = ossl_sha3_absorb_default; + ctx.meth.final = ossl_sha3_final_default; + ctx.meth.squeeze = ossl_shake_squeeze_default; + return ossl_sha3_absorb(&ctx, in, inlen) + && ossl_sha3_squeeze(&ctx, out, outlen); +} + +/* + * Encode (inlen_idx, outlen_idx) into a single test index and back.
+ * test index n = inlen_idx * NUM_OUTPUT_SIZES + outlen_idx + */ +static void decode_idx(const int n, size_t *inlen, size_t *outlen) +{ + *inlen = input_sizes[n / (int)NUM_OUTPUT_SIZES]; + *outlen = output_sizes[n % (int)NUM_OUTPUT_SIZES]; +} + +/* One-shot tests */ + +static int test_shake_x4_oneshot(const unsigned int bitlen, const int n) +{ + size_t inlen, outlen; + const unsigned char *in[NUM_LANES]; + unsigned char x4_out[NUM_LANES][MAX_OUT]; + unsigned char ref_out[NUM_LANES][MAX_OUT]; + int i; + + decode_idx(n, &inlen, &outlen); + + for (i = 0; i < NUM_LANES; i++) + in[i] = msg + i * LANE_STRIDE; + + /* Ensure the lane inputs fit within the message buffer. */ + if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE)) + return 0; + if (!TEST_size_t_le(outlen, MAX_OUT)) + return 0; + + /* x4 single-call */ + if (bitlen == 128) + ossl_sha3_shake128_x4(x4_out[0], x4_out[1], x4_out[2], x4_out[3], + outlen, + in[0], in[1], in[2], in[3], inlen); + else + ossl_sha3_shake256_x4(x4_out[0], x4_out[1], x4_out[2], x4_out[3], + outlen, + in[0], in[1], in[2], in[3], inlen); + + /* scalar reference */ + for (i = 0; i < NUM_LANES; i++) + if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], outlen))) + return 0; + + /* compare */ + for (i = 0; i < NUM_LANES; i++) { + if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) { + TEST_info("SHAKE-%u x4 oneshot lane %d: inlen=%zu outlen=%zu", + bitlen, i, inlen, outlen); + return 0; + } + } + return 1; +} + +static int test_shake128_x4_oneshot(const int n) +{ + return test_shake_x4_oneshot(128, n); +} + +static int test_shake256_x4_oneshot(const int n) +{ + return test_shake_x4_oneshot(256, n); +} + +/* Incremental (init / absorb / finalize / squeeze) tests */ + +static int test_shake_x4_incremental(const unsigned int bitlen, const int n) +{ + size_t inlen, outlen; + const unsigned char *in[NUM_LANES]; + unsigned char x4_out[NUM_LANES][MAX_OUT]; + unsigned char ref_out[NUM_LANES][MAX_OUT]; + 
KECCAK1600_X4_CTX ctx; + int i; + + decode_idx(n, &inlen, &outlen); + + for (i = 0; i < NUM_LANES; i++) + in[i] = msg + i * LANE_STRIDE; + + if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE)) + return 0; + + /* x4 incremental */ + if (bitlen == 128) { + ossl_sha3_shake128_x4_inc_init(&ctx); + ossl_sha3_shake128_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3], + inlen); + ossl_sha3_shake128_x4_inc_finalize(&ctx); + ossl_sha3_shake128_x4_inc_squeeze(x4_out[0], x4_out[1], + x4_out[2], x4_out[3], outlen, &ctx); + } else { + ossl_sha3_shake256_x4_inc_init(&ctx); + ossl_sha3_shake256_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3], + inlen); + ossl_sha3_shake256_x4_inc_finalize(&ctx); + ossl_sha3_shake256_x4_inc_squeeze(x4_out[0], x4_out[1], + x4_out[2], x4_out[3], outlen, &ctx); + } + + /* scalar reference */ + for (i = 0; i < NUM_LANES; i++) + if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], outlen))) + return 0; + + for (i = 0; i < NUM_LANES; i++) { + if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) { + TEST_info("SHAKE-%u x4 incremental lane %d: inlen=%zu outlen=%zu", + bitlen, i, inlen, outlen); + return 0; + } + } + return 1; +} + +static int test_shake128_x4_incremental(const int n) +{ + return test_shake_x4_incremental(128, n); +} + +static int test_shake256_x4_incremental(const int n) +{ + return test_shake_x4_incremental(256, n); +} + +/* Multi-absorb tests */ + +/* + * Split the input at every tested input size, absorbing the two halves + * in separate calls. The split length is chosen as input_sizes[n] so that + * we exercise sub-block, at-block and multi-block split points. + * + * Full message length is fixed at the largest tested input size so that + * every split index is meaningful. 
+ */ +static int test_shake_x4_multi_absorb(const unsigned int bitlen, const int n) +{ + const size_t total = input_sizes[NUM_INPUT_SIZES - 1]; + const size_t split = input_sizes[n]; + const size_t outlen = 64; /* fixed output length for this sub-test */ + const unsigned char *in[NUM_LANES]; + unsigned char x4_out[NUM_LANES][MAX_OUT]; + unsigned char ref_out[NUM_LANES][MAX_OUT]; + KECCAK1600_X4_CTX ctx; + int i; + + if (split > total) + return 1; /* nothing to test */ + + for (i = 0; i < NUM_LANES; i++) + in[i] = msg + i * LANE_STRIDE; + + if (!TEST_size_t_le(total + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE)) + return 0; + + /* x4 split absorb */ + if (bitlen == 128) { + ossl_sha3_shake128_x4_inc_init(&ctx); + ossl_sha3_shake128_x4_inc_absorb(&ctx, + in[0], in[1], in[2], in[3], split); + ossl_sha3_shake128_x4_inc_absorb(&ctx, + in[0] + split, in[1] + split, in[2] + split, in[3] + split, + total - split); + ossl_sha3_shake128_x4_inc_squeeze(x4_out[0], x4_out[1], + x4_out[2], x4_out[3], outlen, &ctx); + } else { + ossl_sha3_shake256_x4_inc_init(&ctx); + ossl_sha3_shake256_x4_inc_absorb(&ctx, + in[0], in[1], in[2], in[3], split); + ossl_sha3_shake256_x4_inc_absorb(&ctx, + in[0] + split, in[1] + split, in[2] + split, in[3] + split, + total - split); + ossl_sha3_shake256_x4_inc_squeeze(x4_out[0], x4_out[1], + x4_out[2], x4_out[3], outlen, &ctx); + } + + /* scalar reference (single absorb of full message) */ + for (i = 0; i < NUM_LANES; i++) + if (!TEST_true(scalar_shake(bitlen, in[i], total, ref_out[i], outlen))) + return 0; + + for (i = 0; i < NUM_LANES; i++) { + if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) { + TEST_info("SHAKE-%u x4 multi-absorb lane %d: total=%zu split=%zu", + bitlen, i, total, split); + return 0; + } + } + return 1; +} + +static int test_shake128_x4_multi_absorb(const int n) +{ + return test_shake_x4_multi_absorb(128, n); +} + +static int test_shake256_x4_multi_absorb(const int n) +{ + return test_shake_x4_multi_absorb(256, n); +} + 
+/* Multi-squeeze tests */ + +/* + * Squeeze in two successive calls and verify that the concatenated output + * matches a single scalar squeeze of the same total length. + * Parameterized over output_sizes[] for the first chunk; the second chunk + * is always 64 bytes so the total length varies. + */ +static int test_shake_x4_multi_squeeze(const unsigned int bitlen, const int n) +{ + const size_t inlen = 200; /* fixed input length */ + const size_t chunk1 = output_sizes[n]; + const size_t chunk2 = 64; + const size_t total = chunk1 + chunk2; + const unsigned char *in[NUM_LANES]; + unsigned char x4_a[NUM_LANES][MAX_OUT]; /* first chunk */ + unsigned char x4_b[NUM_LANES][MAX_OUT]; /* second chunk */ + unsigned char ref_out[NUM_LANES][MAX_OUT]; + KECCAK1600_X4_CTX ctx; + int i; + + if (!TEST_size_t_le(total, MAX_OUT)) + return 0; + if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE)) + return 0; + + for (i = 0; i < NUM_LANES; i++) + in[i] = msg + i * LANE_STRIDE; + + /* x4 two-shot squeeze */ + if (bitlen == 128) { + ossl_sha3_shake128_x4_inc_init(&ctx); + ossl_sha3_shake128_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3], + inlen); + /* first squeeze */ + ossl_sha3_shake128_x4_inc_squeeze(x4_a[0], x4_a[1], x4_a[2], x4_a[3], + chunk1, &ctx); + /* second squeeze – context carries state from previous call */ + ossl_sha3_shake128_x4_inc_squeeze(x4_b[0], x4_b[1], x4_b[2], x4_b[3], + chunk2, &ctx); + } else { + ossl_sha3_shake256_x4_inc_init(&ctx); + ossl_sha3_shake256_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3], + inlen); + ossl_sha3_shake256_x4_inc_squeeze(x4_a[0], x4_a[1], x4_a[2], x4_a[3], + chunk1, &ctx); + ossl_sha3_shake256_x4_inc_squeeze(x4_b[0], x4_b[1], x4_b[2], x4_b[3], + chunk2, &ctx); + } + + /* scalar reference – squeeze the full total in one call */ + for (i = 0; i < NUM_LANES; i++) + if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], total))) + return 0; + + /* check first chunk, then second chunk */ + for (i = 0; i < 
NUM_LANES; i++) { + if (!TEST_mem_eq(x4_a[i], chunk1, ref_out[i], chunk1)) { + TEST_info("SHAKE-%u x4 multi-squeeze lane %d chunk1: " + "inlen=%zu chunk1=%zu chunk2=%zu", + bitlen, i, inlen, chunk1, chunk2); + return 0; + } + if (!TEST_mem_eq(x4_b[i], chunk2, ref_out[i] + chunk1, chunk2)) { + TEST_info("SHAKE-%u x4 multi-squeeze lane %d chunk2: " + "inlen=%zu chunk1=%zu chunk2=%zu", + bitlen, i, inlen, chunk1, chunk2); + return 0; + } + } + return 1; +} + +static int test_shake128_x4_multi_squeeze(const int n) +{ + return test_shake_x4_multi_squeeze(128, n); +} + +static int test_shake256_x4_multi_squeeze(const int n) +{ + return test_shake_x4_multi_squeeze(256, n); +} + +#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */ + +/* Test entry point */ + +int setup_tests(void) +{ + size_t i; + + /* Fill the message buffer with a deterministic non-zero pattern. */ + for (i = 0; i < MSG_BUF_SIZE; i++) + msg[i] = (unsigned char)(251 * i + 17); + +#ifdef OPENSSL_CPUID_OBJ + OPENSSL_cpuid_setup(); +#endif + +#if !defined(KECCAK1600_ASM) \ + || !(defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \ + || defined(OPENSSL_NO_ASM) + return TEST_skip("SHAKE x4 API not available in this build"); +#else + if (!SHA3_avx512vl_capable()) { + return TEST_skip("AVX-512VL not available; skipping SHAKE x4 tests"); + } + + ADD_ALL_TESTS(test_shake128_x4_oneshot, + (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES)); + ADD_ALL_TESTS(test_shake256_x4_oneshot, + (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES)); + + ADD_ALL_TESTS(test_shake128_x4_incremental, + (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES)); + ADD_ALL_TESTS(test_shake256_x4_incremental, + (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES)); + + ADD_ALL_TESTS(test_shake128_x4_multi_absorb, (int)NUM_INPUT_SIZES); + ADD_ALL_TESTS(test_shake256_x4_multi_absorb, (int)NUM_INPUT_SIZES); + + ADD_ALL_TESTS(test_shake128_x4_multi_squeeze, (int)NUM_OUTPUT_SIZES); + ADD_ALL_TESTS(test_shake256_x4_multi_squeeze, 
(int)NUM_OUTPUT_SIZES); +#endif + + return 1; +}