From 79b61e8c00d61a0ebc5a12db8f28c4f0cd1f8f54 Mon Sep 17 00:00:00 2001
From: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Fri, 10 Apr 2026 09:36:25 +0000
Subject: [PATCH 1/5] ML-DSA: Add AVX512VL SHAKE x4 multi-buffer integration

Changes:
- Adds new SHAKE x4 API to perform 4 SHAKE operations in parallel when AVX512VL is supported.
- Adds AVX512VL Keccak x4 assembly module (keccak1600x4-avx512vl).
- Adds internal SHA3 x4 APIs/context in sha3.h and wrappers in sha3_x4.c modules.
- Adds runtime dispatch for ML-DSA sample operations with an OSSL_ML_DSA_SAMPLE_OPS vtable.
  Callers obtain the correct implementation via ossl_ml_dsa_sample_ops(), which returns
  either the generic scalar ops functions, or the AVX512VL multi-buffer ops depending
  on the build and CPU capabilities.
- Adds x86-64 multi-buffer function implementation into ml_dsa_sample_hw_x86_64.inc,
  included in ml_dsa_sample.c when KECCAK1600_ASM and x86_64 are defined.

Co-authored-by: Tomasz Kantecki <tomasz.kantecki@intel.com>
Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
---
 CHANGES.md                                |    4 +
 crypto/ml_dsa/ml_dsa_hash.h               |    5 +
 crypto/ml_dsa/ml_dsa_key.c                |   16 +-
 crypto/ml_dsa/ml_dsa_local.h              |   19 +-
 crypto/ml_dsa/ml_dsa_matrix.h             |    7 -
 crypto/ml_dsa/ml_dsa_sample.c             |   60 +-
 crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc |  310 +++
 crypto/ml_dsa/ml_dsa_sign.c               |   10 +-
 crypto/ml_dsa/ml_dsa_vector.h             |   27 -
 crypto/sha/asm/keccak1600x4-avx512vl.pl   | 2343 +++++++++++++++++++++
 crypto/sha/build.info                     |   10 +-
 crypto/sha/sha3_x4.c                      |  202 ++
 include/internal/sha3.h                   |   71 +
 13 files changed, 3031 insertions(+), 53 deletions(-)
 create mode 100644 crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
 create mode 100755 crypto/sha/asm/keccak1600x4-avx512vl.pl
 create mode 100644 crypto/sha/sha3_x4.c

diff --git a/CHANGES.md b/CHANGES.md
index 049c0e7288710..f7ee59641cf2c 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -31,6 +31,10 @@ OpenSSL Releases
 
 ### Changes between 4.0 and 4.1 [xx XXX xxxx]
 
+ * Added AVX512 optimized SHAKE x4 operations for ML-DSA on x86_64.
+
+   *Marcel Cornu and Tomasz Kantecki*
+
  * Added test framework for testing function memory allocation failures.
 
    *Jakub Zelenka*
diff --git a/crypto/ml_dsa/ml_dsa_hash.h b/crypto/ml_dsa/ml_dsa_hash.h
index 7625d3367d9c3..4280ef67c8897 100644
--- a/crypto/ml_dsa/ml_dsa_hash.h
+++ b/crypto/ml_dsa/ml_dsa_hash.h
@@ -7,6 +7,9 @@
  * https://www.openssl.org/source/license.html
  */
 
+#ifndef OSSL_CRYPTO_ML_DSA_HASH_H
+#define OSSL_CRYPTO_ML_DSA_HASH_H
+
 #include <openssl/evp.h>
 
 static ossl_inline ossl_unused int
@@ -39,3 +42,5 @@ shake_xof_3(EVP_MD_CTX *ctx, const EVP_MD *md, const uint8_t *in1, size_t in1_le
         && EVP_DigestUpdate(ctx, in3, in3_len)
         && EVP_DigestSqueeze(ctx, out, out_len);
 }
+
+#endif /* OSSL_CRYPTO_ML_DSA_HASH_H */
diff --git a/crypto/ml_dsa/ml_dsa_key.c b/crypto/ml_dsa/ml_dsa_key.c
index 24fa7596e2f77..74488365c31f2 100644
--- a/crypto/ml_dsa/ml_dsa_key.c
+++ b/crypto/ml_dsa/ml_dsa_key.c
@@ -332,7 +332,7 @@ int ossl_ml_dsa_key_has(const ML_DSA_KEY *key, int selection)
  * @returns 1 on success, or 0 on failure.
  */
 static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx,
-    VECTOR *t1, VECTOR *t0)
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops, VECTOR *t1, VECTOR *t0)
 {
     int ret = 0;
     const ML_DSA_PARAMS *params = key->params;
@@ -351,7 +351,7 @@ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx,
     matrix_init(&a_ntt, s1_ntt.poly + l, k, l);
 
     /* Using rho generate A' = A in NTT form */
-    if (!matrix_expand_A(md_ctx, key->shake128_md, key->rho, &a_ntt))
+    if (!sample_ops->matrix_expand_A(md_ctx, key->shake128_md, key->rho, &a_ntt))
         goto err;
 
     /* t = NTT_inv(A' * NTT(s1)) + s2 */
@@ -376,6 +376,7 @@ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx,
 int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     VECTOR t0;
     EVP_MD_CTX *md_ctx = NULL;
 
@@ -383,7 +384,7 @@ int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key)
         return 0;
     ret = ((md_ctx = EVP_MD_CTX_new()) != NULL)
         && ossl_ml_dsa_key_pub_alloc(key) /* allocate space for t1 */
-        && public_from_private(key, md_ctx, &key->t1, &t0)
+        && public_from_private(key, md_ctx, sample_ops, &key->t1, &t0)
         && vector_equal(&t0, &key->t0) /* compare the generated t0 to the expected */
         && ossl_ml_dsa_pk_encode(key)
         && shake_xof(md_ctx, key->shake256_md,
@@ -397,6 +398,7 @@ int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key)
 int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     VECTOR t1, t0;
     POLY *polys = NULL;
     uint32_t k = (uint32_t)key->params->k;
@@ -414,7 +416,7 @@ int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
 
     vector_init(&t1, polys, k);
     vector_init(&t0, polys + k, k);
-    if (!public_from_private(key, md_ctx, &t1, &t0))
+    if (!public_from_private(key, md_ctx, sample_ops, &t1, &t0))
         goto err;
 
     ret = vector_equal(&t1, &key->t1) && vector_equal(&t0, &key->t0);
@@ -435,6 +437,7 @@ int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
 static int keygen_internal(ML_DSA_KEY *out)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     uint8_t augmented_seed[ML_DSA_SEED_BYTES + 2];
     uint8_t expanded_seed[ML_DSA_RHO_BYTES + ML_DSA_PRIV_SEED_BYTES + ML_DSA_K_BYTES];
     const uint8_t *const rho = expanded_seed; /* p = Public Random Seed */
@@ -461,8 +464,9 @@ static int keygen_internal(ML_DSA_KEY *out)
     memcpy(out->rho, rho, sizeof(out->rho));
     memcpy(out->K, K, sizeof(out->K));
 
-    ret = vector_expand_S(md_ctx, out->shake256_md, params->eta, priv_seed, &out->s1, &out->s2)
-        && public_from_private(out, md_ctx, &out->t1, &out->t0)
+    ret = sample_ops->vector_expand_S(md_ctx, out->shake256_md, params->eta,
+              priv_seed, &out->s1, &out->s2)
+        && public_from_private(out, md_ctx, sample_ops, &out->t1, &out->t0)
         && ossl_ml_dsa_pk_encode(out)
         && shake_xof(md_ctx, out->shake256_md, out->pub_encoding, out->params->pk_len,
             out->tr, sizeof(out->tr))
diff --git a/crypto/ml_dsa/ml_dsa_local.h b/crypto/ml_dsa/ml_dsa_local.h
index bbaa6dafc75a9..34a83f8ffbe0e 100644
--- a/crypto/ml_dsa/ml_dsa_local.h
+++ b/crypto/ml_dsa/ml_dsa_local.h
@@ -59,10 +59,23 @@ typedef struct vector_st VECTOR;
 typedef struct matrix_st MATRIX;
 typedef struct ml_dsa_sig_st ML_DSA_SIG;
 
-int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+typedef int(ML_DSA_MATRIX_EXPAND_A_FN)(EVP_MD_CTX *g_ctx, const EVP_MD *md,
     const uint8_t *rho, MATRIX *out);
-int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
-    const uint8_t *seed, VECTOR *s1, VECTOR *s2);
+typedef int(ML_DSA_VECTOR_EXPAND_S_FN)(EVP_MD_CTX *h_ctx, const EVP_MD *md,
+    int eta, const uint8_t *seed, VECTOR *s1, VECTOR *s2);
+typedef void(ML_DSA_VECTOR_EXPAND_MASK_FN)(VECTOR *out, const uint8_t *rho_prime,
+    size_t rho_prime_len, uint32_t kappa, uint32_t gamma1,
+    EVP_MD_CTX *h_ctx, const EVP_MD *md);
+
+typedef struct ossl_ml_dsa_sample_ops_st {
+    ML_DSA_MATRIX_EXPAND_A_FN *matrix_expand_A;
+    ML_DSA_VECTOR_EXPAND_S_FN *vector_expand_S;
+    ML_DSA_VECTOR_EXPAND_MASK_FN *vector_expand_mask;
+} OSSL_ML_DSA_SAMPLE_OPS;
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void);
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void);
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void);
 void ossl_ml_dsa_matrix_mult_vector(const MATRIX *matrix_kl, const VECTOR *vl,
     VECTOR *vk);
 int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len,
diff --git a/crypto/ml_dsa/ml_dsa_matrix.h b/crypto/ml_dsa/ml_dsa_matrix.h
index 0352ecac7afc0..cd9005fc87177 100644
--- a/crypto/ml_dsa/ml_dsa_matrix.h
+++ b/crypto/ml_dsa/ml_dsa_matrix.h
@@ -35,10 +35,3 @@ matrix_mult_vector(const MATRIX *a, const VECTOR *s, VECTOR *t)
 {
     ossl_ml_dsa_matrix_mult_vector(a, s, t);
 }
-
-static ossl_inline ossl_unused int
-matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, const uint8_t *rho,
-    MATRIX *out)
-{
-    return ossl_ml_dsa_matrix_expand_A(g_ctx, md, rho, out);
-}
diff --git a/crypto/ml_dsa/ml_dsa_sample.c b/crypto/ml_dsa/ml_dsa_sample.c
index 5d9dc84a54fa3..d59261e404254 100644
--- a/crypto/ml_dsa/ml_dsa_sample.c
+++ b/crypto/ml_dsa/ml_dsa_sample.c
@@ -8,6 +8,7 @@
  */
 
 #include <openssl/byteorder.h>
+#include <openssl/crypto.h>
 #include "ml_dsa_local.h"
 #include "ml_dsa_vector.h"
 #include "ml_dsa_matrix.h"
@@ -35,6 +36,10 @@ typedef int(COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out);
 static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4;
 static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2;
 
+static ML_DSA_MATRIX_EXPAND_A_FN matrix_expand_A_scalar;
+static ML_DSA_VECTOR_EXPAND_S_FN vector_expand_S_scalar;
+static ML_DSA_VECTOR_EXPAND_MASK_FN vector_expand_mask_scalar;
+
 /**
  * @brief Combine 3 bytes to form an coefficient.
  * See FIPS 204, Algorithm 14, CoeffFromThreeBytes()
@@ -198,7 +203,7 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md,
  *            in the range of 0..q-1.
  * @returns 1 if the matrix was generated, or 0 on error.
  */
-int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+int matrix_expand_A_scalar(EVP_MD_CTX *g_ctx, const EVP_MD *md,
     const uint8_t *rho, MATRIX *out)
 {
     int ret = 0;
@@ -208,7 +213,6 @@ int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
 
     /* The seed used for each matrix element is rho + column_index + row_index */
     memcpy(derived_seed, rho, ML_DSA_RHO_BYTES);
-
     for (i = 0; i < out->k; i++) {
         for (j = 0; j < out->l; j++) {
             derived_seed[ML_DSA_RHO_BYTES + 1] = (uint8_t)i;
@@ -241,7 +245,7 @@ int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
  *           the range (q-eta)..0..eta
  * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
  */
-int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
+int vector_expand_S_scalar(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
     const uint8_t *seed, VECTOR *s1, VECTOR *s2)
 {
     int ret = 0;
@@ -376,3 +380,53 @@ int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_l
     }
     return 1;
 }
+
+void vector_expand_mask_scalar(VECTOR *out, const uint8_t *rho_prime,
+    size_t rho_prime_len, uint32_t kappa, uint32_t gamma1,
+    EVP_MD_CTX *h_ctx, const EVP_MD *md)
+{
+    size_t i;
+    uint8_t derived_seed[ML_DSA_RHO_PRIME_BYTES + 2];
+
+    (void)rho_prime_len;
+
+    memcpy(derived_seed, rho_prime, ML_DSA_RHO_PRIME_BYTES);
+
+    for (i = 0; i < out->num_poly; i++) {
+        size_t index = kappa + i;
+
+        derived_seed[ML_DSA_RHO_PRIME_BYTES] = index & 0xFF;
+        derived_seed[ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF;
+        poly_expand_mask(out->poly + i, derived_seed, sizeof(derived_seed),
+            gamma1, h_ctx, md);
+    }
+}
+
+static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_generic_meth = {
+    matrix_expand_A_scalar,
+    vector_expand_S_scalar,
+    vector_expand_mask_scalar
+};
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void)
+{
+#if defined(KECCAK1600_ASM) && defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+    return ossl_ml_dsa_sample_x86_64_ops();
+#else
+    return ossl_ml_dsa_sample_generic_ops();
+#endif
+}
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void)
+{
+    return &ml_dsa_sample_generic_meth;
+}
+
+#if defined(KECCAK1600_ASM) && defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#include "ml_dsa_sample_hw_x86_64.inc"
+#else
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void)
+{
+    return ossl_ml_dsa_sample_generic_ops();
+}
+#endif
diff --git a/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
new file mode 100644
index 0000000000000..527e0456d1949
--- /dev/null
+++ b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#define ML_DSA_SHAKE_X4_BATCH_SIZE 4
+#define ML_DSA_SHAKE_X4_DONE_MASK ((1 << ML_DSA_SHAKE_X4_BATCH_SIZE) - 1)
+#define ML_DSA_EXPAND_MASK_BYTES_PER_COEFF 32
+#define ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_19 20
+#define ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_17 18
+#define ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19 \
+    (ML_DSA_EXPAND_MASK_BYTES_PER_COEFF * ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_19)
+#define ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_17 \
+    (ML_DSA_EXPAND_MASK_BYTES_PER_COEFF * ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_17)
+#define ML_DSA_EXPAND_MASK_BUF_SIZE(gamma1)         \
+    ((gamma1) == ML_DSA_GAMMA1_TWO_POWER_19         \
+            ? ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19 \
+            : ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_17)
+
+static ossl_unused int rej_ntt_poly_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t seed_len,
+    POLY *outs[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t count)
+{
+    KECCAK1600_X4_CTX ctx;
+    uint8_t blocks[ML_DSA_SHAKE_X4_BATCH_SIZE][SHAKE128_BLOCKSIZE];
+    int coeff_idx[ML_DSA_SHAKE_X4_BATCH_SIZE] = { 0, 0, 0, 0 };
+    size_t done_mask = 0;
+    size_t lane;
+
+    (void)g_ctx;
+    (void)md;
+
+    for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++)
+        done_mask |= (1 << lane);
+
+    ossl_sha3_shake128_x4_inc_init(&ctx);
+    ossl_sha3_shake128_x4_inc_absorb(&ctx, seeds[0], seeds[1],
+        seeds[2], seeds[3], seed_len);
+    ossl_sha3_shake128_x4_inc_finalize(&ctx);
+
+    while (done_mask != ML_DSA_SHAKE_X4_DONE_MASK) {
+        ossl_sha3_shake128_x4_inc_squeeze(blocks[0], blocks[1],
+            blocks[2], blocks[3], SHAKE128_BLOCKSIZE, &ctx);
+
+        for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) {
+            if (done_mask & (1 << lane))
+                continue;
+
+            const uint8_t *b = blocks[lane];
+            const uint8_t *end = b + SHAKE128_BLOCKSIZE;
+
+            for (; b < end && coeff_idx[lane] < ML_DSA_NUM_POLY_COEFFICIENTS; b += 3) {
+                uint32_t *coeff_ptr = &(outs[lane]->coeff[coeff_idx[lane]]);
+
+                if (coeff_from_three_bytes(b, coeff_ptr))
+                    coeff_idx[lane]++;
+            }
+
+            if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS)
+                done_mask |= (1 << lane);
+        }
+    }
+
+    return 1;
+}
+
+static void vector_expand_mask_mb(VECTOR *out, const uint8_t *rho_prime,
+    const size_t rho_prime_len, const uint32_t kappa, const uint32_t gamma1,
+    EVP_MD_CTX *h_ctx, const EVP_MD *md)
+{
+    size_t i;
+    const size_t num_polys = out->num_poly;
+    uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_RHO_PRIME_BYTES + 2];
+    const size_t seed_len = sizeof(derived_seeds[0]);
+    const size_t buf_size = ML_DSA_EXPAND_MASK_BUF_SIZE(gamma1);
+    uint8_t buffers[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19];
+
+    (void)rho_prime_len;
+    (void)h_ctx;
+    (void)md;
+
+    for (i = 0; i < ML_DSA_SHAKE_X4_BATCH_SIZE; i++)
+        memcpy(derived_seeds[i], rho_prime, ML_DSA_RHO_PRIME_BYTES);
+
+    for (i = 0; i + (ML_DSA_SHAKE_X4_BATCH_SIZE - 1) < num_polys; i += ML_DSA_SHAKE_X4_BATCH_SIZE) {
+        size_t b;
+
+        for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+            const size_t index = kappa + i + b;
+
+            derived_seeds[b][ML_DSA_RHO_PRIME_BYTES] = index & 0xFF;
+            derived_seeds[b][ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF;
+        }
+
+        ossl_sha3_shake256_x4(buffers[0], buffers[1], buffers[2], buffers[3], buf_size,
+            derived_seeds[0], derived_seeds[1], derived_seeds[2], derived_seeds[3], seed_len);
+
+        ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 0], buffers[0], buf_size, gamma1);
+        ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 1], buffers[1], buf_size, gamma1);
+        ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 2], buffers[2], buf_size, gamma1);
+        ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 3], buffers[3], buf_size, gamma1);
+    }
+
+    if (i < num_polys) {
+        const size_t left = num_polys - i;
+        size_t b;
+
+        for (b = 0; b < left; b++) {
+            const size_t index = kappa + i + b;
+
+            derived_seeds[b][ML_DSA_RHO_PRIME_BYTES] = (uint8_t)index;
+            derived_seeds[b][ML_DSA_RHO_PRIME_BYTES + 1] = (uint8_t)(index >> 8);
+        }
+
+        ossl_sha3_shake256_x4(buffers[0], buffers[1], buffers[2], buffers[3], buf_size,
+            derived_seeds[0], derived_seeds[1], derived_seeds[2], derived_seeds[3], seed_len);
+
+        ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 0], buffers[0], buf_size, gamma1);
+
+        if ((i + 1) < num_polys)
+            ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 1], buffers[1], buf_size, gamma1);
+
+        if ((i + 2) < num_polys)
+            ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + 2], buffers[2], buf_size, gamma1);
+    }
+}
+
+static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md,
+    COEFF_FROM_NIBBLE_FUNC *coef_from_nibble,
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t seed_len,
+    POLY *outs[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t count)
+{
+    KECCAK1600_X4_CTX ctx;
+    uint8_t blocks[ML_DSA_SHAKE_X4_BATCH_SIZE][SHAKE256_BLOCKSIZE];
+    int coeff_idx[ML_DSA_SHAKE_X4_BATCH_SIZE] = { 0, 0, 0, 0 };
+    size_t done_mask = 0;
+    size_t lane;
+
+    (void)h_ctx;
+    (void)md;
+
+    for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++)
+        done_mask |= (1 << lane);
+
+    ossl_sha3_shake256_x4_inc_init(&ctx);
+    ossl_sha3_shake256_x4_inc_absorb(&ctx, seeds[0], seeds[1],
+        seeds[2], seeds[3], seed_len);
+    ossl_sha3_shake256_x4_inc_finalize(&ctx);
+
+    while (done_mask != ML_DSA_SHAKE_X4_DONE_MASK) {
+        ossl_sha3_shake256_x4_inc_squeeze(blocks[0], blocks[1],
+            blocks[2], blocks[3], SHAKE256_BLOCKSIZE, &ctx);
+
+        for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) {
+            if (done_mask & (1 << lane))
+                continue;
+
+            const uint8_t *b = blocks[lane];
+            const uint8_t *end = b + SHAKE256_BLOCKSIZE;
+
+            for (; b < end && coeff_idx[lane] < ML_DSA_NUM_POLY_COEFFICIENTS; b++) {
+                uint32_t z0 = *b & 0x0F;
+                uint32_t z1 = *b >> 4;
+
+                if (coef_from_nibble(z0, &outs[lane]->coeff[coeff_idx[lane]]))
+                    coeff_idx[lane]++;
+
+                if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) {
+                    done_mask |= (1 << lane);
+                    break;
+                }
+
+                if (coef_from_nibble(z1, &outs[lane]->coeff[coeff_idx[lane]]))
+                    coeff_idx[lane]++;
+
+                if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) {
+                    done_mask |= (1 << lane);
+                    break;
+                }
+            }
+        }
+    }
+
+    return 1;
+}
+
+static int matrix_expand_A_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+    const uint8_t *rho, MATRIX *out)
+{
+    size_t b, idx;
+    uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_RHO_BYTES + 2];
+    const size_t seed_len = sizeof(derived_seeds[0]);
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    POLY *polys[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    POLY *poly = out->m_poly;
+
+    for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+        memcpy(derived_seeds[b], rho, ML_DSA_RHO_BYTES);
+        seeds[b] = derived_seeds[b];
+    }
+
+    for (idx = 0; (idx + ML_DSA_SHAKE_X4_BATCH_SIZE - 1) < (out->k * out->l);
+         idx += ML_DSA_SHAKE_X4_BATCH_SIZE) {
+        for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+            const size_t row = (idx + b) / out->l;
+            const size_t col = (idx + b) % out->l;
+
+            derived_seeds[b][ML_DSA_RHO_BYTES] = (uint8_t)col;
+            derived_seeds[b][ML_DSA_RHO_BYTES + 1] = (uint8_t)row;
+            polys[b] = &poly[idx + b];
+        }
+
+        if (!rej_ntt_poly_mb(g_ctx, md, seeds, seed_len, polys, 4))
+            return 0;
+    }
+
+    if (idx < (out->k * out->l)) {
+        const size_t left = (out->k * out->l) - idx;
+
+        for (b = 0; b < left; b++) {
+            const size_t row = (idx + b) / out->l;
+            const size_t col = (idx + b) % out->l;
+
+            derived_seeds[b][ML_DSA_RHO_BYTES] = (uint8_t)col;
+            derived_seeds[b][ML_DSA_RHO_BYTES + 1] = (uint8_t)row;
+            polys[b] = &poly[idx + b];
+        }
+
+        if (!rej_ntt_poly_mb(g_ctx, md, seeds, seed_len, polys, left))
+            return 0;
+    }
+
+    return 1;
+}
+
+static int vector_expand_S_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md, const int eta,
+    const uint8_t *seed, VECTOR *s1, VECTOR *s2)
+{
+    size_t b, idx;
+    const size_t l = s1->num_poly;
+    const size_t total = l + s2->num_poly;
+    uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_PRIV_SEED_BYTES + 2];
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    const size_t seed_len = sizeof(derived_seeds[0]);
+    POLY *polys[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ? coeff_from_nibble_4 : coeff_from_nibble_2;
+
+    for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+        memcpy(derived_seeds[b], seed, ML_DSA_PRIV_SEED_BYTES);
+        seeds[b] = derived_seeds[b];
+    }
+
+    for (idx = 0; (idx + ML_DSA_SHAKE_X4_BATCH_SIZE - 1) < total; idx += ML_DSA_SHAKE_X4_BATCH_SIZE) {
+        for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+            const size_t poly_idx = idx + b;
+
+            derived_seeds[b][ML_DSA_PRIV_SEED_BYTES] = (uint8_t)(poly_idx);
+            derived_seeds[b][ML_DSA_PRIV_SEED_BYTES + 1] = (uint8_t)(poly_idx >> 8);
+
+            if (poly_idx < l)
+                polys[b] = &s1->poly[poly_idx];
+            else
+                polys[b] = &s2->poly[poly_idx - l];
+        }
+
+        if (!rej_bounded_poly_mb(h_ctx, md, coef_from_nibble_fn,
+                seeds, seed_len, polys, ML_DSA_SHAKE_X4_BATCH_SIZE))
+            return 0;
+    }
+
+    if (idx < total) {
+        const size_t batch_count = total - idx;
+
+        for (b = 0; b < batch_count; b++) {
+            const size_t poly_idx = idx + b;
+
+            derived_seeds[b][ML_DSA_PRIV_SEED_BYTES] = (uint8_t)(poly_idx);
+            derived_seeds[b][ML_DSA_PRIV_SEED_BYTES + 1] = (uint8_t)(poly_idx >> 8);
+
+            if (poly_idx < l)
+                polys[b] = &s1->poly[poly_idx];
+            else
+                polys[b] = &s2->poly[poly_idx - l];
+        }
+
+        if (!rej_bounded_poly_mb(h_ctx, md, coef_from_nibble_fn,
+                seeds, seed_len, polys, batch_count))
+            return 0;
+    }
+
+    return 1;
+}
+
+static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_x86_64 = {
+    matrix_expand_A_mb,
+    vector_expand_S_mb,
+    vector_expand_mask_mb
+};
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void)
+{
+    if (SHA3_avx512vl_capable())
+        return &ml_dsa_sample_x86_64;
+    return ossl_ml_dsa_sample_generic_ops();
+}
diff --git a/crypto/ml_dsa/ml_dsa_sign.c b/crypto/ml_dsa/ml_dsa_sign.c
index 51c2709ddbaf9..b42323266aad4 100644
--- a/crypto/ml_dsa/ml_dsa_sign.c
+++ b/crypto/ml_dsa/ml_dsa_sign.c
@@ -164,6 +164,7 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
     uint8_t *out_sig)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     const ML_DSA_PARAMS *params = priv->params;
     EVP_MD_CTX *md_ctx = NULL;
     uint32_t k = (uint32_t)params->k, l = (uint32_t)params->l;
@@ -232,7 +233,7 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
     CONSTTIME_SECRET_VECTOR(priv->s2);
     CONSTTIME_SECRET_VECTOR(priv->t0);
 
-    if (!matrix_expand_A(md_ctx, priv->shake128_md, priv->rho, &a_ntt))
+    if (!sample_ops->matrix_expand_A(md_ctx, priv->shake128_md, priv->rho, &a_ntt))
         goto err;
 
     /*
@@ -263,8 +264,8 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
         VECTOR *ct0 = &w1;
         uint32_t z_max, r0_max, ct0_max, h_ones;
 
-        vector_expand_mask(&y, rho_prime, sizeof(rho_prime), (uint32_t)kappa,
-            gamma1, md_ctx, priv->shake256_md);
+        sample_ops->vector_expand_mask(&y, rho_prime, sizeof(rho_prime),
+            (uint32_t)kappa, gamma1, md_ctx, priv->shake256_md);
         vector_copy(y_ntt, &y);
         vector_ntt(y_ntt);
 
@@ -380,6 +381,7 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
     const uint8_t *sig_enc, size_t sig_enc_len)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     uint8_t *alloc = NULL, *w1_encoded;
     POLY *p, *c_ntt;
     MATRIX a_ntt;
@@ -428,7 +430,7 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
     vector_init(&ct1_ntt, p + k, k);
 
     if (!ossl_ml_dsa_sig_decode(&sig, sig_enc, sig_enc_len, pub->params)
-        || !matrix_expand_A(md_ctx, pub->shake128_md, pub->rho, &a_ntt))
+        || !sample_ops->matrix_expand_A(md_ctx, pub->shake128_md, pub->rho, &a_ntt))
         goto err;
 
     /* Compute verifiers challenge c_ntt = NTT(SampleInBall(c_tilde)) */
diff --git a/crypto/ml_dsa/ml_dsa_vector.h b/crypto/ml_dsa/ml_dsa_vector.h
index 0693eb6e3c30c..389c0ed045338 100644
--- a/crypto/ml_dsa/ml_dsa_vector.h
+++ b/crypto/ml_dsa/ml_dsa_vector.h
@@ -149,33 +149,6 @@ vector_mult_scalar(const VECTOR *lhs, const POLY *rhs, VECTOR *out)
         ossl_ml_dsa_poly_ntt_mult(lhs->poly + i, rhs, out->poly + i);
 }
 
-static ossl_inline ossl_unused int
-vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
-    const uint8_t *seed, VECTOR *s1, VECTOR *s2)
-{
-    return ossl_ml_dsa_vector_expand_S(h_ctx, md, eta, seed, s1, s2);
-}
-
-static ossl_inline ossl_unused void
-vector_expand_mask(VECTOR *out, const uint8_t *rho_prime, size_t rho_prime_len,
-    uint32_t kappa, uint32_t gamma1,
-    EVP_MD_CTX *h_ctx, const EVP_MD *md)
-{
-    size_t i;
-    uint8_t derived_seed[ML_DSA_RHO_PRIME_BYTES + 2];
-
-    memcpy(derived_seed, rho_prime, ML_DSA_RHO_PRIME_BYTES);
-
-    for (i = 0; i < out->num_poly; i++) {
-        size_t index = kappa + i;
-
-        derived_seed[ML_DSA_RHO_PRIME_BYTES] = index & 0xFF;
-        derived_seed[ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF;
-        poly_expand_mask(out->poly + i, derived_seed, sizeof(derived_seed),
-            gamma1, h_ctx, md);
-    }
-}
-
 /* Scale back previously rounded value */
 static ossl_inline ossl_unused void
 vector_scale_power2_round_ntt(const VECTOR *in, VECTOR *out)
diff --git a/crypto/sha/asm/keccak1600x4-avx512vl.pl b/crypto/sha/asm/keccak1600x4-avx512vl.pl
new file mode 100755
index 0000000000000..cf52b190407e8
--- /dev/null
+++ b/crypto/sha/asm/keccak1600x4-avx512vl.pl
@@ -0,0 +1,2343 @@
+#!/usr/bin/env perl
+#
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+###############################################################################
+# Keccak x4 AVX512VL SHA3/SHAKE Assembly Routines
+#
+# Description:
+#   This file emits x86_64 assembly for AVX512VL accelerated Keccak-f[1600]
+#   processing of 4 independent states in parallel ("x4").
+#
+#   It provides the core 24-round Keccak permutation and x4 helper routines
+#   used by SHA3 and SHAKE absorb/finalize/squeeze paths. Data from four
+#   input/output lanes is packed across YMM registers so lane-local operations
+#   execute in SIMD.
+#
+###############################################################################
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$avx512vl = 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Check for AVX512VL support in assembler
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version (\d+)\.(\d+)/) {
+  my ($gas_major, $gas_minor) = ($1, $2);
+  $avx512vl = ($gas_major > 2 || ($gas_major == 2 && $gas_minor >= 26));
+}
+
+if (!$avx512vl
+  && $win64
+  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
+  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
+{
+  $avx512vl = ($1 >= 2.12);
+}
+
+if (!$avx512vl && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+    $avx512vl = ($2>=3.9);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$arg1="%rdi";
+$arg2="%rsi";
+$arg3="%rdx";
+$arg4="%rcx";
+$arg5="%r8";
+$arg6="%r9";
+$roundn="%r13d";
+$tblptr="%r14";
+
+# Define SHAKE rates
+$SHAKE128_RATE="\$168";
+$SHAKE256_RATE="\$136";
+
+# Stack frame offsets for SHAKE x4 wrapper functions
+$STATE_SIZE="808";    # (25 * 8 * 4) + 8 = 808 bytes
+$sf_arg1="0";
+$sf_arg2="8";
+$sf_arg3="16";
+$sf_arg4="24";
+$sf_arg5="32";
+$sf_state_ptr="40";
+$sf_state_x4="48";
+$sf_size="856";       # 48 + 808 = 856 bytes
+
+# Emit an internal helper call used by one-shot wrappers.
+# - Win64: call the provided *_internal shim and bracket it with 32-byte
+#   shadow space so shim entry can use xlate-compatible [rsp+8]/[rsp+16].
+# - non-Win64: call the public API symbol (same base name without _internal).
+# The argument must be the shim/internal symbol name, e.g.
+#   SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+sub call_internal {
+    my ($shim_name) = @_;
+    my $external_name = $shim_name;
+
+    $external_name =~ s/_internal$//;
+
+    return <<___ if ($win64);
+    sub     \$32, %rsp
+    call    $shim_name
+    add     \$32, %rsp
+___
+
+    return <<___;
+    call    $external_name
+___
+}
+
+if ($avx512vl>0) {{{
+
+# AVX512VL feature bit (bit 31 in OPENSSL_ia32cap_P+8)
+my $avx512vl_mask = (1<<31);
+
+$code .= <<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl  SHA3_avx512vl_capable
+.type   SHA3_avx512vl_capable,\@abi-omnipotent
+.align 32
+SHA3_avx512vl_capable:
+    mov     OPENSSL_ia32cap_P+8(%rip), %rcx
+    xor     %eax, %eax
+    and     \$$avx512vl_mask, %ecx
+    cmovnz  %ecx, %eax
+    ret
+.size   SHA3_avx512vl_capable, .-SHA3_avx512vl_capable
+___
+
+$code.=<<___;
+.text
+
+# Perform Keccak permutation
+#
+# YMM registers 0 to 24 are used as Keccak state registers.
+# This function, as is, can work on 1 to 4 independent states at the same time.
+#
+# There is no clear boundary between Theta, Rho, Pi, Chi and Iota steps.
+# Instructions corresponding to these steps overlap for better efficiency.
+#
+# Arguments:
+# ymm0-ymm24    [in/out]    Keccak state registers (one SIMD per one state register)
+# ymm25-ymm31   [clobbered] temporary SIMD registers
+# $roundn       [clobbered] used for round tracking
+# $tblptr       [clobbered] used for access to SHA3 constant table
+.type keccak_1600_permute,\@abi-omnipotent
+.align  32
+keccak_1600_permute:
+.cfi_startproc
+    mov     \$24, $roundn        # 24 rounds
+    lea     iotas(%rip), $tblptr # Load the address of the SHA3 round constants
+
+.align  32
+.Lkeccak_rnd_loop:
+    # Theta step
+
+    # Compute column parities
+    # C[5] = [0, 0, 0, 0, 0]
+    # for x in 0 to 4:
+    #     C[x] = state[x][0] XOR state[x][1] XOR state[x][2] XOR state[x][3] XOR state[x][4]
+
+    vmovdqa64   %ymm0, %ymm25
+    vpternlogq  \$0x96, %ymm5, %ymm10, %ymm25
+    vmovdqa64   %ymm1, %ymm26
+    vpternlogq  \$0x96, %ymm11, %ymm6, %ymm26
+    vmovdqa64   %ymm2, %ymm27
+    vpternlogq  \$0x96, %ymm12, %ymm7, %ymm27
+
+    vmovdqa64   %ymm3, %ymm28
+    vpternlogq  \$0x96, %ymm13, %ymm8, %ymm28
+    vmovdqa64   %ymm4, %ymm29
+    vpternlogq  \$0x96, %ymm14, %ymm9, %ymm29
+    vpternlogq  \$0x96, %ymm20, %ymm15, %ymm25
+
+    vpternlogq  \$0x96, %ymm21, %ymm16, %ymm26
+    vpternlogq  \$0x96, %ymm22, %ymm17, %ymm27
+    vpternlogq  \$0x96, %ymm23, %ymm18, %ymm28
+
+    # Start computing D values and keep computing column parity
+    # D[5] = [0, 0, 0, 0, 0]
+    # for x in 0 to 4:
+    #     D[x] = C[(x+4) mod 5] XOR ROTATE_LEFT(C[(x+1) mod 5], 1)
+
+    vprolq      \$1, %ymm26, %ymm30
+    vprolq      \$1, %ymm27, %ymm31
+    vpternlogq  \$0x96, %ymm24, %ymm19, %ymm29
+
+    # Continue computing D values and apply Theta
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = state[x][y] XOR D[x]
+
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm0
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm10
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm20
+
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm5
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm15
+    vprolq      \$1, %ymm28, %ymm30
+
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm6
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm16
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm1
+
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm11
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm21
+    vprolq      \$1, %ymm29, %ymm31
+
+    vpbroadcastq    ($tblptr), %ymm29 # Load the round constant into ymm29 (Iota)
+    add         \$8, $tblptr          # Increment the pointer to the next round constant
+
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm12
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm7
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm22
+
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm17
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm2
+    vprolq      \$1, %ymm25, %ymm30
+
+    # Rho step
+    # Keep applying Theta and start Rho step
+    #
+    # ROTATION_OFFSETS[5][5] = [
+    #     [0, 1, 62, 28, 27],
+    #     [36, 44, 6, 55, 20],
+    #     [3, 10, 43, 25, 39],
+    #     [41, 45, 15, 21, 8],
+    #     [18, 2, 61, 56, 14] ]
+    #
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = ROTATE_LEFT(state[x][y], ROTATION_OFFSETS[x][y])
+
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm3
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm13
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm23
+
+    vprolq      \$44, %ymm6, %ymm6
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm18
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm8
+
+    vprolq      \$43, %ymm12, %ymm12
+    vprolq      \$21, %ymm18, %ymm18
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm24
+
+    vprolq      \$14, %ymm24, %ymm24
+    vprolq      \$28, %ymm3, %ymm3
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm9
+
+    vprolq      \$20, %ymm9, %ymm9
+    vprolq      \$3, %ymm10, %ymm10
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm19
+
+    vprolq      \$45, %ymm16, %ymm16
+    vprolq      \$61, %ymm22, %ymm22
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm4
+
+    vprolq      \$1, %ymm1, %ymm1
+    vprolq      \$6, %ymm7, %ymm7
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm14
+
+    # Continue with Rho and start Pi and Chi steps at the same time
+    # Ternary logic 0xD2 is used for Chi step
+    #
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = state[x][y] XOR ((NOT state[(x+1) mod 5][y]) AND state[(x+2) mod 5][y])
+
+    vprolq      \$25, %ymm13, %ymm13
+    vprolq      \$8, %ymm19, %ymm19
+    vmovdqa64   %ymm0, %ymm30
+    vpternlogq  \$0xD2, %ymm12, %ymm6, %ymm30
+
+    vprolq      \$18, %ymm20, %ymm20
+    vprolq      \$27, %ymm4, %ymm4
+    vpxorq      %ymm29, %ymm30, %ymm30 # Iota step
+
+    vprolq      \$36, %ymm5, %ymm5
+    vprolq      \$10, %ymm11, %ymm11
+    vmovdqa64   %ymm6, %ymm31
+    vpternlogq  \$0xD2, %ymm18, %ymm12, %ymm31
+
+    vprolq      \$15, %ymm17, %ymm17
+    vprolq      \$56, %ymm23, %ymm23
+    vpternlogq  \$0xD2, %ymm24, %ymm18, %ymm12
+
+    vprolq      \$62, %ymm2, %ymm2
+    vprolq      \$55, %ymm8, %ymm8
+    vpternlogq  \$0xD2, %ymm0, %ymm24, %ymm18
+
+    vprolq      \$39, %ymm14, %ymm14
+    vprolq      \$41, %ymm15, %ymm15
+    vpternlogq  \$0xD2, %ymm6, %ymm0, %ymm24
+    vmovdqa64   %ymm30, %ymm0
+    vmovdqa64   %ymm31, %ymm6
+
+    vprolq      \$2, %ymm21, %ymm21
+    vmovdqa64   %ymm3, %ymm30
+    vpternlogq  \$0xD2, %ymm10, %ymm9, %ymm30
+    vmovdqa64   %ymm9, %ymm31
+    vpternlogq  \$0xD2, %ymm16, %ymm10, %ymm31
+
+    vpternlogq  \$0xD2, %ymm22, %ymm16, %ymm10
+    vpternlogq  \$0xD2, %ymm3, %ymm22, %ymm16
+    vpternlogq  \$0xD2, %ymm9, %ymm3, %ymm22
+    vmovdqa64   %ymm30, %ymm3
+    vmovdqa64   %ymm31, %ymm9
+
+    vmovdqa64   %ymm1, %ymm30
+    vpternlogq  \$0xD2, %ymm13, %ymm7, %ymm30
+    vmovdqa64   %ymm7, %ymm31
+    vpternlogq  \$0xD2, %ymm19, %ymm13, %ymm31
+    vpternlogq  \$0xD2, %ymm20, %ymm19, %ymm13
+
+    vpternlogq  \$0xD2, %ymm1, %ymm20, %ymm19
+    vpternlogq  \$0xD2, %ymm7, %ymm1, %ymm20
+    vmovdqa64   %ymm30, %ymm1
+    vmovdqa64   %ymm31, %ymm7
+    vmovdqa64   %ymm4, %ymm30
+    vpternlogq  \$0xD2, %ymm11, %ymm5, %ymm30
+
+    vmovdqa64   %ymm5, %ymm31
+    vpternlogq  \$0xD2, %ymm17, %ymm11, %ymm31
+    vpternlogq  \$0xD2, %ymm23, %ymm17, %ymm11
+    vpternlogq  \$0xD2, %ymm4, %ymm23, %ymm17
+
+    vpternlogq  \$0xD2, %ymm5, %ymm4, %ymm23
+    vmovdqa64   %ymm30, %ymm4
+    vmovdqa64   %ymm31, %ymm5
+    vmovdqa64   %ymm2, %ymm30
+    vpternlogq  \$0xD2, %ymm14, %ymm8, %ymm30
+    vmovdqa64   %ymm8, %ymm31
+    vpternlogq  \$0xD2, %ymm15, %ymm14, %ymm31
+
+    vpternlogq  \$0xD2, %ymm21, %ymm15, %ymm14
+    vpternlogq  \$0xD2, %ymm2, %ymm21, %ymm15
+    vpternlogq  \$0xD2, %ymm8, %ymm2, %ymm21
+    vmovdqa64   %ymm30, %ymm2
+    vmovdqa64   %ymm31, %ymm8
+
+    # Complete the steps and get updated state registers in ymm0 to ymm24
+    vmovdqa64   %ymm3,  %ymm30
+    vmovdqa64   %ymm18, %ymm3
+    vmovdqa64   %ymm17, %ymm18
+    vmovdqa64   %ymm11, %ymm17
+    vmovdqa64   %ymm7,  %ymm11
+    vmovdqa64   %ymm10, %ymm7
+    vmovdqa64   %ymm1,  %ymm10
+    vmovdqa64   %ymm6,  %ymm1
+    vmovdqa64   %ymm9,  %ymm6
+    vmovdqa64   %ymm22, %ymm9
+    vmovdqa64   %ymm14, %ymm22
+    vmovdqa64   %ymm20, %ymm14
+    vmovdqa64   %ymm2,  %ymm20
+    vmovdqa64   %ymm12, %ymm2
+    vmovdqa64   %ymm13, %ymm12
+    vmovdqa64   %ymm19, %ymm13
+    vmovdqa64   %ymm23, %ymm19
+    vmovdqa64   %ymm15, %ymm23
+    vmovdqa64   %ymm4,  %ymm15
+    vmovdqa64   %ymm24, %ymm4
+    vmovdqa64   %ymm21, %ymm24
+    vmovdqa64   %ymm8,  %ymm21
+    vmovdqa64   %ymm16, %ymm8
+    vmovdqa64   %ymm5,  %ymm16
+    vmovdqa64   %ymm30, %ymm5
+
+    dec         $roundn           # Decrement the round counter
+    jnz         .Lkeccak_rnd_loop # Jump to the start of the loop if r13d is not zero
+    ret
+.cfi_endproc
+.size   keccak_1600_permute,.-keccak_1600_permute
+
+# Initialize YMM registers 0-24 to zero
+.globl  keccak_1600_init_state
+.type   keccak_1600_init_state,\@abi-omnipotent
+.align  32
+keccak_1600_init_state:
+.cfi_startproc
+    vpxorq      %ymm0, %ymm0, %ymm0
+    vmovdqa64   %ymm0, %ymm1
+    vmovdqa64   %ymm0, %ymm2
+    vmovdqa64   %ymm0, %ymm3
+    vmovdqa64   %ymm0, %ymm4
+    vmovdqa64   %ymm0, %ymm5
+    vmovdqa64   %ymm0, %ymm6
+    vmovdqa64   %ymm0, %ymm7
+    vmovdqa64   %ymm0, %ymm8
+    vmovdqa64   %ymm0, %ymm9
+    vmovdqa64   %ymm0, %ymm10
+    vmovdqa64   %ymm0, %ymm11
+    vmovdqa64   %ymm0, %ymm12
+    vmovdqa64   %ymm0, %ymm13
+    vmovdqa64   %ymm0, %ymm14
+    vmovdqa64   %ymm0, %ymm15
+    vmovdqa64   %ymm0, %ymm16
+    vmovdqa64   %ymm0, %ymm17
+    vmovdqa64   %ymm0, %ymm18
+    vmovdqa64   %ymm0, %ymm19
+    vmovdqa64   %ymm0, %ymm20
+    vmovdqa64   %ymm0, %ymm21
+    vmovdqa64   %ymm0, %ymm22
+    vmovdqa64   %ymm0, %ymm23
+    vmovdqa64   %ymm0, %ymm24
+    ret
+.cfi_endproc
+.size   keccak_1600_init_state,.-keccak_1600_init_state
+
+.globl  keccak_1600_load_state_x4
+.type   keccak_1600_load_state_x4,\@abi-omnipotent
+.align  32
+keccak_1600_load_state_x4:
+.cfi_startproc
+    vmovdqu64   32*0($arg1),  %ymm0
+    vmovdqu64   32*1($arg1),  %ymm1
+    vmovdqu64   32*2($arg1),  %ymm2
+    vmovdqu64   32*3($arg1),  %ymm3
+    vmovdqu64   32*4($arg1),  %ymm4
+    vmovdqu64   32*5($arg1),  %ymm5
+    vmovdqu64   32*6($arg1),  %ymm6
+    vmovdqu64   32*7($arg1),  %ymm7
+    vmovdqu64   32*8($arg1),  %ymm8
+    vmovdqu64   32*9($arg1),  %ymm9
+    vmovdqu64   32*10($arg1), %ymm10
+    vmovdqu64   32*11($arg1), %ymm11
+    vmovdqu64   32*12($arg1), %ymm12
+    vmovdqu64   32*13($arg1), %ymm13
+    vmovdqu64   32*14($arg1), %ymm14
+    vmovdqu64   32*15($arg1), %ymm15
+    vmovdqu64   32*16($arg1), %ymm16
+    vmovdqu64   32*17($arg1), %ymm17
+    vmovdqu64   32*18($arg1), %ymm18
+    vmovdqu64   32*19($arg1), %ymm19
+    vmovdqu64   32*20($arg1), %ymm20
+    vmovdqu64   32*21($arg1), %ymm21
+    vmovdqu64   32*22($arg1), %ymm22
+    vmovdqu64   32*23($arg1), %ymm23
+    vmovdqu64   32*24($arg1), %ymm24
+    ret
+.cfi_endproc
+.size   keccak_1600_load_state_x4,.-keccak_1600_load_state_x4
+
+
+.globl  keccak_1600_save_state_x4
+.type   keccak_1600_save_state_x4,\@abi-omnipotent
+.align  32
+keccak_1600_save_state_x4:
+.cfi_startproc
+    vmovdqu64   %ymm0,  32*0($arg1)
+    vmovdqu64   %ymm1,  32*1($arg1)
+    vmovdqu64   %ymm2,  32*2($arg1)
+    vmovdqu64   %ymm3,  32*3($arg1)
+    vmovdqu64   %ymm4,  32*4($arg1)
+    vmovdqu64   %ymm5,  32*5($arg1)
+    vmovdqu64   %ymm6,  32*6($arg1)
+    vmovdqu64   %ymm7,  32*7($arg1)
+    vmovdqu64   %ymm8,  32*8($arg1)
+    vmovdqu64   %ymm9,  32*9($arg1)
+    vmovdqu64   %ymm10, 32*10($arg1)
+    vmovdqu64   %ymm11, 32*11($arg1)
+    vmovdqu64   %ymm12, 32*12($arg1)
+    vmovdqu64   %ymm13, 32*13($arg1)
+    vmovdqu64   %ymm14, 32*14($arg1)
+    vmovdqu64   %ymm15, 32*15($arg1)
+    vmovdqu64   %ymm16, 32*16($arg1)
+    vmovdqu64   %ymm17, 32*17($arg1)
+    vmovdqu64   %ymm18, 32*18($arg1)
+    vmovdqu64   %ymm19, 32*19($arg1)
+    vmovdqu64   %ymm20, 32*20($arg1)
+    vmovdqu64   %ymm21, 32*21($arg1)
+    vmovdqu64   %ymm22, 32*22($arg1)
+    vmovdqu64   %ymm23, 32*23($arg1)
+    vmovdqu64   %ymm24, 32*24($arg1)
+    ret
+.cfi_endproc
+.size   keccak_1600_save_state_x4,.-keccak_1600_save_state_x4
+
+
+# Add input data to state when message length is less than rate
+# Arguments:
+#   r10:        state pointer to absorb into (clobbered)
+#   arg2 (rsi): message pointer lane 0 (updated on output)
+#   arg3 (rdx): message pointer lane 1 (updated on output)
+#   arg4 (rcx): message pointer lane 2 (updated on output)
+#   arg5 (r8):  message pointer lane 3 (updated on output)
+#   r12:        length in bytes (clobbered on output)
+# Clobbers: r9, rbx, r15, k1, ymm31-ymm29
+.globl  keccak_1600_partial_add_x4
+.type   keccak_1600_partial_add_x4,\@abi-omnipotent
+.align  32
+keccak_1600_partial_add_x4:
+.cfi_startproc
+    mov     8*100(%r10), %r9
+    test    \$7, %r9d
+    jz      .Lstart_aligned_to_4x8
+
+    # Start offset is not aligned to register size
+    mov     %r9, %r15 # %r15 = s[100]
+
+    and     \$7, %r9d
+    neg     %r9d
+    add     \$8, %r9d     # register capacity = 8 - (offset % 8)
+    cmp     %r9d, %r12d
+    cmovnae   %r12d, %r9d # %r9d = min(register capacity, length)
+
+    lea     byte_kmask_0_to_7(%rip), %rbx
+    kmovb   (%rbx,%r9), %k1 # message load mask
+
+    mov     %r15, %rbx
+    and     \$~7, %ebx
+    lea     (%r10,%rbx,4), %r10 # get to state starting register
+
+    mov     %r15, %rbx
+    and     \$7, %ebx
+
+    vmovdqu8    (%r10), %ymm31 # load & store / allocate SB for the register
+    vmovdqu8    %ymm31, (%r10)
+
+    vmovdqu8    ($arg2), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 0
+    vmovdqu8    8*0(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 0
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*0(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 0
+
+    vmovdqu8    ($arg3), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 1
+    vmovdqu8    8*1(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 1
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*1(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 1
+
+    vmovdqu8    ($arg4), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 2
+    vmovdqu8    8*2(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 2
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*2(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 2
+
+    vmovdqu8    ($arg5), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 3
+    vmovdqu8    8*3(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 3
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*3(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 3
+
+    sub     %r9, %r12
+    jz      .Lzero_bytes
+
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+    add     %r9, $arg5
+    add     \$32, %r10
+    xor     %r9, %r9
+    jmp     .Lymm_loop
+
+.Lstart_aligned_to_4x8:
+    lea     (%r10,%r9,4), %r10
+    xor     %r9, %r9
+
+.align  32
+.Lymm_loop:
+    cmp     \$8, %r12d
+    jb      .Llt_8_bytes
+
+    vmovq       ($arg2,%r9), %xmm31              # Read 8 bytes from lane 0
+    vpinsrq     \$1, ($arg3,%r9), %xmm31, %xmm31 # Read 8 bytes from lane 1
+    vmovq       ($arg4,%r9), %xmm30              # Read 8 bytes from lane 2
+    vpinsrq     \$1, ($arg5,%r9),%xmm30, %xmm30  # Read 8 bytes from lane 3
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+    vpxorq      (%r10,%r9,4), %ymm31, %ymm31     # Add data with the state
+    vmovdqu64   %ymm31, (%r10,%r9,4)
+    add     \$8, %r9
+    sub     \$8, %r12
+    jz      .Lzero_bytes
+
+    jmp     .Lymm_loop
+
+.align  32
+.Lzero_bytes:
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+    add     %r9, $arg5
+    ret
+
+.align  32
+.Llt_8_bytes:
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+    add     %r9, $arg5
+    lea     (%r10,%r9,4), %r10
+
+    lea     byte_kmask_0_to_7(%rip), %rbx
+    kmovb   (%rbx,%r12), %k1 # message load mask
+
+    vmovdqu8    ($arg2), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 0
+    vmovdqu8    ($arg3), %xmm30{%k1}{z} # Read 1 to 7 bytes from lane 1
+    vpunpcklqdq %xmm30, %xmm31, %xmm31  # Interleave data from lane 0 and lane 1
+    vmovdqu8    ($arg4), %xmm30{%k1}{z} # Read 1 to 7 bytes from lane 2
+    vmovdqu8    ($arg5), %xmm29{%k1}{z} # Read 1 to 7 bytes from lane 3
+    vpunpcklqdq %xmm29, %xmm30, %xmm30  # Interleave data from lane 2 and lane 3
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+
+    vpxorq      (%r10), %ymm31, %ymm31 # Add data to the state
+    vmovdqu64   %ymm31, (%r10)         # Update state in memory
+
+    add     %r12, $arg2 # increment message pointer lane 0
+    add     %r12, $arg3 # increment message pointer lane 1
+    add     %r12, $arg4 # increment message pointer lane 2
+    add     %r12, $arg5 # increment message pointer lane 3
+    ret
+.cfi_endproc
+.size   keccak_1600_partial_add_x4,.-keccak_1600_partial_add_x4
+
+
+# Extract bytes from state and write to outputs
+# Arguments:
+#   r10:        state pointer to start extracting from (clobbered)
+#   arg1 (rdi): output pointer lane 0 (updated on output)
+#   arg2 (rsi): output pointer lane 1 (updated on output)
+#   arg3 (rdx): output pointer lane 2 (updated on output)
+#   arg4 (rcx): output pointer lane 3 (updated on output)
+#   r12:        length in bytes (clobbered on output)
+#   r11:        state offset to start extract from
+.globl  keccak_1600_extract_bytes_x4
+.type   keccak_1600_extract_bytes_x4,\@abi-omnipotent
+.align  32
+keccak_1600_extract_bytes_x4:
+.cfi_startproc
+    or      %r12, %r12
+    jz      .Lextract_zero_bytes
+
+    test    \$7, %r11d
+    jz      .Lextract_start_aligned_to_4x8
+
+    # Extract offset is not aligned to the register size (8 bytes)
+    mov     %r11, %r9
+
+    and     \$7, %r9d
+    neg     %r9d
+    add     \$8, %r9d     # register capacity = 8 - (offset % 8)
+    cmp     %r9d, %r12d
+    cmovnae   %r12d, %r9d # %r9d = min(register capacity, length)
+
+    lea     byte_kmask_0_to_7(%rip), %rbx
+    kmovb   (%rbx,%r9), %k1 # message store mask
+
+    mov     %r11, %rbx
+    and     \$~7, %ebx
+    lea     (%r10,%rbx,4), %r10 # get to state starting register
+
+    mov     %r11, %rbx
+    and     \$7, %ebx
+
+    vmovdqu8    8*0(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 0
+    vmovdqu8    %xmm31, ($arg1){%k1}           # Write 1-7 bytes to lane 0 output
+
+    vmovdqu8    8*1(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 1
+    vmovdqu8    %xmm31, ($arg2){%k1}           # Write 1-7 bytes to lane 1 output
+
+    vmovdqu8    8*2(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 2
+    vmovdqu8    %xmm31, ($arg3){%k1}           # Write 1-7 bytes to lane 2 output
+
+    vmovdqu8    8*3(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 3
+    vmovdqu8    %xmm31, ($arg4){%k1}           # Write 1-7 bytes to lane 3 output
+
+    # Increment output registers
+    add     %r9, $arg1
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+
+    # Decrement length to extract
+    sub     %r9, %r12
+    jz      .Lextract_zero_bytes
+
+    # More data to extract, update state register pointer
+    add     \$32, %r10
+    xor     %r9, %r9
+    jmp     .Lextract_ymm_loop
+
+.Lextract_start_aligned_to_4x8:
+        lea     (%r10,%r11,4), %r10
+        xor     %r9, %r9
+
+.align  32
+.Lextract_ymm_loop:
+    cmp     \$8, %r12
+    jb      .Lextract_lt_8_bytes
+
+    vmovdqu64   (%r10), %xmm31
+    vmovdqu64   16(%r10), %xmm30
+    vmovq       %xmm31, ($arg1,%r9)
+    vpextrq     \$1, %xmm31, ($arg2,%r9)
+    vmovq       %xmm30, ($arg3,%r9)
+    vpextrq     \$1, %xmm30, ($arg4,%r9)
+    add     \$8, %r9
+    sub     \$8, %r12
+    jz      .Lzero_bytes_left
+
+    add     \$32, %r10
+    jmp     .Lextract_ymm_loop
+
+.align  32
+.Lzero_bytes_left:
+    # Increment output pointers
+    add     %r9, $arg1
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+.Lextract_zero_bytes:
+    ret
+
+.align  32
+.Lextract_lt_8_bytes:
+    add     %r9, $arg1
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+
+    lea     byte_kmask_0_to_7(%rip), %r9
+    kmovb   (%r9,%r12), %k1 # k1 is the mask of message bytes to read
+
+    vmovq       0*8(%r10), %xmm31    # Read 8 bytes from state lane 0
+    vmovdqu8    %xmm31, ($arg1){%k1} # Extract 1-7 bytes into output 0
+    vmovq       1*8(%r10), %xmm31    # Read 8 bytes from state lane 1
+    vmovdqu8    %xmm31, ($arg2){%k1} # Extract 1-7 bytes into output 1
+    vmovq       2*8(%r10), %xmm31    # Read 8 bytes from state lane 2
+    vmovdqu8    %xmm31, ($arg3){%k1} # Extract 1-7 bytes into output 2
+    vmovq       3*8(%r10), %xmm31    # Read 8 bytes from state lane 3
+    vmovdqu8    %xmm31, ($arg4){%k1} # Extract 1-7 bytes into output 3
+
+    # Increment output pointers
+    add     %r12, $arg1
+    add     %r12, $arg2
+    add     %r12, $arg3
+    add     %r12, $arg4
+    ret
+.cfi_endproc
+.size   keccak_1600_extract_bytes_x4,.-keccak_1600_extract_bytes_x4
+
+
+# SHAKE128 x4 multi-buffer functions
+# These functions process 4 independent SHAKE128 streams in parallel using AVX-512VL
+# State layout: 25 ymm registers (200 bytes each) + 1 qword = 808 bytes per context
+# Rate: 168 bytes for SHAKE128
+
+# SHA3_shake128_x4_avx512vl
+# One-shot SHAKE-128 x4 function: init + absorb + finalize + squeeze
+# Arguments:
+#   arg1 (rdi): pointer to output lane 0
+#   arg2 (rsi): pointer to output lane 1
+#   arg3 (rdx): pointer to output lane 2
+#   arg4 (rcx): pointer to output lane 3
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to input lane 0
+#   [stack+0]:  pointer to input lane 1
+#   [stack+8]:  pointer to input lane 2
+#   [stack+16]: pointer to input lane 3
+#   [stack+24]: input length in bytes (must be same for all lanes)
+# Returns: void
+.globl  SHA3_shake128_x4_avx512vl
+.type   SHA3_shake128_x4_avx512vl,\@function,10
+.align  32
+SHA3_shake128_x4_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    mov     %rsp, %rbp
+    push    %rbx
+.cfi_push       %rbx
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+    sub     \$$sf_size, %rsp
+    mov     %rsp, %rbx
+
+.Lshake128_x4_body:
+    mov     $arg1, $sf_arg1(%rbx)
+    mov     $arg2, $sf_arg2(%rbx)
+    mov     $arg3, $sf_arg3(%rbx)
+    mov     $arg4, $sf_arg4(%rbx)
+    mov     $arg5, $sf_arg5(%rbx)
+
+    lea     $sf_state_x4(%rbx), $arg1 # start of x4 state on the stack frame
+    mov     $arg1, $sf_state_ptr(%rbx)
+
+    # Initialize the state array to zero
+    call    keccak_1600_init_state
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    mov     $arg6, $arg2
+___
+$code .= <<___ if ($win64);
+    # xlate prologue handles up to six arguments. For one-shot x4 wrappers
+    # (10 args), the remaining four stay in Win64 stack slots.
+    mov     64(%rbp), $arg3 # arg7 from stack
+    mov     72(%rbp), $arg4 # arg8 from stack
+    mov     80(%rbp), $arg5 # arg9 from stack
+    mov     88(%rbp), $arg6 # arg10 from stack
+___
+$code .= <<___ if (!$win64);
+    mov     16(%rbp), $arg3 # arg7 from stack
+    mov     24(%rbp), $arg4 # arg8 from stack
+    mov     32(%rbp), $arg5 # arg9 from stack
+    mov     40(%rbp), $arg6 # arg10 from stack
+___
+$code.=<<___;
+    # Internal entry avoids Win64 xlate prologue argument remapping.
+___
+$code .= call_internal("SHA3_shake128_x4_inc_absorb_avx512vl_internal");
+$code.=<<___;
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    call    .L_SHA3_shake128_x4_inc_finalize_avx512vl
+
+    # squeeze
+    mov     $sf_arg1(%rbx), $arg1
+    mov     $sf_arg2(%rbx), $arg2
+    mov     $sf_arg3(%rbx), $arg3
+    mov     $sf_arg4(%rbx), $arg4
+    mov     $sf_arg5(%rbx), $arg5
+    mov     $sf_state_ptr(%rbx), $arg6
+___
+$code .= call_internal("SHA3_shake128_x4_inc_squeeze_avx512vl_internal");
+$code.=<<___;
+
+    # Clear the temporary buffer
+    lea     $sf_state_x4(%rbx), %r9
+    vpxorq      %ymm31, %ymm31, %ymm31
+    vmovdqu64   %ymm31, 32*0(%r9)
+    vmovdqu64   %ymm31, 32*1(%r9)
+    vmovdqu64   %ymm31, 32*2(%r9)
+    vmovdqu64   %ymm31, 32*3(%r9)
+    vmovdqu64   %ymm31, 32*4(%r9)
+    vmovdqu64   %ymm31, 32*5(%r9)
+    vmovdqu64   %ymm31, 32*6(%r9)
+    vmovdqu64   %ymm31, 32*7(%r9)
+    vmovdqu64   %ymm31, 32*8(%r9)
+    vmovdqu64   %ymm31, 32*9(%r9)
+    vmovdqu64   %ymm31, 32*10(%r9)
+    vmovdqu64   %ymm31, 32*11(%r9)
+    vmovdqu64   %ymm31, 32*12(%r9)
+    vmovdqu64   %ymm31, 32*13(%r9)
+    vmovdqu64   %ymm31, 32*14(%r9)
+    vmovdqu64   %ymm31, 32*15(%r9)
+    vmovdqu64   %ymm31, 32*16(%r9)
+    vmovdqu64   %ymm31, 32*17(%r9)
+    vmovdqu64   %ymm31, 32*18(%r9)
+    vmovdqu64   %ymm31, 32*19(%r9)
+    vmovdqu64   %ymm31, 32*20(%r9)
+    vmovdqu64   %ymm31, 32*21(%r9)
+    vmovdqu64   %ymm31, 32*22(%r9)
+    vmovdqu64   %ymm31, 32*23(%r9)
+    vmovdqu64   %ymm31, 32*24(%r9)
+    vmovq       %xmm31, 32*25(%r9)
+
+.Lshake128_x4_epilogue:
+___
+$code .= <<___ if ($win64);
+    vmovups $sf_size+0(%rsp),   %xmm6
+    vmovups $sf_size+16(%rsp),  %xmm7
+    vmovups $sf_size+32(%rsp),  %xmm8
+    vmovups $sf_size+48(%rsp),  %xmm9
+    vmovups $sf_size+64(%rsp),  %xmm10
+    vmovups $sf_size+80(%rsp),  %xmm11
+    vmovups $sf_size+96(%rsp),  %xmm12
+    vmovups $sf_size+112(%rsp), %xmm13
+    vmovups $sf_size+128(%rsp), %xmm14
+    vmovups $sf_size+144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+    add     \$$sf_size, %rsp
+    pop     %rbx
+.cfi_pop        %rbx
+    pop     %rbp
+.cfi_pop        %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_avx512vl,.-SHA3_shake128_x4_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for absorb entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+.type   SHA3_shake128_x4_inc_absorb_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake128_x4_inc_absorb_avx512vl
+.LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+.size   SHA3_shake128_x4_inc_absorb_avx512vl_internal,.-SHA3_shake128_x4_inc_absorb_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake128_x4_inc_absorb_avx512vl
+# Absorb input data into 4 parallel SHAKE128 states
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+#   arg2 (rsi): pointer to lane 0 input data
+#   arg3 (rdx): pointer to lane 1 input data
+#   arg4 (rcx): pointer to lane 2 input data
+#   arg5 (r8):  pointer to lane 3 input data
+#   arg6 (r9):  input length in bytes (must be same for all lanes)
+# Returns: void
+# Note: Input is XORed into state and Keccak permutation is applied for each rate-sized block
+.globl  SHA3_shake128_x4_inc_absorb_avx512vl
+.type   SHA3_shake128_x4_inc_absorb_avx512vl,\@function,6
+.align  32
+SHA3_shake128_x4_inc_absorb_avx512vl:
+.L_SHA3_shake128_x4_inc_absorb_avx512vl:
+.cfi_startproc
+        push    %rbp
+.cfi_push       %rbp
+        push    %rbx
+.cfi_push       %rbx
+        push    %r12
+.cfi_push       %r12
+        push    %r13
+.cfi_push       %r13
+        push    %r14
+.cfi_push       %r14
+        push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake128_absorb_body:
+    # check for partially processed block
+    mov     8*100($arg1), %r14
+    or      %r14, %r14 # s[100] == 0?
+    je      .Lshake128_absorb_main_loop_start
+
+    # process remaining bytes if message long enough
+    mov     \$168, %r12 # SHAKE128_RATE = 168
+    sub     %r14, %r12  # %r12 = capacity
+
+    cmp     %r12, $arg6 # if mlen <= capacity then no permute
+    jbe     .Lshake128_absorb_skip_permute
+
+    sub     %r12, $arg6
+    mov     $arg6, %r11 # preserve remaining length across helper calls
+
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10                # %r10 = state
+    call    keccak_1600_partial_add_x4 # arg2-arg5 are updated
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake128_absorb_partial_block_done
+
+.Lshake128_absorb_skip_permute:
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10
+    mov     $arg6, %r12
+    mov     $arg6, %r11 # preserve input length across helper call
+    call    keccak_1600_partial_add_x4
+
+    lea     (%r11,%r14), %r15
+    mov     %r15, 8*100($arg1) # s[100] += inlen
+
+    cmp     \$168, %r15 # check s[100] below SHAKE128_RATE
+    jb      .Lshake128_absorb_exit
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake128_absorb_exit
+
+.Lshake128_absorb_main_loop_start:
+    call    keccak_1600_load_state_x4
+    mov     $arg6, %r11 # full input length when no prior partial block
+
+.Lshake128_absorb_partial_block_done:
+    xor     %r12, %r12  # zero message offset
+
+    # Process the input message in blocks
+.align  32
+.Lshake128_absorb_while_loop:
+    cmp     \$168, %r11 # compare mlen to SHAKE128_RATE
+    jb      .Lshake128_absorb_while_loop_done
+
+    # Inline absorb_bytes_x4 for SHAKE128_RATE (168 bytes = 21 ymm registers)
+___
+
+# Generate absorb code for SHAKE128 rate (168 bytes)
+for (my $i = 0; $i < 21; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vmovq       $offset($arg2,%r12), %xmm31
+        vpinsrq     \$1, $offset($arg3,%r12), %xmm31, %xmm31
+        vmovq       $offset($arg4,%r12), %xmm30
+        vpinsrq     \$1, $offset($arg5,%r12), %xmm30, %xmm30
+        vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+        vpxorq      %ymm31, %ymm$i, %ymm$i
+___
+}
+
+$code.=<<___;
+    sub     \$168, %r11         # Subtract the rate from the remaining length
+    add     \$168, %r12         # Adjust offset to next block
+    call    keccak_1600_permute # Perform the Keccak permutation
+
+    jmp     .Lshake128_absorb_while_loop
+
+.align  32
+.Lshake128_absorb_while_loop_done:
+    call    keccak_1600_save_state_x4
+
+    mov     %r11, 8*100($arg1) # update s[100]
+    or      %r11, %r11
+    jz      .Lshake128_absorb_exit
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+
+    # r10/state, arg2-arg5/input, r12/length
+    mov     $arg1, %r10
+    add     %r12, $arg2
+    add     %r12, $arg3
+    add     %r12, $arg4
+    add     %r12, $arg5
+    mov     %r11, %r12
+    call    keccak_1600_partial_add_x4
+
+    mov     %r11, 8*100($arg1) # update s[100]
+
+.Lshake128_absorb_exit:
+    # Clear sensitive registers
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake128_absorb_epilogue:
+    vzeroall
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop     %r15
+.cfi_pop        %r15
+    pop     %r14
+.cfi_pop        %r14
+    pop     %r13
+.cfi_pop        %r13
+    pop     %r12
+.cfi_pop        %r12
+    pop     %rbx
+.cfi_pop        %rbx
+    pop     %rbp
+.cfi_pop        %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_inc_absorb_avx512vl,.-SHA3_shake128_x4_inc_absorb_avx512vl
+
+
+# SHA3_shake128_x4_inc_finalize_avx512vl
+# Finalize absorption phase for 4 parallel SHAKE-128 states
+# Adds padding and terminator bytes and clears the absorb offset
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+# Returns: void
+# Note: After this call, state is ready for squeezing output
+.globl  SHA3_shake128_x4_inc_finalize_avx512vl
+.type   SHA3_shake128_x4_inc_finalize_avx512vl,\@function,1
+.align  32
+SHA3_shake128_x4_inc_finalize_avx512vl:
+.L_SHA3_shake128_x4_inc_finalize_avx512vl:
+.cfi_startproc
+    mov         8*100($arg1), %r11 # load state offset from s[100]
+    mov         %r11, %r10
+    and         \$~7, %r10d        # offset to the state register
+    and         \$7, %r11d         # offset within the register
+
+    # add EOM byte right after the message
+    vmovdqu32   ($arg1,%r10,4), %ymm31
+    lea         shake_msg_pad_x4(%rip), %r9
+    sub         %r11, %r9
+    vmovdqu32   (%r9), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, ($arg1,%r10,4)
+
+    # add terminating byte at offset equal to rate - 1 (SHAKE128_RATE = 168)
+    vmovdqu32   640($arg1), %ymm31 # 168*4 - 32 = 672 - 32 = 640
+    vmovdqa32   shake_terminator_byte_x4(%rip), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, 640($arg1)
+
+    movq        \$0, 8*100($arg1) # clear s[100]
+    vpxorq      %ymm31, %ymm31, %ymm31
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_inc_finalize_avx512vl,.-SHA3_shake128_x4_inc_finalize_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for squeeze entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+.type   SHA3_shake128_x4_inc_squeeze_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake128_x4_inc_squeeze_avx512vl
+.LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+.size   SHA3_shake128_x4_inc_squeeze_avx512vl_internal,.-SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake128_x4_inc_squeeze_avx512vl
+# Squeeze output from 4 parallel SHAKE128 states
+# Arguments:
+#   arg1 (rdi): pointer to lane 0 output buffer
+#   arg2 (rsi): pointer to lane 1 output buffer
+#   arg3 (rdx): pointer to lane 2 output buffer
+#   arg4 (rcx): pointer to lane 3 output buffer
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to state context (808 bytes)
+# Returns: void
+# Note: Can be called multiple times to generate arbitrary-length output
+.globl  SHA3_shake128_x4_inc_squeeze_avx512vl
+.type   SHA3_shake128_x4_inc_squeeze_avx512vl,\@function,6
+.align  32
+SHA3_shake128_x4_inc_squeeze_avx512vl:
+.L_SHA3_shake128_x4_inc_squeeze_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    push    %rbx
+.cfi_push       %rbx
+    push    %r12
+.cfi_push       %r12
+    push    %r13
+.cfi_push       %r13
+    push    %r14
+.cfi_push       %r14
+    push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake128_squeeze_body:
+    or      $arg5, $arg5
+    jz      .Lshake128_squeeze_done
+
+    # check for partially processed block
+    mov     8*100($arg6), %r15 # s[100] - capacity
+    or      %r15, %r15
+    jnz     .Lshake128_squeeze_no_init_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r14, $arg1
+
+    xor     %rbp, %rbp
+    jmp     .Lshake128_squeeze_loop
+
+.align  32
+.Lshake128_squeeze_no_init_permute:
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length = min(capacity, outlen), r11 - offset
+    mov     $arg6, %r10
+    mov     $arg6, %r14 # preserve state pointer across extract helper
+
+    mov     %r15, %r12
+    cmp     %r15, $arg5
+    cmovnae $arg5, %r12 # %r12 = min(capacity, outlen)
+
+    sub     %r12, $arg5 # outlen -= length
+
+    mov     \$168, %r11d # SHAKE128_RATE
+    sub     %r15, %r11   # state offset
+
+    sub     %r12, %r15         # capacity -= length
+    mov     %r15, 8*100($arg6) # update s[100]
+
+    call    keccak_1600_extract_bytes_x4
+    mov     %r14, $arg6        # restore state pointer after helper clobbers
+
+    or      %r15, %r15
+    jnz     .Lshake128_squeeze_done # check s[100] not zero
+
+    mov     $arg1, %r13 # preserve arg1
+    mov     %r14, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r13, $arg1
+    xor     %rbp, %rbp
+
+.align  32
+.Lshake128_squeeze_loop:
+    cmp     \$168, $arg5 # outlen > SHAKE128_RATE
+    jb      .Lshake128_squeeze_final_extract
+
+    call    keccak_1600_permute
+
+    # Extract SHAKE128 rate bytes (168 bytes = 21 x 8 bytes) inline
+___
+
+# Generate extract code for SHAKE128 rate (168 bytes = 21 ymm registers)
+for (my $i = 0; $i < 21; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vextracti64x2 \$1, %ymm$i, %xmm31
+        vmovq       %xmm$i, $offset($arg1,%rbp)
+        vpextrq     \$1, %xmm$i, $offset($arg2,%rbp)
+        vmovq       %xmm31, $offset($arg3,%rbp)
+        vpextrq     \$1, %xmm31, $offset($arg4,%rbp)
+___
+}
+
+$code.=<<___;
+    add     \$168, %rbp  # dst offset += SHAKE128_RATE
+    sub     \$168, $arg5 # outlen -= SHAKE128_RATE
+    jmp     .Lshake128_squeeze_loop
+
+.align  32
+.Lshake128_squeeze_final_extract:
+    or      $arg5, $arg5
+    jz      .Lshake128_squeeze_no_end_permute
+
+    # update output pointers
+    add     %rbp, $arg1
+    add     %rbp, $arg2
+    add     %rbp, $arg3
+    add     %rbp, $arg4
+
+    mov     \$168, %r15d       # SHAKE128_RATE
+    sub     $arg5, %r15
+    mov     %r15, 8*100($arg6) # s[100] = capacity
+
+    call    keccak_1600_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+    mov     %r14, $arg1
+
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length, r11 - offset = 0
+    mov     $arg6, %r10
+    mov     $arg5, %r12
+    xor     %r11, %r11
+    call    keccak_1600_extract_bytes_x4
+
+    jmp     .Lshake128_squeeze_done
+
+.Lshake128_squeeze_no_end_permute:
+    movq    \$0, 8*100($arg6) # s[100] = 0
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+.Lshake128_squeeze_done:
+    # Clear sensitive registers
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake128_squeeze_epilogue:
+    vzeroall
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop    %r15
+    pop %r14
+.cfi_pop    %r14
+    pop %r13
+.cfi_pop    %r13
+    pop %r12
+.cfi_pop    %r12
+    pop %rbx
+.cfi_pop    %rbx
+    pop %rbp
+.cfi_pop    %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_inc_squeeze_avx512vl,.-SHA3_shake128_x4_inc_squeeze_avx512vl
+
+
+# SHAKE256 x4 multi-buffer functions
+# These functions process 4 independent SHAKE256 streams in parallel using AVX-512VL
+# State layout: 25 ymm registers (200 bytes each) + 1 qword = 808 bytes per context
+# Rate: 136 bytes for SHAKE256
+
+# SHA3_shake256_x4_avx512vl
+# One-shot SHAKE-256 x4 function: init + absorb + finalize + squeeze
+# Arguments:
+#   arg1 (rdi): pointer to output lane 0
+#   arg2 (rsi): pointer to output lane 1
+#   arg3 (rdx): pointer to output lane 2
+#   arg4 (rcx): pointer to output lane 3
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to input lane 0
+#   [stack+0]:  pointer to input lane 1
+#   [stack+8]:  pointer to input lane 2
+#   [stack+16]: pointer to input lane 3
+#   [stack+24]: input length in bytes (must be same for all lanes)
+# Returns: void
+.globl  SHA3_shake256_x4_avx512vl
+.type   SHA3_shake256_x4_avx512vl,\@function,10
+.align  32
+SHA3_shake256_x4_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    mov     %rsp, %rbp
+    push    %rbx
+.cfi_push       %rbx
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+    sub     \$$sf_size, %rsp
+    mov     %rsp, %rbx
+
+.Lshake256_x4_body:
+    mov     $arg1, $sf_arg1(%rbx)
+    mov     $arg2, $sf_arg2(%rbx)
+    mov     $arg3, $sf_arg3(%rbx)
+    mov     $arg4, $sf_arg4(%rbx)
+    mov     $arg5, $sf_arg5(%rbx)
+
+    lea     $sf_state_x4(%rbx), $arg1 # start of x4 state on the stack frame
+    mov     $arg1, $sf_state_ptr(%rbx)
+
+    # Initialize the state array to zero
+    call    keccak_1600_init_state
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    mov     $arg6, $arg2
+___
+$code .= <<___ if ($win64);
+    # xlate prologue handles up to six arguments. For one-shot x4 wrappers
+    # (10 args), the remaining four stay in Win64 stack slots.
+    mov     64(%rbp), $arg3 # arg7 from stack
+    mov     72(%rbp), $arg4 # arg8 from stack
+    mov     80(%rbp), $arg5 # arg9 from stack
+    mov     88(%rbp), $arg6 # arg10 from stack
+___
+$code .= <<___ if (!$win64);
+    mov     16(%rbp), $arg3 # arg7 from stack
+    mov     24(%rbp), $arg4 # arg8 from stack
+    mov     32(%rbp), $arg5 # arg9 from stack
+    mov     40(%rbp), $arg6 # arg10 from stack
+___
+$code.=<<___;
+    # Internal entry avoids Win64 xlate prologue argument remapping.
+___
+$code .= call_internal("SHA3_shake256_x4_inc_absorb_avx512vl_internal");
+$code.=<<___;
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    call    .L_SHA3_shake256_x4_inc_finalize_avx512vl
+
+    # squeeze
+    mov     $sf_arg1(%rbx), $arg1
+    mov     $sf_arg2(%rbx), $arg2
+    mov     $sf_arg3(%rbx), $arg3
+    mov     $sf_arg4(%rbx), $arg4
+    mov     $sf_arg5(%rbx), $arg5
+    mov     $sf_state_ptr(%rbx), $arg6
+___
+$code .= call_internal("SHA3_shake256_x4_inc_squeeze_avx512vl_internal");
+$code.=<<___;
+
+    # Clear the temporary buffer
+    lea     $sf_state_x4(%rbx), %r9
+    vpxorq      %ymm31, %ymm31, %ymm31
+    vmovdqu64   %ymm31, 32*0(%r9)
+    vmovdqu64   %ymm31, 32*1(%r9)
+    vmovdqu64   %ymm31, 32*2(%r9)
+    vmovdqu64   %ymm31, 32*3(%r9)
+    vmovdqu64   %ymm31, 32*4(%r9)
+    vmovdqu64   %ymm31, 32*5(%r9)
+    vmovdqu64   %ymm31, 32*6(%r9)
+    vmovdqu64   %ymm31, 32*7(%r9)
+    vmovdqu64   %ymm31, 32*8(%r9)
+    vmovdqu64   %ymm31, 32*9(%r9)
+    vmovdqu64   %ymm31, 32*10(%r9)
+    vmovdqu64   %ymm31, 32*11(%r9)
+    vmovdqu64   %ymm31, 32*12(%r9)
+    vmovdqu64   %ymm31, 32*13(%r9)
+    vmovdqu64   %ymm31, 32*14(%r9)
+    vmovdqu64   %ymm31, 32*15(%r9)
+    vmovdqu64   %ymm31, 32*16(%r9)
+    vmovdqu64   %ymm31, 32*17(%r9)
+    vmovdqu64   %ymm31, 32*18(%r9)
+    vmovdqu64   %ymm31, 32*19(%r9)
+    vmovdqu64   %ymm31, 32*20(%r9)
+    vmovdqu64   %ymm31, 32*21(%r9)
+    vmovdqu64   %ymm31, 32*22(%r9)
+    vmovdqu64   %ymm31, 32*23(%r9)
+    vmovdqu64   %ymm31, 32*24(%r9)
+    vmovq       %xmm31, 32*25(%r9)
+
+.Lshake256_x4_epilogue:
+___
+$code .= <<___ if ($win64);
+    vmovups $sf_size+0(%rsp),   %xmm6
+    vmovups $sf_size+16(%rsp),  %xmm7
+    vmovups $sf_size+32(%rsp),  %xmm8
+    vmovups $sf_size+48(%rsp),  %xmm9
+    vmovups $sf_size+64(%rsp),  %xmm10
+    vmovups $sf_size+80(%rsp),  %xmm11
+    vmovups $sf_size+96(%rsp),  %xmm12
+    vmovups $sf_size+112(%rsp), %xmm13
+    vmovups $sf_size+128(%rsp), %xmm14
+    vmovups $sf_size+144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+    add     \$$sf_size, %rsp
+    pop     %rbx
+.cfi_pop        %rbx
+    pop     %rbp
+.cfi_pop        %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_avx512vl,.-SHA3_shake256_x4_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for absorb entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+.type   SHA3_shake256_x4_inc_absorb_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake256_x4_inc_absorb_avx512vl
+.LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+.size   SHA3_shake256_x4_inc_absorb_avx512vl_internal,.-SHA3_shake256_x4_inc_absorb_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake256_x4_inc_absorb_avx512vl
+# Absorb input data into 4 parallel SHAKE256 states
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+#   arg2 (rsi): pointer to lane 0 input data
+#   arg3 (rdx): pointer to lane 1 input data
+#   arg4 (rcx): pointer to lane 2 input data
+#   arg5 (r8):  pointer to lane 3 input data
+#   arg6 (r9):  input length in bytes (must be same for all lanes)
+# Returns: void
+# Note: Input is XORed into state and Keccak permutation is applied for each rate-sized block
+.globl  SHA3_shake256_x4_inc_absorb_avx512vl
+.type   SHA3_shake256_x4_inc_absorb_avx512vl,\@function,6
+.align  32
+SHA3_shake256_x4_inc_absorb_avx512vl:
+.L_SHA3_shake256_x4_inc_absorb_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    push    %rbx
+.cfi_push       %rbx
+    push    %r12
+.cfi_push       %r12
+    push    %r13
+.cfi_push       %r13
+    push    %r14
+.cfi_push       %r14
+    push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake256_absorb_body:
+    # check for partially processed block
+    mov     8*100($arg1), %r14
+    or      %r14, %r14 # s[100] == 0?
+    je      .Lshake256_absorb_main_loop_start
+
+    # process remaining bytes if message long enough
+    mov     \$136, %r12 # SHAKE256_RATE = 136
+    sub     %r14, %r12  # %r12 = capacity
+
+    cmp     %r12, $arg6 # if mlen <= capacity then no permute
+    jbe     .Lshake256_absorb_skip_permute
+
+    sub     %r12, $arg6
+    mov     $arg6, %r11 # preserve remaining length across helper calls
+
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10                # %r10 = state
+    call    keccak_1600_partial_add_x4 # arg2-arg5 are updated
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake256_absorb_partial_block_done
+
+.Lshake256_absorb_skip_permute:
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10
+    mov     $arg6, %r12
+    mov     $arg6, %r11 # preserve input length across helper call
+    call    keccak_1600_partial_add_x4
+
+    lea     (%r11,%r14), %r15
+    mov     %r15, 8*100($arg1) # s[100] += inlen
+
+    cmp     \$136, %r15 # check s[100] below SHAKE256_RATE
+    jb      .Lshake256_absorb_exit
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake256_absorb_exit
+
+.Lshake256_absorb_main_loop_start:
+    call    keccak_1600_load_state_x4
+    mov     $arg6, %r11 # full input length when no prior partial block
+
+.Lshake256_absorb_partial_block_done:
+    xor     %r12, %r12  # zero message offset
+
+    # Process the input message in blocks
+.align  32
+.Lshake256_absorb_while_loop:
+    cmp     \$136, %r11 # compare mlen to SHAKE256_RATE
+    jb      .Lshake256_absorb_while_loop_done
+
+    # Inline absorb_bytes_x4 for SHAKE256_RATE (136 bytes = 17 ymm registers)
+___
+
+# Generate absorb code for SHAKE256 rate (136 bytes)
+for (my $i = 0; $i < 17; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vmovq       $offset($arg2,%r12), %xmm31
+        vpinsrq     \$1, $offset($arg3,%r12), %xmm31, %xmm31
+        vmovq       $offset($arg4,%r12), %xmm30
+        vpinsrq     \$1, $offset($arg5,%r12), %xmm30, %xmm30
+        vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+        vpxorq      %ymm31, %ymm$i, %ymm$i
+___
+}
+
+$code.=<<___;
+    sub     \$136, %r11         # Subtract the rate from the remaining length
+    add     \$136, %r12         # Adjust offset to next block
+    call    keccak_1600_permute # Perform the Keccak permutation
+
+    jmp     .Lshake256_absorb_while_loop
+
+.align  32
+.Lshake256_absorb_while_loop_done:
+    call    keccak_1600_save_state_x4
+
+    mov     %r11, 8*100($arg1) # update s[100]
+    or      %r11, %r11
+    jz      .Lshake256_absorb_exit
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+
+    # r10/state, arg2-arg5/input, r12/length
+    mov     $arg1, %r10
+    add     %r12, $arg2
+    add     %r12, $arg3
+    add     %r12, $arg4
+    add     %r12, $arg5
+    mov     %r11, %r12
+    call    keccak_1600_partial_add_x4
+
+    mov     %r11, 8*100($arg1) # update s[100]
+
+.Lshake256_absorb_exit:
+    # Clear sensitive registers
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake256_absorb_epilogue:
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop    %r15
+    pop %r14
+.cfi_pop    %r14
+    pop %r13
+.cfi_pop    %r13
+    pop %r12
+.cfi_pop    %r12
+    pop %rbx
+.cfi_pop    %rbx
+    pop %rbp
+.cfi_pop    %rbp
+    vzeroall
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_inc_absorb_avx512vl,.-SHA3_shake256_x4_inc_absorb_avx512vl
+
+
+# SHA3_shake256_x4_inc_finalize_avx512vl
+# Finalize absorption phase for 4 parallel SHAKE-256 states
+# Adds padding and terminator bytes and clears the absorb offset
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+# Returns: void
+# Note: After this call, state is ready for squeezing output
+.globl  SHA3_shake256_x4_inc_finalize_avx512vl
+.type   SHA3_shake256_x4_inc_finalize_avx512vl,\@function,1
+.align  32
+SHA3_shake256_x4_inc_finalize_avx512vl:
+.L_SHA3_shake256_x4_inc_finalize_avx512vl:
+.cfi_startproc
+    mov     8*100($arg1), %r11 # load state offset from s[100]
+    mov     %r11, %r10
+    and     \$~7, %r10d        # offset to the state register
+    and     \$7, %r11d         # offset within the register
+
+    # add EOM byte right after the message
+    vmovdqu32   ($arg1,%r10,4), %ymm31
+    lea         shake_msg_pad_x4(%rip), %r9
+    sub         %r11, %r9
+    vmovdqu32   (%r9), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, ($arg1,%r10,4)
+
+    # add terminating byte at offset equal to rate - 1 (SHAKE256_RATE = 136)
+    vmovdqu32   512($arg1), %ymm31 # 136*4 - 32 = 544 - 32 = 512
+    vmovdqa32   shake_terminator_byte_x4(%rip), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, 512($arg1)
+
+    movq        \$0, 8*100($arg1) # clear s[100]
+    vpxorq      %ymm31, %ymm31, %ymm31
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_inc_finalize_avx512vl,.-SHA3_shake256_x4_inc_finalize_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for squeeze entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+.type   SHA3_shake256_x4_inc_squeeze_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake256_x4_inc_squeeze_avx512vl
+.LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+.size   SHA3_shake256_x4_inc_squeeze_avx512vl_internal,.-SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake256_x4_inc_squeeze_avx512vl
+# Squeeze output from 4 parallel SHAKE256 states
+# Arguments:
+#   arg1 (rdi): pointer to lane 0 output buffer
+#   arg2 (rsi): pointer to lane 1 output buffer
+#   arg3 (rdx): pointer to lane 2 output buffer
+#   arg4 (rcx): pointer to lane 3 output buffer
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to state context (808 bytes)
+# Returns: void
+# Note: Can be called multiple times to generate arbitrary-length output
+.globl  SHA3_shake256_x4_inc_squeeze_avx512vl
+.type   SHA3_shake256_x4_inc_squeeze_avx512vl,\@function,6
+.align  32
+SHA3_shake256_x4_inc_squeeze_avx512vl:
+.L_SHA3_shake256_x4_inc_squeeze_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    push    %rbx
+.cfi_push       %rbx
+    push    %r12
+.cfi_push       %r12
+    push    %r13
+.cfi_push       %r13
+    push    %r14
+.cfi_push       %r14
+    push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake256_squeeze_body:
+    or      $arg5, $arg5
+    jz      .Lshake256_squeeze_done
+
+    # check for partially processed block
+    mov     8*100($arg6), %r15 # s[100] - capacity
+    or      %r15, %r15
+    jnz     .Lshake256_squeeze_no_init_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r14, $arg1
+
+    xor     %rbp, %rbp
+    jmp     .Lshake256_squeeze_loop
+
+.align  32
+.Lshake256_squeeze_no_init_permute:
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length = min(capacity, outlen), r11 - offset
+    mov     $arg6, %r10
+    mov     $arg6, %r14 # preserve state pointer across extract helper
+
+    mov     %r15, %r12
+    cmp     %r15, $arg5
+    cmovnae $arg5, %r12 # %r12 = min(capacity, outlen)
+
+    sub     %r12, $arg5 # outlen -= length
+
+    mov     \$136, %r11d # SHAKE256_RATE
+    sub     %r15, %r11   # state offset
+
+    sub     %r12, %r15         # capacity -= length
+    mov     %r15, 8*100($arg6) # update s[100]
+
+    call    keccak_1600_extract_bytes_x4
+    mov     %r14, $arg6        # restore state pointer after helper clobbers
+
+    or      %r15, %r15
+    jnz     .Lshake256_squeeze_done # check s[100] not zero
+
+    mov     $arg1, %r13 # preserve arg1
+    mov     %r14, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r13, $arg1
+    xor     %rbp, %rbp
+
+.align  32
+.Lshake256_squeeze_loop:
+    cmp     \$136, $arg5 # outlen > SHAKE256_RATE
+    jb      .Lshake256_squeeze_final_extract
+
+    call    keccak_1600_permute
+
+    # Extract SHAKE256 rate bytes (136 bytes = 17 x 8 bytes) inline
+___
+
+# Generate extract code for SHAKE256 rate (136 bytes = 17 ymm registers)
+for (my $i = 0; $i < 17; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vextracti64x2 \$1, %ymm$i, %xmm31
+        vmovq       %xmm$i, $offset($arg1,%rbp)
+        vpextrq     \$1, %xmm$i, $offset($arg2,%rbp)
+        vmovq       %xmm31, $offset($arg3,%rbp)
+        vpextrq     \$1, %xmm31, $offset($arg4,%rbp)
+___
+}
+
+$code.=<<___;
+    add     \$136, %rbp  # dst offset += SHAKE256_RATE
+    sub     \$136, $arg5 # outlen -= SHAKE256_RATE
+    jmp     .Lshake256_squeeze_loop
+
+.align  32
+.Lshake256_squeeze_final_extract:
+    or      $arg5, $arg5
+    jz      .Lshake256_squeeze_no_end_permute
+
+    # update output pointers
+    add     %rbp, $arg1
+    add     %rbp, $arg2
+    add     %rbp, $arg3
+    add     %rbp, $arg4
+
+    mov     \$136, %r15d       # SHAKE256_RATE
+    sub     $arg5, %r15
+    mov     %r15, 8*100($arg6) # s[100] = capacity
+
+    call    keccak_1600_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+    mov     %r14, $arg1
+
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length, r11 - offset = 0
+    mov     $arg6, %r10
+    mov     $arg5, %r12
+    xor     %r11, %r11
+    call    keccak_1600_extract_bytes_x4
+
+    jmp     .Lshake256_squeeze_done
+
+.Lshake256_squeeze_no_end_permute:
+    movq    \$0, 8*100($arg6) # s[100] = 0
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+.Lshake256_squeeze_done:
+    # Clear sensitive registers
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake256_squeeze_epilogue:
+    vzeroall
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop    %r15
+    pop %r14
+.cfi_pop    %r14
+    pop %r13
+.cfi_pop    %r13
+    pop %r12
+.cfi_pop    %r12
+    pop %rbx
+.cfi_pop    %rbx
+    pop %rbp
+.cfi_pop    %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_inc_squeeze_avx512vl,.-SHA3_shake256_x4_inc_squeeze_avx512vl
+___
+
+if ($win64) {
+my $context = "%r8";
+my $disp    = "%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type   keccak_se_handler,\@abi-omnipotent
+.align  16
+keccak_se_handler:
+    push    %rsi
+    push    %rdi
+    push    %rbx
+    push    %rbp
+    push    %r12
+    push    %r13
+    push    %r14
+    push    %r15
+    pushfq
+    sub     \$64, %rsp
+
+    mov     120($context), %rax # context->Rax = original %rsp from xlate prologue
+    mov     248($context), %rbx # context->Rip
+
+    mov     8($disp), %rsi  # disp->ImageBase
+    mov     56($disp), %r11 # disp->HandlerData
+
+    mov     0(%r11), %r10d # HandlerData[0]: body label (rva)
+    lea     (%rsi,%r10), %r10
+    cmp     %r10, %rbx     # Rip < body?
+    jb      .Lkeccak_in_prologue
+
+    mov     4(%r11), %r10d # HandlerData[1]: epilogue label (rva)
+    lea     (%rsi,%r10), %r10
+    cmp     %r10, %rbx     # Rip >= epilogue?
+    jae     .Lkeccak_in_epilogue
+
+    # In function body:
+    # HandlerData[2]: delta from context->Rsp(body) to original %rsp
+    # HandlerData[3]: offset of XMM6 save area from context->Rsp(body), -1 if none
+    # HandlerData[4]: number of saved non-volatiles in stack frame layout (2 or 6)
+    # HandlerData[5]: delta from context->Rsp(epilogue) to original %rsp
+    mov     152($context), %rdx # body rsp
+    mov     8(%r11), %r10d
+    lea     (%rdx,%r10), %rax   # original rsp
+    jmp     .Lkeccak_restore_body_or_epilogue
+
+.Lkeccak_in_epilogue:
+    mov     152($context), %rdx # epilogue rsp
+    mov     20(%r11), %r10d
+    lea     (%rdx,%r10), %rax   # original rsp
+
+.Lkeccak_restore_body_or_epilogue:
+    mov     8(%rax), %rcx       # xlate shadow save of original rdi
+    mov     16(%rax), %rsi      # xlate shadow save of original rsi
+    mov     %rax, 152($context) # context->Rsp = original rsp
+    mov     %rsi, 168($context) # context->Rsi
+    mov     %rcx, 176($context) # context->Rdi
+
+    mov     16(%r11), %r10d # gpr save count
+    cmp     \$6, %r10d
+    jne     .Lkeccak_restore_two
+
+    mov     -24(%rax), %r12
+    mov     -32(%rax), %r13
+    mov     -40(%rax), %r14
+    mov     -48(%rax), %r15
+    mov     %r12, 216($context) # context->R12
+    mov     %r13, 224($context) # context->R13
+    mov     %r14, 232($context) # context->R14
+    mov     %r15, 240($context) # context->R15
+
+.Lkeccak_restore_two:
+    mov     -8(%rax), %rbp
+    mov     -16(%rax), %rbx
+    mov     %rbp, 160($context) # context->Rbp
+    mov     %rbx, 144($context) # context->Rbx
+
+    mov     12(%r11), %r10d # xmm save offset from body rsp
+    cmp     \$-1, %r10d
+    je      .Lkeccak_in_prologue
+
+    lea     (%rdx,%r10), %rsi   # source = xmm save area
+    lea     512($context), %rdi # &context->Xmm6
+    mov     \$20, %ecx          # 10 XMM * 2 qwords
+    .long   0xa548f3fc          # cld; rep movsq
+
+.Lkeccak_in_prologue:
+    mov     8(%rax), %rcx
+    mov     16(%rax), %rdx
+    mov     %rcx, 176($context) # context->Rdi
+    mov     %rdx, 168($context) # context->Rsi
+    mov     %rax, 152($context) # context->Rsp = original rsp
+
+    mov     40($disp), %rdi # disp->ContextRecord
+    mov     $context, %rsi
+    mov     \$154, %ecx     # sizeof(CONTEXT)/8
+    .long   0xa548f3fc      # cld; rep movsq
+
+    mov     $disp, %rsi
+    xor     %rcx, %rcx     # UNW_FLAG_NHANDLER
+    mov     8(%rsi), %rdx  # disp->ImageBase
+    mov     0(%rsi), %r8   # disp->ControlPc
+    mov     16(%rsi), %r9  # disp->FunctionEntry
+    mov     40(%rsi), %r10 # disp->ContextRecord
+    lea     56(%rsi), %r11 # &disp->HandlerData
+    lea     24(%rsi), %r12 # &disp->EstablisherFrame
+    mov     %r10, 32(%rsp)
+    mov     %r11, 40(%rsp)
+    mov     %r12, 48(%rsp)
+    mov     %rcx, 56(%rsp)
+    call    *__imp_RtlVirtualUnwind(%rip)
+
+    mov     \$1, %eax # ExceptionContinueSearch
+    add     \$64, %rsp
+    popfq
+    pop     %r15
+    pop     %r14
+    pop     %r13
+    pop     %r12
+    pop     %rbp
+    pop     %rbx
+    pop     %rdi
+    pop     %rsi
+    ret
+.size   keccak_se_handler,.-keccak_se_handler
+
+.section    .pdata
+.align  4
+    .rva    .LSEH_begin_SHA3_shake128_x4_avx512vl
+    .rva    .LSEH_end_SHA3_shake128_x4_avx512vl
+    .rva    .LSEH_info_SHA3_shake128_x4_avx512vl
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_begin_SHA3_shake256_x4_avx512vl
+    .rva    .LSEH_end_SHA3_shake256_x4_avx512vl
+    .rva    .LSEH_info_SHA3_shake256_x4_avx512vl
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl
+
+.section    .xdata
+.align  8
+.LSEH_info_SHA3_shake128_x4_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_x4_body,.Lshake128_x4_epilogue
+    .long   1032,856,2,1032
+.LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_absorb_body,.Lshake128_absorb_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_absorb_body,.Lshake128_absorb_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_squeeze_body,.Lshake128_squeeze_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_squeeze_body,.Lshake128_squeeze_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_x4_body,.Lshake256_x4_epilogue
+    .long   1032,856,2,1032
+.LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_absorb_body,.Lshake256_absorb_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_absorb_body,.Lshake256_absorb_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_squeeze_body,.Lshake256_squeeze_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_squeeze_body,.Lshake256_squeeze_epilogue
+    .long   208,0,6,208
+___
+}
+
+$code.=<<___;
+
+.section .rodata align=128
+.align  128
+.type   iotas,\@object
+iotas:
+    .quad   0x0000000000000001
+    .quad   0x0000000000008082
+    .quad   0x800000000000808a
+    .quad   0x8000000080008000
+    .quad   0x000000000000808b
+    .quad   0x0000000080000001
+    .quad   0x8000000080008081
+    .quad   0x8000000000008009
+    .quad   0x000000000000008a
+    .quad   0x0000000000000088
+    .quad   0x0000000080008009
+    .quad   0x000000008000000a
+    .quad   0x000000008000808b
+    .quad   0x800000000000008b
+    .quad   0x8000000000008089
+    .quad   0x8000000000008003
+    .quad   0x8000000000008002
+    .quad   0x8000000000000080
+    .quad   0x000000000000800a
+    .quad   0x800000008000000a
+    .quad   0x8000000080008081
+    .quad   0x8000000000008080
+    .quad   0x0000000080000001
+    .quad   0x8000000080008008
+.size   iotas,.-iotas
+
+.align  8
+byte_kmask_0_to_7:
+    .byte   0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f
+
+.align  32
+shake_terminator_byte_x4:
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+
+.align  8
+    .byte   0, 0, 0, 0, 0, 0, 0, 0
+shake_msg_pad_x4:
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+
+.asciz  "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+}}} else {{{
+
+# When AVX512VL is not available, output stub functions
+# The capable function returns 0, and the operation functions are not defined (will use C fallback)
+
+$code .= <<___;
+.text
+
+.globl  SHA3_avx512vl_capable
+.type   SHA3_avx512vl_capable,\@abi-omnipotent
+SHA3_avx512vl_capable:
+    xor     %eax, %eax
+    ret
+.size   SHA3_avx512vl_capable, .-SHA3_avx512vl_capable
+
+.globl  SHA3_shake128_x4_inc_absorb_avx512vl
+.globl  SHA3_shake256_x4_inc_absorb_avx512vl
+.globl  SHA3_shake128_x4_inc_finalize_avx512vl
+.globl  SHA3_shake256_x4_inc_finalize_avx512vl
+.globl  SHA3_shake128_x4_inc_squeeze_avx512vl
+.globl  SHA3_shake256_x4_inc_squeeze_avx512vl
+.globl  SHA3_shake128_x4_avx512vl
+.globl  SHA3_shake256_x4_avx512vl
+.type   SHA3_shake128_x4_inc_absorb_avx512vl,\@abi-omnipotent
+SHA3_shake128_x4_inc_absorb_avx512vl:
+SHA3_shake256_x4_inc_absorb_avx512vl:
+SHA3_shake128_x4_inc_finalize_avx512vl:
+SHA3_shake256_x4_inc_finalize_avx512vl:
+SHA3_shake128_x4_inc_squeeze_avx512vl:
+SHA3_shake256_x4_inc_squeeze_avx512vl:
+SHA3_shake128_x4_avx512vl:
+SHA3_shake256_x4_avx512vl:
+    .byte   0x0f,0x0b # ud2
+    ret
+.size   SHA3_shake128_x4_inc_absorb_avx512vl, .-SHA3_shake128_x4_inc_absorb_avx512vl
+___
+}}}
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sha/build.info b/crypto/sha/build.info
index 457ac8d06ab7b..fd192a66dda4c 100644
--- a/crypto/sha/build.info
+++ b/crypto/sha/build.info
@@ -65,7 +65,7 @@ ENDIF
 $KECCAK1600ASM=keccak1600.c
 IF[{- !$disabled{asm} -}]
   $KECCAK1600ASM_x86=
-  $KECCAK1600ASM_x86_64=keccak1600-x86_64.s
+  $KECCAK1600ASM_x86_64=keccak1600-x86_64.s keccak1600x4-avx512vl.s
 
   $KECCAK1600ASM_s390x=keccak1600-s390x.S
 
@@ -83,8 +83,8 @@ IF[{- !$disabled{asm} -}]
 ENDIF
 
 $COMMON=sha1dgst.c sha256.c sha512.c sha3.c sha3_encode.c $SHA1ASM $KECCAK1600ASM
-SOURCE[../../libcrypto]=$COMMON sha1_one.c
-SOURCE[../../providers/libfips.a]= $COMMON
+SOURCE[../../libcrypto]=$COMMON sha1_one.c sha3_x4.c
+SOURCE[../../providers/libfips.a]= $COMMON sha3_x4.c
 
 # Implementations are now spread across several libraries, so the defines
 # need to be applied to all affected libraries and modules.
@@ -198,4 +198,8 @@ GENERATE[keccak1600-avx512vl.S]=asm/keccak1600-avx512vl.pl
 GENERATE[keccak1600-mmx.S]=asm/keccak1600-mmx.pl
 GENERATE[keccak1600p8-ppc.S]=asm/keccak1600p8-ppc.pl
 
+# keccak1600x4-avx512vl.s supports multi-squeeze
+# Currently only used in ML-DSA on x86_64 with AVX-512VL support
+GENERATE[keccak1600x4-avx512vl.s]=asm/keccak1600x4-avx512vl.pl
+
 GENERATE[sha1-thumb.S]=asm/sha1-thumb.pl
diff --git a/crypto/sha/sha3_x4.c b/crypto/sha/sha3_x4.c
new file mode 100644
index 0000000000000..1d993c326c0a5
--- /dev/null
+++ b/crypto/sha/sha3_x4.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * SHAKE x4 multi-buffer implementation for AVX-512VL
+ *
+ * This file provides incremental API wrappers around the AVX-512VL
+ * assembly implementations for processing 4 SHAKE instances in parallel.
+ *
+ * Callers should check SHA3_avx512vl_capable() before calling.
+ */
+
+#include "internal/sha3.h"
+#include <string.h>
+
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+
+/* External assembly function declarations */
+extern void SHA3_shake128_x4_inc_absorb_avx512vl(
+    uint64_t *state,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+extern void SHA3_shake256_x4_inc_absorb_avx512vl(
+    uint64_t *state,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+extern void SHA3_shake128_x4_inc_finalize_avx512vl(uint64_t *state);
+extern void SHA3_shake256_x4_inc_finalize_avx512vl(uint64_t *state);
+
+extern void SHA3_shake128_x4_inc_squeeze_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    uint64_t *state);
+
+extern void SHA3_shake256_x4_inc_squeeze_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    uint64_t *state);
+
+/* One-shot assembly function declarations */
+extern void SHA3_shake128_x4_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+extern void SHA3_shake256_x4_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+/*
+ * SHAKE-128 x4 Implementation
+ */
+
+void ossl_sha3_shake128_x4_inc_init(KECCAK1600_X4_CTX *ctx)
+{
+    memset(ctx->A, 0, sizeof(ctx->A));
+    ctx->rate = 168; /* SHAKE-128 rate in bytes */
+    ctx->finalized = 0;
+}
+
+void ossl_sha3_shake128_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen)
+{
+    if (ctx->finalized) {
+        /* Error: cannot absorb after finalize */
+        return;
+    }
+
+    SHA3_shake128_x4_inc_absorb_avx512vl(
+        ctx->A, in0, in1, in2, in3, inlen);
+}
+
+void ossl_sha3_shake128_x4_inc_finalize(KECCAK1600_X4_CTX *ctx)
+{
+    if (ctx->finalized) {
+        return; /* Already finalized */
+    }
+
+    SHA3_shake128_x4_inc_finalize_avx512vl(ctx->A);
+    ctx->finalized = 1;
+}
+
+void ossl_sha3_shake128_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx)
+{
+    if (!ctx->finalized) {
+        /* Auto-finalize on first squeeze */
+        ossl_sha3_shake128_x4_inc_finalize(ctx);
+    }
+
+    SHA3_shake128_x4_inc_squeeze_avx512vl(
+        out0, out1, out2, out3, outlen, ctx->A);
+}
+
+/*
+ * SHAKE-256 x4 Implementation
+ */
+
+void ossl_sha3_shake256_x4_inc_init(KECCAK1600_X4_CTX *ctx)
+{
+    memset(ctx->A, 0, sizeof(ctx->A));
+    ctx->rate = 136; /* SHAKE-256 rate in bytes */
+    ctx->finalized = 0;
+}
+
+void ossl_sha3_shake256_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen)
+{
+    if (ctx->finalized) {
+        /* Error: cannot absorb after finalize */
+        return;
+    }
+
+    SHA3_shake256_x4_inc_absorb_avx512vl(
+        ctx->A, in0, in1, in2, in3, inlen);
+}
+
+void ossl_sha3_shake256_x4_inc_finalize(KECCAK1600_X4_CTX *ctx)
+{
+    if (ctx->finalized) {
+        return; /* Already finalized */
+    }
+
+    SHA3_shake256_x4_inc_finalize_avx512vl(ctx->A);
+    ctx->finalized = 1;
+}
+
+void ossl_sha3_shake256_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx)
+{
+    if (!ctx->finalized) {
+        /* Auto-finalize on first squeeze */
+        ossl_sha3_shake256_x4_inc_finalize(ctx);
+    }
+
+    SHA3_shake256_x4_inc_squeeze_avx512vl(
+        out0, out1, out2, out3, outlen, ctx->A);
+}
+
+/*
+ * Single-call wrapper APIs
+ */
+
+void ossl_sha3_shake128_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen)
+{
+    SHA3_shake128_x4_avx512vl(out0, out1, out2, out3, outlen,
+        in0, in1, in2, in3, inlen);
+}
+
+void ossl_sha3_shake256_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen)
+{
+    SHA3_shake256_x4_avx512vl(out0, out1, out2, out3, outlen,
+        in0, in1, in2, in3, inlen);
+}
+
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
diff --git a/include/internal/sha3.h b/include/internal/sha3.h
index f91d00a74f838..82a7ec158b1a7 100644
--- a/include/internal/sha3.h
+++ b/include/internal/sha3.h
@@ -65,4 +65,75 @@ int ossl_shake_squeeze_default(KECCAK1600_CTX *ctx, unsigned char *out, size_t o
 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
     size_t r);
 
+/* Multi-buffer (x4) Keccak-f[1600] context and API */
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+
+/* Runtime capability check for AVX512VL */
+int SHA3_avx512vl_capable(void);
+
+/* Context for 4-way parallel SHAKE operations */
+typedef struct {
+    /* 4 interleaved Keccak states (800 bytes)
+       plus 8 bytes to store the number of
+       already absorbed or not yet squeezed bytes */
+    uint64_t A[(25 * 4) + 1];
+    size_t rate; /* Rate in bytes: 168 (SHAKE-128) or 136 (SHAKE-256) */
+    unsigned finalized; /* Has finalize been called? 0=no, 1=yes */
+} KECCAK1600_X4_CTX;
+
+/* SHAKE-128 x4 incremental API */
+void ossl_sha3_shake128_x4_inc_init(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake128_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+void ossl_sha3_shake128_x4_inc_finalize(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake128_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx);
+
+/* SHAKE-256 x4 incremental API */
+void ossl_sha3_shake256_x4_inc_init(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake256_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+void ossl_sha3_shake256_x4_inc_finalize(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake256_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx);
+
+/* Single-call SHAKE x4 APIs (wrapper functions) */
+void ossl_sha3_shake128_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+void ossl_sha3_shake256_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
+
 #endif /* OSSL_INTERNAL_SHA3_H */

From e576caa48dc9ce2f280fdee4a6634f3e4dd0749f Mon Sep 17 00:00:00 2001
From: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Mon, 13 Apr 2026 16:44:04 +0000
Subject: [PATCH 2/5] test: add SHAKE x4 internal cross-validation tests

Add a new `sha3_x4_internal_test` target and recipe to validate the
internal SHAKE x4 implementation against scalar SHA3 reference paths.

Cover SHAKE-128 and SHAKE-256 in one-shot and incremental modes, plus
multi-absorb and multi-squeeze cases across varied input and output
sizes. Tests are skipped when AVX512VL extensions are not available.

Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
---
 test/build.info                         |   6 +
 test/recipes/03-test_sha3_x4_internal.t |  16 +
 test/sha3_x4_internal_test.c            | 432 ++++++++++++++++++++++++
 3 files changed, 454 insertions(+)
 create mode 100644 test/recipes/03-test_sha3_x4_internal.t
 create mode 100644 test/sha3_x4_internal_test.c

diff --git a/test/build.info b/test/build.info
index f599b3aff8c61..d6a36ba9ba035 100644
--- a/test/build.info
+++ b/test/build.info
@@ -915,6 +915,8 @@ IF[{- !$disabled{tests} -}]
       PROGRAMS{noinst}=cmactest
     ENDIF
 
+    PROGRAMS{noinst}=sha3_x4_internal_test
+
     SOURCE[poly1305_internal_test]=poly1305_internal_test.c
     INCLUDE[poly1305_internal_test]=.. ../include ../apps/include
     DEPEND[poly1305_internal_test]=../libcrypto.a libtestutil.a
@@ -923,6 +925,10 @@ IF[{- !$disabled{tests} -}]
     INCLUDE[chacha_internal_test]=.. ../include ../apps/include
     DEPEND[chacha_internal_test]=../libcrypto.a libtestutil.a
 
+    SOURCE[sha3_x4_internal_test]=sha3_x4_internal_test.c
+    INCLUDE[sha3_x4_internal_test]=.. ../include ../apps/include
+    DEPEND[sha3_x4_internal_test]=../libcrypto.a libtestutil.a
+
     SOURCE[asn1_internal_test]=asn1_internal_test.c
     INCLUDE[asn1_internal_test]=.. ../include ../apps/include
     DEPEND[asn1_internal_test]=../libcrypto.a libtestutil.a
diff --git a/test/recipes/03-test_sha3_x4_internal.t b/test/recipes/03-test_sha3_x4_internal.t
new file mode 100644
index 0000000000000..9e5793aaf3cd3
--- /dev/null
+++ b/test/recipes/03-test_sha3_x4_internal.t
@@ -0,0 +1,16 @@
+#! /usr/bin/env perl
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+use strict;
+use OpenSSL::Test;
+use OpenSSL::Test::Simple;
+
+setup("test_sha3_x4_internal");
+
+simple_test("test_sha3_x4_internal", "sha3_x4_internal_test");
diff --git a/test/sha3_x4_internal_test.c b/test/sha3_x4_internal_test.c
new file mode 100644
index 0000000000000..e387b6f51ae46
--- /dev/null
+++ b/test/sha3_x4_internal_test.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Internal cross-validation tests for the SHAKE x4 multi-buffer API.
+ *
+ * Each test computes SHAKE-128 or SHAKE-256 on four independent inputs
+ * using the x4 (AVX-512VL) path and compares every lane's output to the
+ * equivalent result produced by the scalar ossl_sha3_* API.
+ *
+ * Tests cover:
+ *   - Single-call (ossl_sha3_shake{128,256}_x4) for many (inlen, outlen) pairs
+ *   - Incremental init/absorb/squeeze for the same (inlen, outlen) pairs
+ *   - Multi-absorb: input split at every possible block boundary
+ *   - Multi-squeeze: output produced in two successive squeeze calls
+ */
+
+#include <string.h>
+#include "testutil.h"
+
+/*
+ * KECCAK1600_ASM is only added to the library compilation flags by the build
+ * system, not to test binaries.  Since the x4 declarations in internal/sha3.h
+ * are guarded by that macro, we define it here before the include so that the
+ * KECCAK1600_X4_CTX type and function prototypes are visible.  The symbols
+ * themselves live in libcrypto.a which is always compiled with the flag set.
+ * We additionally gate all x4 code on x86_64 (GCC/Clang: __x86_64__,
+ * MSVC: _M_AMD64/_M_X64) and !OPENSSL_NO_ASM so that the test still
+ * compiles on other platforms or in no-asm builds.
+ */
+#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+#ifndef KECCAK1600_ASM
+#define KECCAK1600_ASM
+#endif
+#endif
+#include "internal/sha3.h"
+
+/*
+ * A single deterministic 1024-byte message.  Each of the four lanes receives
+ * a different slice of this buffer, with lane base pointers spaced 64 bytes
+ * apart, so their inputs are distinct yet entirely self-contained.
+ */
+#define MSG_BUF_SIZE 1024
+#define LANE_STRIDE 64 /* byte offset between lane base pointers */
+#define NUM_LANES 4
+
+static unsigned char msg[MSG_BUF_SIZE];
+
+/* Maximum output length used in this file – must fit chunk1 + chunk2. */
+#define MAX_OUT 640
+
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+
+/*
+ * Input lengths exercising: empty, tiny, sub-block, block boundary ±1,
+ * multiple blocks and a longer message for SHAKE-128 (rate=168) and
+ * SHAKE-256 (rate=136).
+ */
+static const size_t input_sizes[] = {
+    0, 1, 17, 100, 135, 136, 137, 168, 169, 200, 400
+};
+#define NUM_INPUT_SIZES (sizeof(input_sizes) / sizeof(input_sizes[0]))
+
+/* Output lengths chosen to straddle rate boundaries for both variants. */
+static const size_t output_sizes[] = {
+    16, 32, 64, 136, 168, 256, 512
+};
+#define NUM_OUTPUT_SIZES (sizeof(output_sizes) / sizeof(output_sizes[0]))
+
+/* Helpers functions */
+
+/*
+ * Compute a scalar SHAKE-128 or SHAKE-256 digest.
+ * bitlen: 128 or 256.  Returns 1 on success, 0 on failure.
+ */
+static int scalar_shake(const unsigned int bitlen,
+    const unsigned char *in, const size_t inlen,
+    unsigned char *out, const size_t outlen)
+{
+    KECCAK1600_CTX ctx;
+
+    if (!ossl_sha3_init(&ctx, 0x1f, bitlen))
+        return 0;
+    /* ossl_sha3_init does not populate the method vtable; do it here. */
+    ctx.meth.absorb = ossl_sha3_absorb_default;
+    ctx.meth.final = ossl_sha3_final_default;
+    ctx.meth.squeeze = ossl_shake_squeeze_default;
+    return ossl_sha3_absorb(&ctx, in, inlen)
+        && ossl_sha3_squeeze(&ctx, out, outlen);
+}
+
+/*
+ * Encode (inlen_idx, outlen_idx) into a single test index and back.
+ * test index n = inlen_idx * NUM_OUTPUT_SIZES + outlen_idx
+ */
+static void decode_idx(const int n, size_t *inlen, size_t *outlen)
+{
+    *inlen = input_sizes[n / (int)NUM_OUTPUT_SIZES];
+    *outlen = output_sizes[n % (int)NUM_OUTPUT_SIZES];
+}
+
+/* One-shot tests */
+
+static int test_shake_x4_oneshot(const unsigned int bitlen, const int n)
+{
+    size_t inlen, outlen;
+    const unsigned char *in[NUM_LANES];
+    unsigned char x4_out[NUM_LANES][MAX_OUT];
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    int i;
+
+    decode_idx(n, &inlen, &outlen);
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = msg + i * LANE_STRIDE;
+
+    /* Ensure the lane inputs fit within the message buffer. */
+    if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE))
+        return 0;
+    if (!TEST_size_t_le(outlen, MAX_OUT))
+        return 0;
+
+    /* x4 single-call */
+    if (bitlen == 128)
+        ossl_sha3_shake128_x4(x4_out[0], x4_out[1], x4_out[2], x4_out[3],
+            outlen,
+            in[0], in[1], in[2], in[3], inlen);
+    else
+        ossl_sha3_shake256_x4(x4_out[0], x4_out[1], x4_out[2], x4_out[3],
+            outlen,
+            in[0], in[1], in[2], in[3], inlen);
+
+    /* scalar reference */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], outlen)))
+            return 0;
+
+    /* compare */
+    for (i = 0; i < NUM_LANES; i++) {
+        if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) {
+            TEST_info("SHAKE-%u x4 oneshot lane %d: inlen=%zu outlen=%zu",
+                bitlen, i, inlen, outlen);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int test_shake128_x4_oneshot(const int n)
+{
+    return test_shake_x4_oneshot(128, n);
+}
+
+static int test_shake256_x4_oneshot(const int n)
+{
+    return test_shake_x4_oneshot(256, n);
+}
+
+/* Incremental (init / absorb / finalize / squeeze) tests */
+
+static int test_shake_x4_incremental(const unsigned int bitlen, const int n)
+{
+    size_t inlen, outlen;
+    const unsigned char *in[NUM_LANES];
+    unsigned char x4_out[NUM_LANES][MAX_OUT];
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    KECCAK1600_X4_CTX ctx;
+    int i;
+
+    decode_idx(n, &inlen, &outlen);
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = msg + i * LANE_STRIDE;
+
+    if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE))
+        return 0;
+
+    /* x4 incremental */
+    if (bitlen == 128) {
+        ossl_sha3_shake128_x4_inc_init(&ctx);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake128_x4_inc_finalize(&ctx);
+        ossl_sha3_shake128_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    } else {
+        ossl_sha3_shake256_x4_inc_init(&ctx);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake256_x4_inc_finalize(&ctx);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    }
+
+    /* scalar reference */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], outlen)))
+            return 0;
+
+    for (i = 0; i < NUM_LANES; i++) {
+        if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) {
+            TEST_info("SHAKE-%u x4 incremental lane %d: inlen=%zu outlen=%zu",
+                bitlen, i, inlen, outlen);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int test_shake128_x4_incremental(const int n)
+{
+    return test_shake_x4_incremental(128, n);
+}
+
+static int test_shake256_x4_incremental(const int n)
+{
+    return test_shake_x4_incremental(256, n);
+}
+
+/* Multi-absorb tests */
+
+/*
+ * Split the input at every tested input size, absorbing the two halves
+ * in separate calls.  The split length is chosen as input_sizes[n] so that
+ * we exercise sub-block, at-block and multi-block split points.
+ *
+ * Full message length is fixed at the largest tested input size so that
+ * every split index is meaningful.
+ */
+static int test_shake_x4_multi_absorb(const unsigned int bitlen, const int n)
+{
+    const size_t total = input_sizes[NUM_INPUT_SIZES - 1];
+    const size_t split = input_sizes[n];
+    const size_t outlen = 64; /* fixed output length for this sub-test */
+    const unsigned char *in[NUM_LANES];
+    unsigned char x4_out[NUM_LANES][MAX_OUT];
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    KECCAK1600_X4_CTX ctx;
+    int i;
+
+    if (split > total)
+        return 1; /* nothing to test */
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = msg + i * LANE_STRIDE;
+
+    if (!TEST_size_t_le(total + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE))
+        return 0;
+
+    /* x4 split absorb */
+    if (bitlen == 128) {
+        ossl_sha3_shake128_x4_inc_init(&ctx);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx,
+            in[0], in[1], in[2], in[3], split);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx,
+            in[0] + split, in[1] + split, in[2] + split, in[3] + split,
+            total - split);
+        ossl_sha3_shake128_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    } else {
+        ossl_sha3_shake256_x4_inc_init(&ctx);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx,
+            in[0], in[1], in[2], in[3], split);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx,
+            in[0] + split, in[1] + split, in[2] + split, in[3] + split,
+            total - split);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    }
+
+    /* scalar reference (single absorb of full message) */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], total, ref_out[i], outlen)))
+            return 0;
+
+    for (i = 0; i < NUM_LANES; i++) {
+        if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) {
+            TEST_info("SHAKE-%u x4 multi-absorb lane %d: total=%zu split=%zu",
+                bitlen, i, total, split);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int test_shake128_x4_multi_absorb(const int n)
+{
+    return test_shake_x4_multi_absorb(128, n);
+}
+
+static int test_shake256_x4_multi_absorb(const int n)
+{
+    return test_shake_x4_multi_absorb(256, n);
+}
+
+/* Multi-squeeze tests */
+
+/*
+ * Squeeze in two successive calls and verify that the concatenated output
+ * matches a single scalar squeeze of the same total length.
+ * Parameterized over output_sizes[] for the first chunk; the second chunk
+ * is always 64 bytes so the total length varies.
+ */
+static int test_shake_x4_multi_squeeze(const unsigned int bitlen, const int n)
+{
+    const size_t inlen = 200; /* fixed input length */
+    const size_t chunk1 = output_sizes[n];
+    const size_t chunk2 = 64;
+    const size_t total = chunk1 + chunk2;
+    const unsigned char *in[NUM_LANES];
+    unsigned char x4_a[NUM_LANES][MAX_OUT]; /* first chunk              */
+    unsigned char x4_b[NUM_LANES][MAX_OUT]; /* second chunk             */
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    KECCAK1600_X4_CTX ctx;
+    int i;
+
+    if (!TEST_size_t_le(total, MAX_OUT))
+        return 0;
+    if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE))
+        return 0;
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = msg + i * LANE_STRIDE;
+
+    /* x4 two-shot squeeze */
+    if (bitlen == 128) {
+        ossl_sha3_shake128_x4_inc_init(&ctx);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3],
+            inlen);
+        /* first squeeze */
+        ossl_sha3_shake128_x4_inc_squeeze(x4_a[0], x4_a[1], x4_a[2], x4_a[3],
+            chunk1, &ctx);
+        /* second squeeze – context carries state from previous call */
+        ossl_sha3_shake128_x4_inc_squeeze(x4_b[0], x4_b[1], x4_b[2], x4_b[3],
+            chunk2, &ctx);
+    } else {
+        ossl_sha3_shake256_x4_inc_init(&ctx);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_a[0], x4_a[1], x4_a[2], x4_a[3],
+            chunk1, &ctx);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_b[0], x4_b[1], x4_b[2], x4_b[3],
+            chunk2, &ctx);
+    }
+
+    /* scalar reference – squeeze the full total in one call */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], total)))
+            return 0;
+
+    /* check first chunk, then second chunk */
+    for (i = 0; i < NUM_LANES; i++) {
+        if (!TEST_mem_eq(x4_a[i], chunk1, ref_out[i], chunk1)) {
+            TEST_info("SHAKE-%u x4 multi-squeeze lane %d chunk1: "
+                      "inlen=%zu chunk1=%zu chunk2=%zu",
+                bitlen, i, inlen, chunk1, chunk2);
+            return 0;
+        }
+        if (!TEST_mem_eq(x4_b[i], chunk2, ref_out[i] + chunk1, chunk2)) {
+            TEST_info("SHAKE-%u x4 multi-squeeze lane %d chunk2: "
+                      "inlen=%zu chunk1=%zu chunk2=%zu",
+                bitlen, i, inlen, chunk1, chunk2);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int test_shake128_x4_multi_squeeze(const int n)
+{
+    return test_shake_x4_multi_squeeze(128, n);
+}
+
+static int test_shake256_x4_multi_squeeze(const int n)
+{
+    return test_shake_x4_multi_squeeze(256, n);
+}
+
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
+
+/* Test entry point */
+
+int setup_tests(void)
+{
+    size_t i;
+
+    /* Fill the message buffer with a deterministic non-zero pattern. */
+    for (i = 0; i < MSG_BUF_SIZE; i++)
+        msg[i] = (unsigned char)(251 * i + 17);
+
+#ifdef OPENSSL_CPUID_OBJ
+    OPENSSL_cpuid_setup();
+#endif
+
+#if !defined(KECCAK1600_ASM)                                                               \
+    || !(defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    || defined(OPENSSL_NO_ASM)
+    return TEST_skip("SHAKE x4 API not available in this build");
+#else
+    if (!SHA3_avx512vl_capable()) {
+        return TEST_skip("AVX-512VL not available; skipping SHAKE x4 tests");
+    }
+
+    ADD_ALL_TESTS(test_shake128_x4_oneshot,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+    ADD_ALL_TESTS(test_shake256_x4_oneshot,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+
+    ADD_ALL_TESTS(test_shake128_x4_incremental,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+    ADD_ALL_TESTS(test_shake256_x4_incremental,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+
+    ADD_ALL_TESTS(test_shake128_x4_multi_absorb, (int)NUM_INPUT_SIZES);
+    ADD_ALL_TESTS(test_shake256_x4_multi_absorb, (int)NUM_INPUT_SIZES);
+
+    ADD_ALL_TESTS(test_shake128_x4_multi_squeeze, (int)NUM_OUTPUT_SIZES);
+    ADD_ALL_TESTS(test_shake256_x4_multi_squeeze, (int)NUM_OUTPUT_SIZES);
+#endif
+
+    return 1;
+}

From 0822add44e79daa16545023e1c495a35f2ece888 Mon Sep 17 00:00:00 2001
From: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Tue, 28 Apr 2026 13:16:35 +0000
Subject: [PATCH 3/5] Address PR feedback

Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
---
 crypto/ml_dsa/ml_dsa_sample.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/crypto/ml_dsa/ml_dsa_sample.c b/crypto/ml_dsa/ml_dsa_sample.c
index d59261e404254..3eef3c0176b1b 100644
--- a/crypto/ml_dsa/ml_dsa_sample.c
+++ b/crypto/ml_dsa/ml_dsa_sample.c
@@ -203,7 +203,7 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md,
  *            in the range of 0..q-1.
  * @returns 1 if the matrix was generated, or 0 on error.
  */
-int matrix_expand_A_scalar(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+static int matrix_expand_A_scalar(EVP_MD_CTX *g_ctx, const EVP_MD *md,
     const uint8_t *rho, MATRIX *out)
 {
     int ret = 0;
@@ -245,7 +245,7 @@ int matrix_expand_A_scalar(EVP_MD_CTX *g_ctx, const EVP_MD *md,
  *           the range (q-eta)..0..eta
  * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
  */
-int vector_expand_S_scalar(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
+static int vector_expand_S_scalar(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
     const uint8_t *seed, VECTOR *s1, VECTOR *s2)
 {
     int ret = 0;
@@ -381,7 +381,7 @@ int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_l
     return 1;
 }
 
-void vector_expand_mask_scalar(VECTOR *out, const uint8_t *rho_prime,
+static void vector_expand_mask_scalar(VECTOR *out, const uint8_t *rho_prime,
     size_t rho_prime_len, uint32_t kappa, uint32_t gamma1,
     EVP_MD_CTX *h_ctx, const EVP_MD *md)
 {
@@ -410,7 +410,9 @@ static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_generic_meth = {
 
 const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void)
 {
-#if defined(KECCAK1600_ASM) && defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
     return ossl_ml_dsa_sample_x86_64_ops();
 #else
     return ossl_ml_dsa_sample_generic_ops();
@@ -422,7 +424,9 @@ const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void)
     return &ml_dsa_sample_generic_meth;
 }
 
-#if defined(KECCAK1600_ASM) && defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
 #include "ml_dsa_sample_hw_x86_64.inc"
 #else
 const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void)

From 9791bc172ef843f6693fe57d98ce5067cd349eb5 Mon Sep 17 00:00:00 2001
From: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Tue, 28 Apr 2026 13:54:56 +0000
Subject: [PATCH 4/5] Fix windows warnings

Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
---
 crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
index 527e0456d1949..cc36c489a761f 100644
--- a/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
+++ b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
@@ -36,7 +36,7 @@ static ossl_unused int rej_ntt_poly_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
     (void)md;
 
     for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++)
-        done_mask |= (1 << lane);
+        done_mask |= ((size_t)1 << lane);
 
     ossl_sha3_shake128_x4_inc_init(&ctx);
     ossl_sha3_shake128_x4_inc_absorb(&ctx, seeds[0], seeds[1],
@@ -48,7 +48,7 @@ static ossl_unused int rej_ntt_poly_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
             blocks[2], blocks[3], SHAKE128_BLOCKSIZE, &ctx);
 
         for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) {
-            if (done_mask & (1 << lane))
+            if (done_mask & ((size_t)1 << lane))
                 continue;
 
             const uint8_t *b = blocks[lane];
@@ -62,7 +62,7 @@ static ossl_unused int rej_ntt_poly_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
             }
 
             if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS)
-                done_mask |= (1 << lane);
+                done_mask |= ((size_t)1 << lane);
         }
     }
 
@@ -145,7 +145,7 @@ static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md,
     (void)md;
 
     for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++)
-        done_mask |= (1 << lane);
+        done_mask |= ((size_t)1 << lane);
 
     ossl_sha3_shake256_x4_inc_init(&ctx);
     ossl_sha3_shake256_x4_inc_absorb(&ctx, seeds[0], seeds[1],
@@ -157,7 +157,7 @@ static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md,
             blocks[2], blocks[3], SHAKE256_BLOCKSIZE, &ctx);
 
         for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) {
-            if (done_mask & (1 << lane))
+            if (done_mask & ((size_t)1 << lane))
                 continue;
 
             const uint8_t *b = blocks[lane];
@@ -171,7 +171,7 @@ static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md,
                     coeff_idx[lane]++;
 
                 if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) {
-                    done_mask |= (1 << lane);
+                    done_mask |= ((size_t)1 << lane);
                     break;
                 }
 
@@ -179,7 +179,7 @@ static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md,
                     coeff_idx[lane]++;
 
                 if (coeff_idx[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS) {
-                    done_mask |= (1 << lane);
+                    done_mask |= ((size_t)1 << lane);
                     break;
                 }
             }

From f0f91f005ed37d0d148a3db32b6a58e8014223fa Mon Sep 17 00:00:00 2001
From: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Tue, 28 Apr 2026 15:41:29 +0000
Subject: [PATCH 5/5] .github: add AVX512VL workflow using Intel SDE

Add a new CI workflow that runs AVX512VL specific tests under Intel SDE
v10.8, since GitHub Actions runners do not currently have AVX512 hardware.
SDE emulates AVX512 instructions and spoofs CPUID so the AVX512 code
paths can be exercised.

Two jobs are included: linux (ubuntu-latest) and windows (windows-2022).
Each job builds OpenSSL with no-shared and enable-fips, then runs the
following tests under `sde64 -skx` (Skylake-X, AVX512F+BW+DQ+VL):

- ml_dsa_internal_test: exercises AVX512VL ML-DSA sampling
- sha3_x4_internal_test: exercises AVX512VL SHAKE x4 functions
- openssl fipsinstall: runs the full FIPS KAT suite (including ML-DSA
  and SHA3 self-tests) against the FIPS provider under emulation

Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
---
 .github/workflows/avx512-sde.yml | 137 +++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 .github/workflows/avx512-sde.yml

diff --git a/.github/workflows/avx512-sde.yml b/.github/workflows/avx512-sde.yml
new file mode 100644
index 0000000000000..4c47ed1aaf451
--- /dev/null
+++ b/.github/workflows/avx512-sde.yml
@@ -0,0 +1,137 @@
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# Run AVX512VL-specific tests under Intel SDE.
+#
+# GitHub Actions runners currently do not have AVX512 hardware.
+# Intel SDE emulates AVX512 instructions and spoofs CPUID,
+# so AVX512 code paths are exercised.
+#
+# To update Intel SDE: find the new mirror ID and file date from
+# https://www.intel.com/content/www/us/en/download/684897
+# and update the three env vars below.
+
+name: AVX512 tests via Intel SDE
+
+on: [pull_request, push]
+
+permissions:
+  contents: read
+
+env:
+  SDE_VERSION: 10.8.0
+  SDE_DATE: 2026-03-15
+  SDE_MIRROR_ID: 915934
+
+jobs:
+  linux:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - name: install NASM
+        run: sudo apt-get install -y nasm
+
+      - name: install Intel SDE
+        run: |
+          SDE_URL="https://downloadmirror.intel.com/${SDE_MIRROR_ID}/sde-external-${SDE_VERSION}-${SDE_DATE}-lin.tar.xz"
+          SDE_SHA256="50b320cd226acef7a491f5b321fc1be3c3c7984f9e27a456e64894b5b0979dd3"
+          curl -fsSL -o /tmp/sde.tar.xz "$SDE_URL"
+          echo "$SDE_SHA256  /tmp/sde.tar.xz" | sha256sum -c -
+          mkdir /tmp/sde
+          tar -xf /tmp/sde.tar.xz -C /tmp/sde/
+          sudo mv /tmp/sde/sde-external-${SDE_VERSION}-${SDE_DATE}-lin /opt/sde
+          echo "/opt/sde" >> "$GITHUB_PATH"
+
+      - name: config
+        run: |
+          ./config --banner=Configured --strict-warnings no-shared enable-fips
+
+      - name: build
+        run: make -j4
+
+      - name: show CPU and OpenSSL build info
+        run: |
+          cat /proc/cpuinfo | grep -m1 "model name"
+          sde64 -skx -- ./apps/openssl version -c
+
+      - name: ml_dsa_internal_test (AVX512VL via SDE)
+        run: sde64 -skx -- ./test/ml_dsa_internal_test
+
+      - name: sha3_x4_internal_test (AVX512VL via SDE)
+        run: sde64 -skx -- ./test/sha3_x4_internal_test
+
+      - name: fipsinstall (FIPS KAT via SDE)
+        run: sde64 -skx -- ./apps/openssl fipsinstall -module ./providers/fips.so -out /tmp/fipsmodule.cnf -provider_name fips
+
+  windows:
+    runs-on: windows-2022
+    env:
+      VCVARS: C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - name: install NASM
+        run: |
+          choco install nasm
+          "C:\Program Files\NASM" | Out-File -FilePath "$env:GITHUB_PATH" -Append
+
+      - name: install JOM
+        run: choco install jom
+
+      - name: install Intel SDE
+        run: |
+          $url = "https://downloadmirror.intel.com/$env:SDE_MIRROR_ID/sde-external-$env:SDE_VERSION-$env:SDE_DATE-win.tar.xz"
+          $expected = "176F87C80EB42BB91B73E1428F4A0FD067DF322F901F9B4359B20B86B92C2BAE"
+          curl.exe -fsSL -o sde-win.tar.xz $url
+          $actual = (Get-FileHash sde-win.tar.xz -Algorithm SHA256).Hash
+          if ($actual -ne $expected) { throw "SDE SHA256 mismatch: got $actual" }
+          & "C:\Program Files\7-Zip\7z.exe" x sde-win.tar.xz -so | & "C:\Program Files\7-Zip\7z.exe" x -si -ttar -o"C:\sde"
+          $sdeRoot = "C:\sde\sde-external-$env:SDE_VERSION-$env:SDE_DATE-win"
+          if (-not (Test-Path "$sdeRoot\sde.exe")) { throw "sde.exe not found in $sdeRoot" }
+          "$sdeRoot" | Out-File -FilePath $env:GITHUB_PATH -Append
+
+      - name: prepare build directory
+        run: mkdir _build
+
+      - name: config
+        working-directory: _build
+        shell: cmd
+        run: |
+          call "%VCVARS%"
+          perl ..\Configure --banner=Configured --strict-warnings no-shared enable-fips no-makedepend
+
+      - name: build
+        working-directory: _build
+        shell: cmd
+        run: |
+          call "%VCVARS%"
+          jom /j4 /S
+
+      - name: show CPU and OpenSSL build info
+        working-directory: _build
+        run: sde -skx -- apps\openssl.exe version -c
+
+      - name: ml_dsa_internal_test (AVX512VL via SDE)
+        working-directory: _build
+        shell: cmd
+        run: sde -skx -- test\ml_dsa_internal_test.exe
+
+      - name: sha3_x4_internal_test (AVX512VL via SDE)
+        working-directory: _build
+        shell: cmd
+        run: sde -skx -- test\sha3_x4_internal_test.exe
+
+      - name: fipsinstall (FIPS KAT via SDE)
+        working-directory: _build
+        shell: cmd
+        run: sde -skx -- apps\openssl.exe fipsinstall -module providers\fips.dll -out fipsmodule.cnf -provider_name fips