diff --git a/.github/workflows/avx512-sde.yml b/.github/workflows/avx512-sde.yml
new file mode 100644
index 0000000000000..4c47ed1aaf451
--- /dev/null
+++ b/.github/workflows/avx512-sde.yml
@@ -0,0 +1,137 @@
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# Run AVX512VL-specific tests under Intel SDE.
+#
+# GitHub Actions runners currently do not have AVX512 hardware.
+# Intel SDE emulates AVX512 instructions and spoofs CPUID,
+# so AVX512 code paths are exercised.
+#
+# To update Intel SDE: find the new mirror ID and file date from
+# https://www.intel.com/content/www/us/en/download/684897
+# and update the three env vars below plus both pinned SHA256 checksums.
+
+name: AVX512 tests via Intel SDE
+
+on: [pull_request, push]
+
+permissions:
+  contents: read
+
+env:  # Intel SDE release coordinates used to build the download URLs
+  SDE_VERSION: "10.8.0"
+  SDE_DATE: "2026-03-15"  # quoted: a bare ISO date is typed as a timestamp by YAML 1.1 loaders
+  SDE_MIRROR_ID: "915934"  # quoted: an ID, not a number
+
+jobs:
+  linux:  # static FIPS-enabled build, then AVX512VL paths exercised under "sde64 -skx"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - name: install NASM  # refresh the package index first so the install cannot hit stale mirrors
+        run: sudo apt-get update && sudo apt-get install -y nasm
+
+      - name: install Intel SDE  # download, verify the pinned SHA256, unpack to /opt/sde, add to PATH
+        run: |
+          SDE_URL="https://downloadmirror.intel.com/${SDE_MIRROR_ID}/sde-external-${SDE_VERSION}-${SDE_DATE}-lin.tar.xz"
+          SDE_SHA256="50b320cd226acef7a491f5b321fc1be3c3c7984f9e27a456e64894b5b0979dd3"
+          curl -fsSL -o /tmp/sde.tar.xz "$SDE_URL"
+          echo "$SDE_SHA256  /tmp/sde.tar.xz" | sha256sum -c -
+          mkdir /tmp/sde
+          tar -xf /tmp/sde.tar.xz -C /tmp/sde/
+          sudo mv /tmp/sde/sde-external-${SDE_VERSION}-${SDE_DATE}-lin /opt/sde
+          echo "/opt/sde" >> "$GITHUB_PATH"
+
+      - name: config
+        run: |
+          ./config --banner=Configured --strict-warnings no-shared enable-fips
+
+      - name: build
+        run: make -j4
+
+      - name: show CPU and OpenSSL build info  # host CPU first, then what OpenSSL sees under emulation
+        run: |
+          grep -m1 "model name" /proc/cpuinfo
+          sde64 -skx -- ./apps/openssl version -c
+
+      - name: ml_dsa_internal_test (AVX512VL via SDE)
+        run: sde64 -skx -- ./test/ml_dsa_internal_test
+
+      - name: sha3_x4_internal_test (AVX512VL via SDE)
+        run: sde64 -skx -- ./test/sha3_x4_internal_test
+
+      - name: fipsinstall (FIPS KAT via SDE)
+        run: sde64 -skx -- ./apps/openssl fipsinstall -module ./providers/fips.so -out /tmp/fipsmodule.cnf -provider_name fips
+
+  windows:  # MSVC build in _build, then the same SDE-driven checks as the linux job
+    runs-on: windows-2022
+    env:  # job-level variable, consumed as %VCVARS% by the cmd steps below
+      VCVARS: C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - name: install NASM  # also appends the NASM directory to PATH for later steps
+        run: |
+          choco install nasm
+          "C:\Program Files\NASM" | Out-File -FilePath "$env:GITHUB_PATH" -Append
+
+      - name: install JOM
+        run: choco install jom
+
+      - name: install Intel SDE  # download, check the pinned SHA256, unpack with 7-Zip, add to PATH
+        run: |
+          $url = "https://downloadmirror.intel.com/$env:SDE_MIRROR_ID/sde-external-$env:SDE_VERSION-$env:SDE_DATE-win.tar.xz"
+          $expected = "176F87C80EB42BB91B73E1428F4A0FD067DF322F901F9B4359B20B86B92C2BAE"
+          curl.exe -fsSL -o sde-win.tar.xz $url
+          $actual = (Get-FileHash sde-win.tar.xz -Algorithm SHA256).Hash
+          if ($actual -ne $expected) { throw "SDE SHA256 mismatch: got $actual" }
+          & "C:\Program Files\7-Zip\7z.exe" x sde-win.tar.xz -so | & "C:\Program Files\7-Zip\7z.exe" x -si -ttar -o"C:\sde"
+          $sdeRoot = "C:\sde\sde-external-$env:SDE_VERSION-$env:SDE_DATE-win"
+          if (-not (Test-Path "$sdeRoot\sde.exe")) { throw "sde.exe not found in $sdeRoot" }
+          "$sdeRoot" | Out-File -FilePath $env:GITHUB_PATH -Append
+
+      - name: prepare build directory  # the build is out-of-tree, in _build
+        run: mkdir _build
+
+      - name: config
+        working-directory: _build
+        shell: cmd
+        run: |
+          call "%VCVARS%"
+          perl ..\Configure --banner=Configured --strict-warnings no-shared enable-fips no-makedepend
+
+      - name: build
+        working-directory: _build
+        shell: cmd
+        run: |
+          call "%VCVARS%"
+          jom /j4 /S
+
+      - name: show CPU and OpenSSL build info  # NOTE(review): no "shell: cmd" here, unlike the other SDE steps
+        working-directory: _build
+        run: sde -skx -- apps\openssl.exe version -c
+
+      - name: ml_dsa_internal_test (AVX512VL via SDE)
+        working-directory: _build
+        shell: cmd
+        run: sde -skx -- test\ml_dsa_internal_test.exe
+
+      - name: sha3_x4_internal_test (AVX512VL via SDE)
+        working-directory: _build
+        shell: cmd
+        run: sde -skx -- test\sha3_x4_internal_test.exe
+
+      - name: fipsinstall (FIPS KAT via SDE)
+        working-directory: _build
+        shell: cmd
+        run: sde -skx -- apps\openssl.exe fipsinstall -module providers\fips.dll -out fipsmodule.cnf -provider_name fips
diff --git a/CHANGES.md b/CHANGES.md
index 049c0e7288710..f7ee59641cf2c 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -31,6 +31,10 @@ OpenSSL Releases
 
 ### Changes between 4.0 and 4.1 [xx XXX xxxx]
 
+ * Added AVX512VL-optimized SHAKE x4 sampling for ML-DSA on x86_64.
+
+   *Marcel Cornu and Tomasz Kantecki*
+
  * Added test framework for testing function memory allocation failures.
 
    *Jakub Zelenka*
diff --git a/crypto/ml_dsa/ml_dsa_hash.h b/crypto/ml_dsa/ml_dsa_hash.h
index 7625d3367d9c3..4280ef67c8897 100644
--- a/crypto/ml_dsa/ml_dsa_hash.h
+++ b/crypto/ml_dsa/ml_dsa_hash.h
@@ -7,6 +7,9 @@
  * https://www.openssl.org/source/license.html
  */
 
+#ifndef OSSL_CRYPTO_ML_DSA_HASH_H
+#define OSSL_CRYPTO_ML_DSA_HASH_H
+
 #include <openssl/evp.h>
 
 static ossl_inline ossl_unused int
@@ -39,3 +42,5 @@ shake_xof_3(EVP_MD_CTX *ctx, const EVP_MD *md, const uint8_t *in1, size_t in1_le
         && EVP_DigestUpdate(ctx, in3, in3_len)
         && EVP_DigestSqueeze(ctx, out, out_len);
 }
+
+#endif /* OSSL_CRYPTO_ML_DSA_HASH_H */
diff --git a/crypto/ml_dsa/ml_dsa_key.c b/crypto/ml_dsa/ml_dsa_key.c
index 24fa7596e2f77..74488365c31f2 100644
--- a/crypto/ml_dsa/ml_dsa_key.c
+++ b/crypto/ml_dsa/ml_dsa_key.c
@@ -332,7 +332,7 @@ int ossl_ml_dsa_key_has(const ML_DSA_KEY *key, int selection)
  * @returns 1 on success, or 0 on failure.
  */
 static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx,
-    VECTOR *t1, VECTOR *t0)
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops, VECTOR *t1, VECTOR *t0)
 {
     int ret = 0;
     const ML_DSA_PARAMS *params = key->params;
@@ -351,7 +351,7 @@ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx,
     matrix_init(&a_ntt, s1_ntt.poly + l, k, l);
 
     /* Using rho generate A' = A in NTT form */
-    if (!matrix_expand_A(md_ctx, key->shake128_md, key->rho, &a_ntt))
+    if (!sample_ops->matrix_expand_A(md_ctx, key->shake128_md, key->rho, &a_ntt))
         goto err;
 
     /* t = NTT_inv(A' * NTT(s1)) + s2 */
@@ -376,6 +376,7 @@ static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *md_ctx,
 int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     VECTOR t0;
     EVP_MD_CTX *md_ctx = NULL;
 
@@ -383,7 +384,7 @@ int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key)
         return 0;
     ret = ((md_ctx = EVP_MD_CTX_new()) != NULL)
         && ossl_ml_dsa_key_pub_alloc(key) /* allocate space for t1 */
-        && public_from_private(key, md_ctx, &key->t1, &t0)
+        && public_from_private(key, md_ctx, sample_ops, &key->t1, &t0)
         && vector_equal(&t0, &key->t0) /* compare the generated t0 to the expected */
         && ossl_ml_dsa_pk_encode(key)
         && shake_xof(md_ctx, key->shake256_md,
@@ -397,6 +398,7 @@ int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key)
 int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     VECTOR t1, t0;
     POLY *polys = NULL;
     uint32_t k = (uint32_t)key->params->k;
@@ -414,7 +416,7 @@ int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
 
     vector_init(&t1, polys, k);
     vector_init(&t0, polys + k, k);
-    if (!public_from_private(key, md_ctx, &t1, &t0))
+    if (!public_from_private(key, md_ctx, sample_ops, &t1, &t0))
         goto err;
 
     ret = vector_equal(&t1, &key->t1) && vector_equal(&t0, &key->t0);
@@ -435,6 +437,7 @@ int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
 static int keygen_internal(ML_DSA_KEY *out)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     uint8_t augmented_seed[ML_DSA_SEED_BYTES + 2];
     uint8_t expanded_seed[ML_DSA_RHO_BYTES + ML_DSA_PRIV_SEED_BYTES + ML_DSA_K_BYTES];
     const uint8_t *const rho = expanded_seed; /* p = Public Random Seed */
@@ -461,8 +464,9 @@ static int keygen_internal(ML_DSA_KEY *out)
     memcpy(out->rho, rho, sizeof(out->rho));
     memcpy(out->K, K, sizeof(out->K));
 
-    ret = vector_expand_S(md_ctx, out->shake256_md, params->eta, priv_seed, &out->s1, &out->s2)
-        && public_from_private(out, md_ctx, &out->t1, &out->t0)
+    ret = sample_ops->vector_expand_S(md_ctx, out->shake256_md, params->eta,
+              priv_seed, &out->s1, &out->s2)
+        && public_from_private(out, md_ctx, sample_ops, &out->t1, &out->t0)
         && ossl_ml_dsa_pk_encode(out)
         && shake_xof(md_ctx, out->shake256_md, out->pub_encoding, out->params->pk_len,
             out->tr, sizeof(out->tr))
diff --git a/crypto/ml_dsa/ml_dsa_local.h b/crypto/ml_dsa/ml_dsa_local.h
index bbaa6dafc75a9..34a83f8ffbe0e 100644
--- a/crypto/ml_dsa/ml_dsa_local.h
+++ b/crypto/ml_dsa/ml_dsa_local.h
@@ -59,10 +59,23 @@ typedef struct vector_st VECTOR;
 typedef struct matrix_st MATRIX;
 typedef struct ml_dsa_sig_st ML_DSA_SIG;
 
-int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+typedef int(ML_DSA_MATRIX_EXPAND_A_FN)(EVP_MD_CTX *g_ctx, const EVP_MD *md,
     const uint8_t *rho, MATRIX *out);
-int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
-    const uint8_t *seed, VECTOR *s1, VECTOR *s2);
+typedef int(ML_DSA_VECTOR_EXPAND_S_FN)(EVP_MD_CTX *h_ctx, const EVP_MD *md,
+    int eta, const uint8_t *seed, VECTOR *s1, VECTOR *s2);
+typedef void(ML_DSA_VECTOR_EXPAND_MASK_FN)(VECTOR *out, const uint8_t *rho_prime,
+    size_t rho_prime_len, uint32_t kappa, uint32_t gamma1,
+    EVP_MD_CTX *h_ctx, const EVP_MD *md);
+
+typedef struct ossl_ml_dsa_sample_ops_st { /* table of interchangeable sampling routines */
+    ML_DSA_MATRIX_EXPAND_A_FN *matrix_expand_A; /* fills matrix |out| from rho */
+    ML_DSA_VECTOR_EXPAND_S_FN *vector_expand_S; /* fills vectors s1 and s2 from a seed */
+    ML_DSA_VECTOR_EXPAND_MASK_FN *vector_expand_mask; /* fills |out| from rho_prime and kappa */
+} OSSL_ML_DSA_SAMPLE_OPS;
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void); /* table callers should use */
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void); /* portable table */
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void); /* x86_64 table, or the generic one */
 void ossl_ml_dsa_matrix_mult_vector(const MATRIX *matrix_kl, const VECTOR *vl,
     VECTOR *vk);
 int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len,
diff --git a/crypto/ml_dsa/ml_dsa_matrix.h b/crypto/ml_dsa/ml_dsa_matrix.h
index 0352ecac7afc0..cd9005fc87177 100644
--- a/crypto/ml_dsa/ml_dsa_matrix.h
+++ b/crypto/ml_dsa/ml_dsa_matrix.h
@@ -35,10 +35,3 @@ matrix_mult_vector(const MATRIX *a, const VECTOR *s, VECTOR *t)
 {
     ossl_ml_dsa_matrix_mult_vector(a, s, t);
 }
-
-static ossl_inline ossl_unused int
-matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, const uint8_t *rho,
-    MATRIX *out)
-{
-    return ossl_ml_dsa_matrix_expand_A(g_ctx, md, rho, out);
-}
diff --git a/crypto/ml_dsa/ml_dsa_sample.c b/crypto/ml_dsa/ml_dsa_sample.c
index 5d9dc84a54fa3..3eef3c0176b1b 100644
--- a/crypto/ml_dsa/ml_dsa_sample.c
+++ b/crypto/ml_dsa/ml_dsa_sample.c
@@ -8,6 +8,7 @@
  */
 
 #include <openssl/byteorder.h>
+#include <openssl/crypto.h>
 #include "ml_dsa_local.h"
 #include "ml_dsa_vector.h"
 #include "ml_dsa_matrix.h"
@@ -35,6 +36,10 @@ typedef int(COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out);
 static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4;
 static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2;
 
+static ML_DSA_MATRIX_EXPAND_A_FN matrix_expand_A_scalar;
+static ML_DSA_VECTOR_EXPAND_S_FN vector_expand_S_scalar;
+static ML_DSA_VECTOR_EXPAND_MASK_FN vector_expand_mask_scalar;
+
 /**
  * @brief Combine 3 bytes to form an coefficient.
  * See FIPS 204, Algorithm 14, CoeffFromThreeBytes()
@@ -198,7 +203,7 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md,
  *            in the range of 0..q-1.
  * @returns 1 if the matrix was generated, or 0 on error.
  */
-int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+static int matrix_expand_A_scalar(EVP_MD_CTX *g_ctx, const EVP_MD *md,
     const uint8_t *rho, MATRIX *out)
 {
     int ret = 0;
@@ -208,7 +213,6 @@ int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
 
     /* The seed used for each matrix element is rho + column_index + row_index */
     memcpy(derived_seed, rho, ML_DSA_RHO_BYTES);
-
     for (i = 0; i < out->k; i++) {
         for (j = 0; j < out->l; j++) {
             derived_seed[ML_DSA_RHO_BYTES + 1] = (uint8_t)i;
@@ -241,7 +245,7 @@ int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
  *           the range (q-eta)..0..eta
  * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
  */
-int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
+static int vector_expand_S_scalar(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
     const uint8_t *seed, VECTOR *s1, VECTOR *s2)
 {
     int ret = 0;
@@ -376,3 +380,57 @@ int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_l
     }
     return 1;
 }
+
+/* Fills |out| one polynomial at a time; poly n uses seed rho_prime || LE16(kappa + n). */
+static void vector_expand_mask_scalar(VECTOR *out, const uint8_t *rho_prime,
+    size_t rho_prime_len, uint32_t kappa, uint32_t gamma1,
+    EVP_MD_CTX *h_ctx, const EVP_MD *md)
+{
+    uint8_t seed_buf[ML_DSA_RHO_PRIME_BYTES + 2];
+    size_t n;
+
+    (void)rho_prime_len; /* the prefix length is fixed at ML_DSA_RHO_PRIME_BYTES */
+    memcpy(seed_buf, rho_prime, ML_DSA_RHO_PRIME_BYTES);
+
+    for (n = 0; n < out->num_poly; n++) {
+        const size_t nonce = kappa + n;
+
+        seed_buf[ML_DSA_RHO_PRIME_BYTES] = (uint8_t)nonce; /* low byte first */
+        seed_buf[ML_DSA_RHO_PRIME_BYTES + 1] = (uint8_t)(nonce >> 8);
+        poly_expand_mask(&out->poly[n], seed_buf, sizeof(seed_buf),
+            gamma1, h_ctx, md);
+    }
+}
+
+static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_generic_meth = { /* portable table */
+    matrix_expand_A_scalar, /* matrix_expand_A */
+    vector_expand_S_scalar, /* vector_expand_S */
+    vector_expand_mask_scalar /* vector_expand_mask */
+};
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_ops(void)
+{
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+    return ossl_ml_dsa_sample_x86_64_ops(); /* may still pick the generic table at run time */
+#else
+    return ossl_ml_dsa_sample_generic_ops(); /* build has no x86_64 Keccak assembly */
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
+}
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_generic_ops(void)
+{
+    return &ml_dsa_sample_generic_meth; /* the *_scalar implementations in this file */
+}
+
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+#include "ml_dsa_sample_hw_x86_64.inc" /* defines ossl_ml_dsa_sample_x86_64_ops() */
+#else
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void)
+{
+    return ossl_ml_dsa_sample_generic_ops(); /* stub keeps the symbol defined on other builds */
+}
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
diff --git a/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
new file mode 100644
index 0000000000000..cc36c489a761f
--- /dev/null
+++ b/crypto/ml_dsa/ml_dsa_sample_hw_x86_64.inc
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#define ML_DSA_SHAKE_X4_BATCH_SIZE 4 /* SHAKE lanes processed per x4 call */
+#define ML_DSA_SHAKE_X4_DONE_MASK ((1 << ML_DSA_SHAKE_X4_BATCH_SIZE) - 1) /* one "finished" bit per lane */
+#define ML_DSA_EXPAND_MASK_BYTES_PER_COEFF 32 /* NOTE(review): multiplied by a bit width below, so presumably 256 coefficients / 8 - confirm */
+#define ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_19 20 /* NOTE(review): looks like bits per packed coefficient for gamma1 = 2^19, not a count */
+#define ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_17 18 /* NOTE(review): likewise, presumably for gamma1 = 2^17 */
+#define ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19 \
+    (ML_DSA_EXPAND_MASK_BYTES_PER_COEFF * ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_19)
+#define ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_17 \
+    (ML_DSA_EXPAND_MASK_BYTES_PER_COEFF * ML_DSA_EXPAND_MASK_COEFFS_GAMMA1_17)
+#define ML_DSA_EXPAND_MASK_BUF_SIZE(gamma1)         \
+    ((gamma1) == ML_DSA_GAMMA1_TWO_POWER_19         \
+            ? ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19 \
+            : ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_17) /* XOF bytes squeezed per polynomial */
+
+/*
+ * Rejection-samples up to four polynomials in parallel from SHAKE128 x4.
+ * Lanes [0, count) are written to outs[lane]; lanes >= count are hashed but
+ * never written.  |g_ctx| and |md| are unused.  Always returns 1.
+ */
+static ossl_unused int rej_ntt_poly_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t seed_len,
+    POLY *outs[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t count)
+{
+    KECCAK1600_X4_CTX ctx;
+    uint8_t blocks[ML_DSA_SHAKE_X4_BATCH_SIZE][SHAKE128_BLOCKSIZE];
+    int filled[ML_DSA_SHAKE_X4_BATCH_SIZE] = { 0, 0, 0, 0 };
+    size_t finished = 0;
+    size_t lane;
+
+    (void)g_ctx;
+    (void)md;
+    for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++)
+        finished |= ((size_t)1 << lane); /* idle lanes start out finished */
+
+    ossl_sha3_shake128_x4_inc_init(&ctx);
+    ossl_sha3_shake128_x4_inc_absorb(&ctx, seeds[0], seeds[1],
+        seeds[2], seeds[3], seed_len);
+    ossl_sha3_shake128_x4_inc_finalize(&ctx);
+
+    while (finished != ML_DSA_SHAKE_X4_DONE_MASK) {
+        ossl_sha3_shake128_x4_inc_squeeze(blocks[0], blocks[1],
+            blocks[2], blocks[3], SHAKE128_BLOCKSIZE, &ctx);
+        for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) {
+            const size_t lane_bit = (size_t)1 << lane;
+            const uint8_t *p = blocks[lane];
+            const uint8_t *const stop = p + SHAKE128_BLOCKSIZE;
+
+            if (finished & lane_bit)
+                continue;
+            while (p < stop && filled[lane] < ML_DSA_NUM_POLY_COEFFICIENTS) {
+                if (coeff_from_three_bytes(p, &outs[lane]->coeff[filled[lane]]))
+                    filled[lane]++; /* only accepted candidates advance */
+                p += 3;
+            }
+            if (filled[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS)
+                finished |= lane_bit;
+        }
+    }
+    return 1;
+}
+
+/*
+ * Multi-buffer ExpandMask: fills |out| with up to four polynomials per
+ * SHAKE256 x4 call.  Polynomial n is decoded from buf_size bytes of XOF
+ * output for the seed rho_prime || LE16(kappa + n).
+ *
+ * Only the first ML_DSA_RHO_PRIME_BYTES of |rho_prime| are read;
+ * |rho_prime_len|, |h_ctx| and |md| are unused.
+ *
+ * One loop serves full and partial batches.  All four counters are rewritten
+ * on every pass, so a short final batch never hashes stale or uninitialised
+ * seed bytes; surplus lanes are hashed and their output is simply dropped.
+ *
+ * NOTE(review): any status returned by ossl_ml_dsa_poly_decode_expand_mask()
+ * is ignored here, and derived_seeds/buffers are not wiped on return -
+ * confirm both are acceptable for this data.
+ */
+static void vector_expand_mask_mb(VECTOR *out, const uint8_t *rho_prime,
+    const size_t rho_prime_len, const uint32_t kappa, const uint32_t gamma1,
+    EVP_MD_CTX *h_ctx, const EVP_MD *md)
+{
+    size_t i, b;
+    const size_t num_polys = out->num_poly;
+    uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_RHO_PRIME_BYTES + 2];
+    const size_t seed_len = sizeof(derived_seeds[0]);
+    const size_t buf_size = ML_DSA_EXPAND_MASK_BUF_SIZE(gamma1);
+    uint8_t buffers[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_EXPAND_MASK_BUF_SIZE_GAMMA1_19];
+
+    (void)rho_prime_len;
+    (void)h_ctx;
+    (void)md;
+
+    /* Every lane shares the rho_prime prefix; only the trailing counter differs. */
+    for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++)
+        memcpy(derived_seeds[b], rho_prime, ML_DSA_RHO_PRIME_BYTES);
+
+    for (i = 0; i < num_polys; i += ML_DSA_SHAKE_X4_BATCH_SIZE) {
+        size_t lanes = num_polys - i;
+
+        if (lanes > ML_DSA_SHAKE_X4_BATCH_SIZE)
+            lanes = ML_DSA_SHAKE_X4_BATCH_SIZE;
+
+        /* 16-bit little-endian polynomial counter, written for all lanes. */
+        for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+            const size_t index = kappa + i + b;
+
+            derived_seeds[b][ML_DSA_RHO_PRIME_BYTES] = (uint8_t)index;
+            derived_seeds[b][ML_DSA_RHO_PRIME_BYTES + 1] = (uint8_t)(index >> 8);
+        }
+
+        /* buf_size selects how much of each (maximum-sized) buffer is produced. */
+        ossl_sha3_shake256_x4(buffers[0], buffers[1], buffers[2], buffers[3], buf_size,
+            derived_seeds[0], derived_seeds[1], derived_seeds[2], derived_seeds[3], seed_len);
+
+        /* Decode only the lanes that map to real polynomials. */
+        for (b = 0; b < lanes; b++)
+            ossl_ml_dsa_poly_decode_expand_mask(&out->poly[i + b], buffers[b],
+                buf_size, gamma1);
+    }
+
+}
+
+/*
+ * Rejection-samples up to four bounded polynomials in parallel from
+ * SHAKE256 x4.  Each squeezed byte offers two 4-bit candidates, low nibble
+ * first, which |coef_from_nibble| accepts or rejects.  Lanes [0, count) are
+ * written to outs[lane]; lanes >= count are hashed but never written.
+ * |h_ctx| and |md| are unused.  Always returns 1.
+ */
+static ossl_unused int rej_bounded_poly_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md,
+    COEFF_FROM_NIBBLE_FUNC *coef_from_nibble,
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t seed_len,
+    POLY *outs[ML_DSA_SHAKE_X4_BATCH_SIZE], const size_t count)
+{
+    KECCAK1600_X4_CTX ctx;
+    uint8_t blocks[ML_DSA_SHAKE_X4_BATCH_SIZE][SHAKE256_BLOCKSIZE];
+    int filled[ML_DSA_SHAKE_X4_BATCH_SIZE] = { 0, 0, 0, 0 };
+    size_t finished = 0;
+    size_t lane;
+
+    (void)h_ctx;
+    (void)md;
+
+    for (lane = count; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++)
+        finished |= ((size_t)1 << lane); /* idle lanes start out finished */
+
+    ossl_sha3_shake256_x4_inc_init(&ctx);
+    ossl_sha3_shake256_x4_inc_absorb(&ctx, seeds[0], seeds[1],
+        seeds[2], seeds[3], seed_len);
+    ossl_sha3_shake256_x4_inc_finalize(&ctx);
+
+    while (finished != ML_DSA_SHAKE_X4_DONE_MASK) {
+        ossl_sha3_shake256_x4_inc_squeeze(blocks[0], blocks[1],
+            blocks[2], blocks[3], SHAKE256_BLOCKSIZE, &ctx);
+        for (lane = 0; lane < ML_DSA_SHAKE_X4_BATCH_SIZE; lane++) {
+            const size_t lane_bit = (size_t)1 << lane;
+            const uint8_t *p = blocks[lane];
+            const uint8_t *const stop = p + SHAKE256_BLOCKSIZE;
+
+            if (finished & lane_bit)
+                continue;
+
+            while (p < stop && filled[lane] < ML_DSA_NUM_POLY_COEFFICIENTS) {
+                const uint32_t lo = *p & 0x0F;
+                const uint32_t hi = *p >> 4;
+
+                if (coef_from_nibble(lo, &outs[lane]->coeff[filled[lane]]))
+                    filled[lane]++;
+                /* The high nibble is only tried while there is room left. */
+                if (filled[lane] < ML_DSA_NUM_POLY_COEFFICIENTS
+                    && coef_from_nibble(hi, &outs[lane]->coeff[filled[lane]]))
+                    filled[lane]++;
+                p++;
+            }
+            if (filled[lane] >= ML_DSA_NUM_POLY_COEFFICIENTS)
+                finished |= lane_bit;
+        }
+    }
+    return 1;
+}
+
+/*
+ * Multi-buffer ExpandA: fills the k * l polynomials at out->m_poly in
+ * row-major order, up to four per rej_ntt_poly_mb() call.  Entry (row, col)
+ * is sampled from the seed rho || col || row, one byte each.
+ *
+ * A single loop covers full and partial batches; idle lanes keep a fully
+ * initialised seed and are never written.  Returns 1, or 0 on sampler error.
+ */
+static int matrix_expand_A_mb(EVP_MD_CTX *g_ctx, const EVP_MD *md,
+    const uint8_t *rho, MATRIX *out)
+{
+    size_t b, idx;
+    const size_t total = out->k * out->l;
+    uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_RHO_BYTES + 2];
+    const size_t seed_len = sizeof(derived_seeds[0]);
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    POLY *polys[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    POLY *poly = out->m_poly;
+
+    for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+        memcpy(derived_seeds[b], rho, ML_DSA_RHO_BYTES);
+        /* Define the index bytes up front so idle lanes never hash garbage. */
+        derived_seeds[b][ML_DSA_RHO_BYTES] = 0;
+        derived_seeds[b][ML_DSA_RHO_BYTES + 1] = 0;
+        seeds[b] = derived_seeds[b];
+    }
+
+    for (idx = 0; idx < total; idx += ML_DSA_SHAKE_X4_BATCH_SIZE) {
+        size_t lanes = total - idx;
+
+        if (lanes > ML_DSA_SHAKE_X4_BATCH_SIZE)
+            lanes = ML_DSA_SHAKE_X4_BATCH_SIZE;
+
+        for (b = 0; b < lanes; b++) {
+            const size_t row = (idx + b) / out->l;
+            const size_t col = (idx + b) % out->l;
+
+            derived_seeds[b][ML_DSA_RHO_BYTES] = (uint8_t)col;
+            derived_seeds[b][ML_DSA_RHO_BYTES + 1] = (uint8_t)row;
+            polys[b] = &poly[idx + b];
+        }
+
+        if (!rej_ntt_poly_mb(g_ctx, md, seeds, seed_len, polys, lanes))
+            return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Multi-buffer ExpandS: samples all polynomials of |s1| followed by those of
+ * |s2|, up to four per rej_bounded_poly_mb() call.  Polynomial n (counted
+ * across s1 then s2) uses the seed
+ * seed[0..ML_DSA_PRIV_SEED_BYTES) || LE16(n).  |eta| picks the nibble
+ * acceptance rule: ML_DSA_ETA_4 uses coeff_from_nibble_4, any other value
+ * coeff_from_nibble_2.
+ *
+ * A single loop covers full and partial batches; idle lanes keep a fully
+ * initialised seed and are hashed but never written by the sampler.
+ * Returns 1 on success, or 0 if the sampler fails.
+ */
+static int vector_expand_S_mb(EVP_MD_CTX *h_ctx, const EVP_MD *md, const int eta,
+    const uint8_t *seed, VECTOR *s1, VECTOR *s2)
+{
+    size_t b, idx;
+    const size_t l = s1->num_poly;
+    const size_t total = l + s2->num_poly;
+    uint8_t derived_seeds[ML_DSA_SHAKE_X4_BATCH_SIZE][ML_DSA_PRIV_SEED_BYTES + 2];
+    const uint8_t *seeds[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    const size_t seed_len = sizeof(derived_seeds[0]);
+    POLY *polys[ML_DSA_SHAKE_X4_BATCH_SIZE];
+    COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ? coeff_from_nibble_4 : coeff_from_nibble_2;
+
+    for (b = 0; b < ML_DSA_SHAKE_X4_BATCH_SIZE; b++) {
+        memcpy(derived_seeds[b], seed, ML_DSA_PRIV_SEED_BYTES);
+        /* Define the counter bytes up front so idle lanes never hash garbage. */
+        derived_seeds[b][ML_DSA_PRIV_SEED_BYTES] = 0;
+        derived_seeds[b][ML_DSA_PRIV_SEED_BYTES + 1] = 0;
+        seeds[b] = derived_seeds[b];
+    }
+
+    for (idx = 0; idx < total; idx += ML_DSA_SHAKE_X4_BATCH_SIZE) {
+        size_t lanes = total - idx;
+
+        if (lanes > ML_DSA_SHAKE_X4_BATCH_SIZE)
+            lanes = ML_DSA_SHAKE_X4_BATCH_SIZE;
+
+        for (b = 0; b < lanes; b++) {
+            const size_t poly_idx = idx + b;
+
+            derived_seeds[b][ML_DSA_PRIV_SEED_BYTES] = (uint8_t)(poly_idx);
+            derived_seeds[b][ML_DSA_PRIV_SEED_BYTES + 1] = (uint8_t)(poly_idx >> 8);
+
+            if (poly_idx < l)
+                polys[b] = &s1->poly[poly_idx];
+            else
+                polys[b] = &s2->poly[poly_idx - l];
+        }
+
+        if (!rej_bounded_poly_mb(h_ctx, md, coef_from_nibble_fn,
+                seeds, seed_len, polys, lanes))
+            return 0;
+    }
+
+    return 1;
+}
+
+static const OSSL_ML_DSA_SAMPLE_OPS ml_dsa_sample_x86_64 = { /* 4-way multi-buffer table */
+    matrix_expand_A_mb, /* matrix_expand_A */
+    vector_expand_S_mb, /* vector_expand_S */
+    vector_expand_mask_mb /* vector_expand_mask */
+};
+
+const OSSL_ML_DSA_SAMPLE_OPS *ossl_ml_dsa_sample_x86_64_ops(void)
+{
+    /* Use the x4 table only when the AVX512VL SHA3 check passes at run time. */
+    return SHA3_avx512vl_capable() ? &ml_dsa_sample_x86_64
+                                   : ossl_ml_dsa_sample_generic_ops();
+}
diff --git a/crypto/ml_dsa/ml_dsa_sign.c b/crypto/ml_dsa/ml_dsa_sign.c
index 51c2709ddbaf9..b42323266aad4 100644
--- a/crypto/ml_dsa/ml_dsa_sign.c
+++ b/crypto/ml_dsa/ml_dsa_sign.c
@@ -164,6 +164,7 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
     uint8_t *out_sig)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     const ML_DSA_PARAMS *params = priv->params;
     EVP_MD_CTX *md_ctx = NULL;
     uint32_t k = (uint32_t)params->k, l = (uint32_t)params->l;
@@ -232,7 +233,7 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
     CONSTTIME_SECRET_VECTOR(priv->s2);
     CONSTTIME_SECRET_VECTOR(priv->t0);
 
-    if (!matrix_expand_A(md_ctx, priv->shake128_md, priv->rho, &a_ntt))
+    if (!sample_ops->matrix_expand_A(md_ctx, priv->shake128_md, priv->rho, &a_ntt))
         goto err;
 
     /*
@@ -263,8 +264,8 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
         VECTOR *ct0 = &w1;
         uint32_t z_max, r0_max, ct0_max, h_ones;
 
-        vector_expand_mask(&y, rho_prime, sizeof(rho_prime), (uint32_t)kappa,
-            gamma1, md_ctx, priv->shake256_md);
+        sample_ops->vector_expand_mask(&y, rho_prime, sizeof(rho_prime),
+            (uint32_t)kappa, gamma1, md_ctx, priv->shake256_md);
         vector_copy(y_ntt, &y);
         vector_ntt(y_ntt);
 
@@ -380,6 +381,7 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
     const uint8_t *sig_enc, size_t sig_enc_len)
 {
     int ret = 0;
+    const OSSL_ML_DSA_SAMPLE_OPS *sample_ops = ossl_ml_dsa_sample_ops();
     uint8_t *alloc = NULL, *w1_encoded;
     POLY *p, *c_ntt;
     MATRIX a_ntt;
@@ -428,7 +430,7 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
     vector_init(&ct1_ntt, p + k, k);
 
     if (!ossl_ml_dsa_sig_decode(&sig, sig_enc, sig_enc_len, pub->params)
-        || !matrix_expand_A(md_ctx, pub->shake128_md, pub->rho, &a_ntt))
+        || !sample_ops->matrix_expand_A(md_ctx, pub->shake128_md, pub->rho, &a_ntt))
         goto err;
 
     /* Compute verifiers challenge c_ntt = NTT(SampleInBall(c_tilde)) */
diff --git a/crypto/ml_dsa/ml_dsa_vector.h b/crypto/ml_dsa/ml_dsa_vector.h
index 0693eb6e3c30c..389c0ed045338 100644
--- a/crypto/ml_dsa/ml_dsa_vector.h
+++ b/crypto/ml_dsa/ml_dsa_vector.h
@@ -149,33 +149,6 @@ vector_mult_scalar(const VECTOR *lhs, const POLY *rhs, VECTOR *out)
         ossl_ml_dsa_poly_ntt_mult(lhs->poly + i, rhs, out->poly + i);
 }
 
-static ossl_inline ossl_unused int
-vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
-    const uint8_t *seed, VECTOR *s1, VECTOR *s2)
-{
-    return ossl_ml_dsa_vector_expand_S(h_ctx, md, eta, seed, s1, s2);
-}
-
-static ossl_inline ossl_unused void
-vector_expand_mask(VECTOR *out, const uint8_t *rho_prime, size_t rho_prime_len,
-    uint32_t kappa, uint32_t gamma1,
-    EVP_MD_CTX *h_ctx, const EVP_MD *md)
-{
-    size_t i;
-    uint8_t derived_seed[ML_DSA_RHO_PRIME_BYTES + 2];
-
-    memcpy(derived_seed, rho_prime, ML_DSA_RHO_PRIME_BYTES);
-
-    for (i = 0; i < out->num_poly; i++) {
-        size_t index = kappa + i;
-
-        derived_seed[ML_DSA_RHO_PRIME_BYTES] = index & 0xFF;
-        derived_seed[ML_DSA_RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF;
-        poly_expand_mask(out->poly + i, derived_seed, sizeof(derived_seed),
-            gamma1, h_ctx, md);
-    }
-}
-
 /* Scale back previously rounded value */
 static ossl_inline ossl_unused void
 vector_scale_power2_round_ntt(const VECTOR *in, VECTOR *out)
diff --git a/crypto/sha/asm/keccak1600x4-avx512vl.pl b/crypto/sha/asm/keccak1600x4-avx512vl.pl
new file mode 100755
index 0000000000000..cf52b190407e8
--- /dev/null
+++ b/crypto/sha/asm/keccak1600x4-avx512vl.pl
@@ -0,0 +1,2343 @@
+#!/usr/bin/env perl
+#
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+###############################################################################
+# Keccak x4 AVX512VL SHA3/SHAKE Assembly Routines
+#
+# Description:
+#   This file emits x86_64 assembly for AVX512VL accelerated Keccak-f[1600]
+#   processing of 4 independent states in parallel ("x4").
+#
+#   It provides the core 24-round Keccak permutation and x4 helper routines
+#   used by SHA3 and SHAKE absorb/finalize/squeeze paths. Data from four
+#   input/output lanes is packed across YMM registers so lane-local operations
+#   execute in SIMD.
+#
+###############################################################################
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Treat the target as Win64 when the flavour names nasm/masm/mingw64 or the
+# output file has a .asm extension.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Becomes true below once an assembler new enough for AVX512VL is detected.
+# The assembly routines are only emitted when this ends up > 0.
+$avx512vl = 0;
+
+# Locate the perlasm translator: first next to this script, then in
+# ../../perlasm relative to this script's directory.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Check for AVX512VL support in assembler
+# Probe 1: GNU assembler reached through $ENV{CC}; accept version 2.26 or newer.
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version (\d+)\.(\d+)/) {
+  my ($gas_major, $gas_minor) = ($1, $2);
+  $avx512vl = ($gas_major > 2 || ($gas_major == 2 && $gas_minor >= 26));
+}
+
+# Probe 2: only for Win64 builds that use nasm (per flavour or $ENV{ASM});
+# accept when the captured "major.minor" compares numerically >= 2.12.
+if (!$avx512vl
+  && $win64
+  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
+  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
+{
+  $avx512vl = ($1 >= 2.12);
+}
+
+# Probe 3: clang/LLVM based $ENV{CC}; accept when "major.minor" >= 3.9.
+if (!$avx512vl && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+    $avx512vl = ($2>=3.9);
+}
+
+# Route everything printed to STDOUT through the translator, which is started
+# with the same perl interpreter and given the flavour and output file name.
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Integer argument registers, in System V AMD64 order (1st .. 6th argument).
+$arg1="%rdi";
+$arg2="%rsi";
+$arg3="%rdx";
+$arg4="%rcx";
+$arg5="%r8";
+$arg6="%r9";
+# Scratch registers used by keccak_1600_permute: the round counter and the
+# pointer that walks the round-constant table.
+$roundn="%r13d";
+$tblptr="%r14";
+
+# Define SHAKE rates
+# (values carry the '$' immediate prefix so they can be used as operands)
+$SHAKE128_RATE="\$168";
+$SHAKE256_RATE="\$136";
+
+# Stack frame offsets for SHAKE x4 wrapper functions
+# Layout, relative to the frame base: five 8-byte slots for saved arguments,
+# one 8-byte slot for the state pointer, then the x4 state itself.
+$STATE_SIZE="808";    # (25 * 8 * 4) + 8 = 808 bytes
+$sf_arg1="0";
+$sf_arg2="8";
+$sf_arg3="16";
+$sf_arg4="24";
+$sf_arg5="32";
+$sf_state_ptr="40";
+$sf_state_x4="48";
+$sf_size="856";       # 48 + 808 = 856 bytes
+
+# Return the assembly text for calling a helper from a one-shot wrapper.
+#
+# The single argument is the shim/internal symbol name, for example
+#   SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+#
+# - Win64: the *_internal shim is called, with 32 bytes reserved on the stack
+#   around the call so the shim entry can use xlate-compatible
+#   [rsp+8]/[rsp+16].
+# - Everything else: the public API symbol is called directly; its name is
+#   the argument with the trailing "_internal" removed.
+sub call_internal {
+    my $internal_sym = shift;
+
+    if ($win64) {
+        return <<___;
+    sub     \$32, %rsp
+    call    $internal_sym
+    add     \$32, %rsp
+___
+    }
+
+    (my $public_sym = $internal_sym) =~ s/_internal$//;
+
+    return <<___;
+    call    $public_sym
+___
+}
+
+if ($avx512vl>0) {{{
+
+# AVX512VL feature bit (bit 31 in OPENSSL_ia32cap_P+8)
+my $avx512vl_mask = (1<<31);
+
+$code .= <<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+# SHA3_avx512vl_capable
+# Runtime check of the AVX512VL capability bit.
+# Arguments: none
+# Returns: eax = 0 when the bit is clear in the dword at OPENSSL_ia32cap_P+8,
+#          otherwise the non-zero masked bit value.
+# Clobbers: rcx, flags
+.globl  SHA3_avx512vl_capable
+.type   SHA3_avx512vl_capable,\@abi-omnipotent
+.align 32
+SHA3_avx512vl_capable:
+    mov     OPENSSL_ia32cap_P+8(%rip), %rcx
+    # eax is zeroed before the and, so the flags read by cmovnz are the and's
+    xor     %eax, %eax
+    and     \$$avx512vl_mask, %ecx
+    cmovnz  %ecx, %eax
+    ret
+.size   SHA3_avx512vl_capable, .-SHA3_avx512vl_capable
+___
+
+$code.=<<___;
+.text
+
+# Perform Keccak permutation
+#
+# YMM registers 0 to 24 are used as Keccak state registers.
+# This function, as is, can work on 1 to 4 independent states at the same time.
+#
+# There is no clear boundary between Theta, Rho, Pi, Chi and Iota steps.
+# Instructions corresponding to these steps overlap for better efficiency.
+#
+# The symbol is not exported with .globl; it is reached by call from the
+# other routines in this file.
+#
+# Arguments:
+# ymm0-ymm24    [in/out]    Keccak state registers (one SIMD per one state register)
+# ymm25-ymm31   [clobbered] temporary SIMD registers
+# $roundn       [clobbered] used for round tracking
+# $tblptr       [clobbered] used for access to SHA3 constant table
+.type keccak_1600_permute,\@abi-omnipotent
+.align  32
+keccak_1600_permute:
+.cfi_startproc
+    mov     \$24, $roundn        # 24 rounds
+    lea     iotas(%rip), $tblptr # Load the address of the SHA3 round constants
+
+.align  32
+.Lkeccak_rnd_loop:
+    # Theta step
+
+    # Compute column parities
+    # C[5] = [0, 0, 0, 0, 0]
+    # for x in 0 to 4:
+    #     C[x] = state[x][0] XOR state[x][1] XOR state[x][2] XOR state[x][3] XOR state[x][4]
+    #
+    # vpternlogq with immediate 0x96 is a three-input XOR.
+    # C[0]..C[4] are accumulated in ymm25..ymm29 respectively.
+
+    vmovdqa64   %ymm0, %ymm25
+    vpternlogq  \$0x96, %ymm5, %ymm10, %ymm25
+    vmovdqa64   %ymm1, %ymm26
+    vpternlogq  \$0x96, %ymm11, %ymm6, %ymm26
+    vmovdqa64   %ymm2, %ymm27
+    vpternlogq  \$0x96, %ymm12, %ymm7, %ymm27
+
+    vmovdqa64   %ymm3, %ymm28
+    vpternlogq  \$0x96, %ymm13, %ymm8, %ymm28
+    vmovdqa64   %ymm4, %ymm29
+    vpternlogq  \$0x96, %ymm14, %ymm9, %ymm29
+    vpternlogq  \$0x96, %ymm20, %ymm15, %ymm25
+
+    vpternlogq  \$0x96, %ymm21, %ymm16, %ymm26
+    vpternlogq  \$0x96, %ymm22, %ymm17, %ymm27
+    vpternlogq  \$0x96, %ymm23, %ymm18, %ymm28
+
+    # Start computing D values and keep computing column parity
+    # D[5] = [0, 0, 0, 0, 0]
+    # for x in 0 to 4:
+    #     D[x] = C[(x+4) mod 5] XOR ROTATE_LEFT(C[(x+1) mod 5], 1)
+    #
+    # ymm30 and ymm31 are reused in turn for the rotated C values.
+
+    vprolq      \$1, %ymm26, %ymm30
+    vprolq      \$1, %ymm27, %ymm31
+    vpternlogq  \$0x96, %ymm24, %ymm19, %ymm29
+
+    # Continue computing D values and apply Theta
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = state[x][y] XOR D[x]
+    #
+    # D[x] is never materialised: each state register is XORed with both of
+    # its terms in a single three-input XOR.
+
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm0
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm10
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm20
+
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm5
+    vpternlogq  \$0x96, %ymm30, %ymm29, %ymm15
+    vprolq      \$1, %ymm28, %ymm30
+
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm6
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm16
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm1
+
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm11
+    vpternlogq  \$0x96, %ymm31, %ymm25, %ymm21
+    vprolq      \$1, %ymm29, %ymm31
+
+    # ymm29 (C[4]) has no further readers in this round, so it is reused below
+    vpbroadcastq    ($tblptr), %ymm29 # Load the round constant into ymm29 (Iota)
+    add         \$8, $tblptr          # Increment the pointer to the next round constant
+
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm12
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm7
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm22
+
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm17
+    vpternlogq  \$0x96, %ymm30, %ymm26, %ymm2
+    vprolq      \$1, %ymm25, %ymm30
+
+    # Rho step
+    # Keep applying Theta and start Rho step
+    #
+    # ROTATION_OFFSETS[5][5] = [
+    #     [0, 1, 62, 28, 27],
+    #     [36, 44, 6, 55, 20],
+    #     [3, 10, 43, 25, 39],
+    #     [41, 45, 15, 21, 8],
+    #     [18, 2, 61, 56, 14] ]
+    #
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = ROTATE_LEFT(state[x][y], ROTATION_OFFSETS[x][y])
+
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm3
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm13
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm23
+
+    vprolq      \$44, %ymm6, %ymm6
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm18
+    vpternlogq  \$0x96, %ymm31, %ymm27, %ymm8
+
+    vprolq      \$43, %ymm12, %ymm12
+    vprolq      \$21, %ymm18, %ymm18
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm24
+
+    vprolq      \$14, %ymm24, %ymm24
+    vprolq      \$28, %ymm3, %ymm3
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm9
+
+    vprolq      \$20, %ymm9, %ymm9
+    vprolq      \$3, %ymm10, %ymm10
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm19
+
+    vprolq      \$45, %ymm16, %ymm16
+    vprolq      \$61, %ymm22, %ymm22
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm4
+
+    vprolq      \$1, %ymm1, %ymm1
+    vprolq      \$6, %ymm7, %ymm7
+    vpternlogq  \$0x96, %ymm30, %ymm28, %ymm14
+
+    # Continue with Rho and start Pi and Chi steps at the same time
+    # Ternary logic 0xD2 is used for Chi step
+    #
+    # for x in 0 to 4:
+    #     for y in 0 to 4:
+    #         state[x][y] = state[x][y] XOR ((NOT state[(x+1) mod 5][y]) AND state[(x+2) mod 5][y])
+    #
+    # Each group of five registers is updated in place. The first two results
+    # are built in ymm30/ymm31 because their old values are still needed as
+    # inputs for the last two updates of the group.
+
+    vprolq      \$25, %ymm13, %ymm13
+    vprolq      \$8, %ymm19, %ymm19
+    vmovdqa64   %ymm0, %ymm30
+    vpternlogq  \$0xD2, %ymm12, %ymm6, %ymm30
+
+    vprolq      \$18, %ymm20, %ymm20
+    vprolq      \$27, %ymm4, %ymm4
+    vpxorq      %ymm29, %ymm30, %ymm30 # Iota step
+
+    vprolq      \$36, %ymm5, %ymm5
+    vprolq      \$10, %ymm11, %ymm11
+    vmovdqa64   %ymm6, %ymm31
+    vpternlogq  \$0xD2, %ymm18, %ymm12, %ymm31
+
+    vprolq      \$15, %ymm17, %ymm17
+    vprolq      \$56, %ymm23, %ymm23
+    vpternlogq  \$0xD2, %ymm24, %ymm18, %ymm12
+
+    vprolq      \$62, %ymm2, %ymm2
+    vprolq      \$55, %ymm8, %ymm8
+    vpternlogq  \$0xD2, %ymm0, %ymm24, %ymm18
+
+    vprolq      \$39, %ymm14, %ymm14
+    vprolq      \$41, %ymm15, %ymm15
+    vpternlogq  \$0xD2, %ymm6, %ymm0, %ymm24
+    vmovdqa64   %ymm30, %ymm0
+    vmovdqa64   %ymm31, %ymm6
+
+    vprolq      \$2, %ymm21, %ymm21
+    vmovdqa64   %ymm3, %ymm30
+    vpternlogq  \$0xD2, %ymm10, %ymm9, %ymm30
+    vmovdqa64   %ymm9, %ymm31
+    vpternlogq  \$0xD2, %ymm16, %ymm10, %ymm31
+
+    vpternlogq  \$0xD2, %ymm22, %ymm16, %ymm10
+    vpternlogq  \$0xD2, %ymm3, %ymm22, %ymm16
+    vpternlogq  \$0xD2, %ymm9, %ymm3, %ymm22
+    vmovdqa64   %ymm30, %ymm3
+    vmovdqa64   %ymm31, %ymm9
+
+    vmovdqa64   %ymm1, %ymm30
+    vpternlogq  \$0xD2, %ymm13, %ymm7, %ymm30
+    vmovdqa64   %ymm7, %ymm31
+    vpternlogq  \$0xD2, %ymm19, %ymm13, %ymm31
+    vpternlogq  \$0xD2, %ymm20, %ymm19, %ymm13
+
+    vpternlogq  \$0xD2, %ymm1, %ymm20, %ymm19
+    vpternlogq  \$0xD2, %ymm7, %ymm1, %ymm20
+    vmovdqa64   %ymm30, %ymm1
+    vmovdqa64   %ymm31, %ymm7
+    vmovdqa64   %ymm4, %ymm30
+    vpternlogq  \$0xD2, %ymm11, %ymm5, %ymm30
+
+    vmovdqa64   %ymm5, %ymm31
+    vpternlogq  \$0xD2, %ymm17, %ymm11, %ymm31
+    vpternlogq  \$0xD2, %ymm23, %ymm17, %ymm11
+    vpternlogq  \$0xD2, %ymm4, %ymm23, %ymm17
+
+    vpternlogq  \$0xD2, %ymm5, %ymm4, %ymm23
+    vmovdqa64   %ymm30, %ymm4
+    vmovdqa64   %ymm31, %ymm5
+    vmovdqa64   %ymm2, %ymm30
+    vpternlogq  \$0xD2, %ymm14, %ymm8, %ymm30
+    vmovdqa64   %ymm8, %ymm31
+    vpternlogq  \$0xD2, %ymm15, %ymm14, %ymm31
+
+    vpternlogq  \$0xD2, %ymm21, %ymm15, %ymm14
+    vpternlogq  \$0xD2, %ymm2, %ymm21, %ymm15
+    vpternlogq  \$0xD2, %ymm8, %ymm2, %ymm21
+    vmovdqa64   %ymm30, %ymm2
+    vmovdqa64   %ymm31, %ymm8
+
+    # Complete the steps and get updated state registers in ymm0 to ymm24
+    # (a chain of register moves; ymm30 carries the old ymm3 to the last move)
+    vmovdqa64   %ymm3,  %ymm30
+    vmovdqa64   %ymm18, %ymm3
+    vmovdqa64   %ymm17, %ymm18
+    vmovdqa64   %ymm11, %ymm17
+    vmovdqa64   %ymm7,  %ymm11
+    vmovdqa64   %ymm10, %ymm7
+    vmovdqa64   %ymm1,  %ymm10
+    vmovdqa64   %ymm6,  %ymm1
+    vmovdqa64   %ymm9,  %ymm6
+    vmovdqa64   %ymm22, %ymm9
+    vmovdqa64   %ymm14, %ymm22
+    vmovdqa64   %ymm20, %ymm14
+    vmovdqa64   %ymm2,  %ymm20
+    vmovdqa64   %ymm12, %ymm2
+    vmovdqa64   %ymm13, %ymm12
+    vmovdqa64   %ymm19, %ymm13
+    vmovdqa64   %ymm23, %ymm19
+    vmovdqa64   %ymm15, %ymm23
+    vmovdqa64   %ymm4,  %ymm15
+    vmovdqa64   %ymm24, %ymm4
+    vmovdqa64   %ymm21, %ymm24
+    vmovdqa64   %ymm8,  %ymm21
+    vmovdqa64   %ymm16, %ymm8
+    vmovdqa64   %ymm5,  %ymm16
+    vmovdqa64   %ymm30, %ymm5
+
+    dec         $roundn           # Decrement the round counter
+    jnz         .Lkeccak_rnd_loop # Jump to the start of the loop if r13d is not zero
+    ret
+.cfi_endproc
+.size   keccak_1600_permute,.-keccak_1600_permute
+
+# keccak_1600_init_state
+# Set the 25 Keccak state registers ymm0-ymm24 to zero.
+# Arguments: none
+# ymm0 is cleared with an XOR and then copied to each of ymm1-ymm24.
+.globl  keccak_1600_init_state
+.type   keccak_1600_init_state,\@abi-omnipotent
+.align  32
+keccak_1600_init_state:
+.cfi_startproc
+    vpxorq      %ymm0, %ymm0, %ymm0
+___
+
+# Emit one register copy from the zeroed ymm0 for each of ymm1..ymm24.
+for (my $reg = 1; $reg <= 24; $reg++) {
+    $code .= "    vmovdqa64   %ymm0, %ymm$reg\n";
+}
+
+$code.=<<___;
+    ret
+.cfi_endproc
+.size   keccak_1600_init_state,.-keccak_1600_init_state
+
+# keccak_1600_load_state_x4
+# Load the x4 Keccak state from memory into ymm0-ymm24.
+# Arguments:
+#   arg1 (rdi): pointer to the state; row N (32 bytes) goes to ymmN
+.globl  keccak_1600_load_state_x4
+.type   keccak_1600_load_state_x4,\@abi-omnipotent
+.align  32
+keccak_1600_load_state_x4:
+.cfi_startproc
+___
+
+# Emit one unaligned 32-byte load per state row, rows 0..24.
+for (my $row = 0; $row < 25; $row++) {
+    $code .= "    vmovdqu64   32*$row($arg1), %ymm$row\n";
+}
+
+$code.=<<___;
+    ret
+.cfi_endproc
+.size   keccak_1600_load_state_x4,.-keccak_1600_load_state_x4
+
+
+# keccak_1600_save_state_x4
+# Store ymm0-ymm24 to the x4 Keccak state in memory.
+# Arguments:
+#   arg1 (rdi): pointer to the state; ymmN is written to row N (32 bytes)
+.globl  keccak_1600_save_state_x4
+.type   keccak_1600_save_state_x4,\@abi-omnipotent
+.align  32
+keccak_1600_save_state_x4:
+.cfi_startproc
+___
+
+# Emit one unaligned 32-byte store per state row, rows 0..24.
+for (my $row = 0; $row < 25; $row++) {
+    $code .= "    vmovdqu64   %ymm$row, 32*$row($arg1)\n";
+}
+
+$code.=<<___;
+    ret
+.cfi_endproc
+.size   keccak_1600_save_state_x4,.-keccak_1600_save_state_x4
+
+
+# Add input data to state when message length is less than rate
+#
+# The byte offset at which to start adding is read from the qword at
+# 8*100(r10), i.e. s[100]. This routine does not write s[100] back.
+# Work is done in up to three phases: bytes up to the next 8-byte boundary
+# (masked), whole 8-byte words for all four lanes at once, and a masked tail.
+#
+# Arguments:
+#   r10:        state pointer to absorb into (clobbered)
+#   arg2 (rsi): message pointer lane 0 (updated on output)
+#   arg3 (rdx): message pointer lane 1 (updated on output)
+#   arg4 (rcx): message pointer lane 2 (updated on output)
+#   arg5 (r8):  message pointer lane 3 (updated on output)
+#   r12:        length in bytes (clobbered on output)
+# Clobbers: r9, rbx, r15, k1, ymm31-ymm29
+.globl  keccak_1600_partial_add_x4
+.type   keccak_1600_partial_add_x4,\@abi-omnipotent
+.align  32
+keccak_1600_partial_add_x4:
+.cfi_startproc
+    mov     8*100(%r10), %r9
+    test    \$7, %r9d
+    jz      .Lstart_aligned_to_4x8
+
+    # Start offset is not aligned to register size
+    mov     %r9, %r15 # %r15 = s[100]
+
+    and     \$7, %r9d
+    neg     %r9d
+    add     \$8, %r9d     # register capacity = 8 - (offset % 8)
+    cmp     %r9d, %r12d
+    cmovnae   %r12d, %r9d # %r9d = min(register capacity, length)
+
+    lea     byte_kmask_0_to_7(%rip), %rbx
+    kmovb   (%rbx,%r9), %k1 # message load mask
+
+    # Each 8-byte word of a lane occupies one 32-byte row, hence the *4 scale
+    mov     %r15, %rbx
+    and     \$~7, %ebx
+    lea     (%r10,%rbx,4), %r10 # get to state starting register
+
+    mov     %r15, %rbx
+    and     \$7, %ebx
+
+    vmovdqu8    (%r10), %ymm31 # load & store / allocate SB for the register
+    vmovdqu8    %ymm31, (%r10)
+
+    vmovdqu8    ($arg2), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 0
+    vmovdqu8    8*0(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 0
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*0(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 0
+
+    vmovdqu8    ($arg3), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 1
+    vmovdqu8    8*1(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 1
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*1(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 1
+
+    vmovdqu8    ($arg4), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 2
+    vmovdqu8    8*2(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 2
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*2(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 2
+
+    vmovdqu8    ($arg5), %xmm31{%k1}{z}        # Read 1 to 7 bytes from lane 3
+    vmovdqu8    8*3(%r10,%rbx), %xmm30{%k1}{z} # Read 1 to 7 bytes from state reg lane 3
+    vpxorq      %xmm30, %xmm31, %xmm31
+    vmovdqu8    %xmm31, 8*3(%r10,%rbx){%k1}    # Write 1 to 7 bytes to state reg lane 3
+
+    # r9 still holds the byte count just consumed; .Lzero_bytes adds it to
+    # the message pointers when nothing is left
+    sub     %r9, %r12
+    jz      .Lzero_bytes
+
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+    add     %r9, $arg5
+    add     \$32, %r10
+    xor     %r9, %r9
+    jmp     .Lymm_loop
+
+.Lstart_aligned_to_4x8:
+    lea     (%r10,%r9,4), %r10
+    xor     %r9, %r9
+
+    # From here r9 is the running message offset; the state row is addressed
+    # as r10 + 4*r9
+.align  32
+.Lymm_loop:
+    cmp     \$8, %r12d
+    jb      .Llt_8_bytes
+
+    vmovq       ($arg2,%r9), %xmm31              # Read 8 bytes from lane 0
+    vpinsrq     \$1, ($arg3,%r9), %xmm31, %xmm31 # Read 8 bytes from lane 1
+    vmovq       ($arg4,%r9), %xmm30              # Read 8 bytes from lane 2
+    vpinsrq     \$1, ($arg5,%r9),%xmm30, %xmm30  # Read 8 bytes from lane 3
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+    vpxorq      (%r10,%r9,4), %ymm31, %ymm31     # Add data with the state
+    vmovdqu64   %ymm31, (%r10,%r9,4)
+    add     \$8, %r9
+    sub     \$8, %r12
+    jz      .Lzero_bytes
+
+    jmp     .Lymm_loop
+
+.align  32
+.Lzero_bytes:
+    # All bytes consumed: advance the message pointers by the offset in r9
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+    add     %r9, $arg5
+    ret
+
+.align  32
+.Llt_8_bytes:
+    # Fewer than 8 bytes left: fold the running offset into the pointers,
+    # then handle the tail with a byte mask selected by r12
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+    add     %r9, $arg5
+    lea     (%r10,%r9,4), %r10
+
+    lea     byte_kmask_0_to_7(%rip), %rbx
+    kmovb   (%rbx,%r12), %k1 # message load mask
+
+    vmovdqu8    ($arg2), %xmm31{%k1}{z} # Read 1 to 7 bytes from lane 0
+    vmovdqu8    ($arg3), %xmm30{%k1}{z} # Read 1 to 7 bytes from lane 1
+    vpunpcklqdq %xmm30, %xmm31, %xmm31  # Interleave data from lane 0 and lane 1
+    vmovdqu8    ($arg4), %xmm30{%k1}{z} # Read 1 to 7 bytes from lane 2
+    vmovdqu8    ($arg5), %xmm29{%k1}{z} # Read 1 to 7 bytes from lane 3
+    vpunpcklqdq %xmm29, %xmm30, %xmm30  # Interleave data from lane 2 and lane 3
+    vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+
+    vpxorq      (%r10), %ymm31, %ymm31 # Add data to the state
+    vmovdqu64   %ymm31, (%r10)         # Update state in memory
+
+    add     %r12, $arg2 # increment message pointer lane 0
+    add     %r12, $arg3 # increment message pointer lane 1
+    add     %r12, $arg4 # increment message pointer lane 2
+    add     %r12, $arg5 # increment message pointer lane 3
+    ret
+.cfi_endproc
+.size   keccak_1600_partial_add_x4,.-keccak_1600_partial_add_x4
+
+
+# Extract bytes from state and write to outputs
+#
+# Returns immediately when the length is zero. Otherwise copies, in order:
+# the bytes up to the next 8-byte boundary of the offset (masked), whole
+# 8-byte words for all four lanes, and a masked tail of fewer than 8 bytes.
+#
+# Arguments:
+#   r10:        state pointer to start extracting from (clobbered)
+#   arg1 (rdi): output pointer lane 0 (updated on output)
+#   arg2 (rsi): output pointer lane 1 (updated on output)
+#   arg3 (rdx): output pointer lane 2 (updated on output)
+#   arg4 (rcx): output pointer lane 3 (updated on output)
+#   r12:        length in bytes (clobbered on output)
+#   r11:        state offset to start extract from
+# Clobbers: r9, rbx, k1, xmm30, xmm31
+.globl  keccak_1600_extract_bytes_x4
+.type   keccak_1600_extract_bytes_x4,\@abi-omnipotent
+.align  32
+keccak_1600_extract_bytes_x4:
+.cfi_startproc
+    or      %r12, %r12
+    jz      .Lextract_zero_bytes
+
+    test    \$7, %r11d
+    jz      .Lextract_start_aligned_to_4x8
+
+    # Extract offset is not aligned to the register size (8 bytes)
+    mov     %r11, %r9
+
+    and     \$7, %r9d
+    neg     %r9d
+    add     \$8, %r9d     # register capacity = 8 - (offset % 8)
+    cmp     %r9d, %r12d
+    cmovnae   %r12d, %r9d # %r9d = min(register capacity, length)
+
+    lea     byte_kmask_0_to_7(%rip), %rbx
+    kmovb   (%rbx,%r9), %k1 # message store mask
+
+    # Each 8-byte word of a lane occupies one 32-byte row, hence the *4 scale
+    mov     %r11, %rbx
+    and     \$~7, %ebx
+    lea     (%r10,%rbx,4), %r10 # get to state starting register
+
+    mov     %r11, %rbx
+    and     \$7, %ebx
+
+    vmovdqu8    8*0(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 0
+    vmovdqu8    %xmm31, ($arg1){%k1}           # Write 1-7 bytes to lane 0 output
+
+    vmovdqu8    8*1(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 1
+    vmovdqu8    %xmm31, ($arg2){%k1}           # Write 1-7 bytes to lane 1 output
+
+    vmovdqu8    8*2(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 2
+    vmovdqu8    %xmm31, ($arg3){%k1}           # Write 1-7 bytes to lane 2 output
+
+    vmovdqu8    8*3(%r10,%rbx), %xmm31{%k1}{z} # Read 1-7 bytes from state reg lane 3
+    vmovdqu8    %xmm31, ($arg4){%k1}           # Write 1-7 bytes to lane 3 output
+
+    # Increment output registers
+    add     %r9, $arg1
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+
+    # Decrement length to extract
+    sub     %r9, %r12
+    jz      .Lextract_zero_bytes
+
+    # More data to extract, update state register pointer
+    add     \$32, %r10
+    xor     %r9, %r9
+    jmp     .Lextract_ymm_loop
+
+.Lextract_start_aligned_to_4x8:
+        lea     (%r10,%r11,4), %r10
+        xor     %r9, %r9
+
+    # From here r9 is the running output offset and r10 advances one 32-byte
+    # row per iteration
+.align  32
+.Lextract_ymm_loop:
+    cmp     \$8, %r12
+    jb      .Lextract_lt_8_bytes
+
+    # xmm31 = lanes 0 and 1 of the row, xmm30 = lanes 2 and 3
+    vmovdqu64   (%r10), %xmm31
+    vmovdqu64   16(%r10), %xmm30
+    vmovq       %xmm31, ($arg1,%r9)
+    vpextrq     \$1, %xmm31, ($arg2,%r9)
+    vmovq       %xmm30, ($arg3,%r9)
+    vpextrq     \$1, %xmm30, ($arg4,%r9)
+    add     \$8, %r9
+    sub     \$8, %r12
+    jz      .Lzero_bytes_left
+
+    add     \$32, %r10
+    jmp     .Lextract_ymm_loop
+
+.align  32
+.Lzero_bytes_left:
+    # Increment output pointers
+    add     %r9, $arg1
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+.Lextract_zero_bytes:
+    ret
+
+.align  32
+.Lextract_lt_8_bytes:
+    # Fold the running offset into the output pointers before the masked tail
+    add     %r9, $arg1
+    add     %r9, $arg2
+    add     %r9, $arg3
+    add     %r9, $arg4
+
+    lea     byte_kmask_0_to_7(%rip), %r9
+    kmovb   (%r9,%r12), %k1 # k1 is the mask of output bytes to write
+
+    vmovq       0*8(%r10), %xmm31    # Read 8 bytes from state lane 0
+    vmovdqu8    %xmm31, ($arg1){%k1} # Extract 1-7 bytes into output 0
+    vmovq       1*8(%r10), %xmm31    # Read 8 bytes from state lane 1
+    vmovdqu8    %xmm31, ($arg2){%k1} # Extract 1-7 bytes into output 1
+    vmovq       2*8(%r10), %xmm31    # Read 8 bytes from state lane 2
+    vmovdqu8    %xmm31, ($arg3){%k1} # Extract 1-7 bytes into output 2
+    vmovq       3*8(%r10), %xmm31    # Read 8 bytes from state lane 3
+    vmovdqu8    %xmm31, ($arg4){%k1} # Extract 1-7 bytes into output 3
+
+    # Increment output pointers
+    add     %r12, $arg1
+    add     %r12, $arg2
+    add     %r12, $arg3
+    add     %r12, $arg4
+    ret
+.cfi_endproc
+.size   keccak_1600_extract_bytes_x4,.-keccak_1600_extract_bytes_x4
+
+
+# SHAKE128 x4 multi-buffer functions
+# These functions process 4 independent SHAKE128 streams in parallel using AVX-512VL
+# State layout: 25 ymm-sized rows (32 bytes each) = 800 bytes, + 1 qword = 808 bytes per context
+# Rate: 168 bytes for SHAKE128
+
+# SHA3_shake128_x4_avx512vl
+# One-shot SHAKE-128 x4 function: init + absorb + finalize + squeeze
+# The x4 state is kept in a temporary buffer on this function's stack frame
+# and that buffer is zeroed again before returning.
+# Arguments:
+#   arg1 (rdi): pointer to output lane 0
+#   arg2 (rsi): pointer to output lane 1
+#   arg3 (rdx): pointer to output lane 2
+#   arg4 (rcx): pointer to output lane 3
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to input lane 0
+#   [stack+0]:  pointer to input lane 1
+#   [stack+8]:  pointer to input lane 2
+#   [stack+16]: pointer to input lane 3
+#   [stack+24]: input length in bytes (must be same for all lanes)
+# Returns: void
+.globl  SHA3_shake128_x4_avx512vl
+.type   SHA3_shake128_x4_avx512vl,\@function,10
+.align  32
+SHA3_shake128_x4_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    mov     %rsp, %rbp
+    push    %rbx
+.cfi_push       %rbx
+___
+# Win64 only: save xmm6-xmm15 in a 160-byte area below the pushed registers.
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+    # Allocate the local frame; rbx is its base for the rest of the function
+    sub     \$$sf_size, %rsp
+    mov     %rsp, %rbx
+
+.Lshake128_x4_body:
+    # Save the four output pointers and the output length for the squeeze
+    mov     $arg1, $sf_arg1(%rbx)
+    mov     $arg2, $sf_arg2(%rbx)
+    mov     $arg3, $sf_arg3(%rbx)
+    mov     $arg4, $sf_arg4(%rbx)
+    mov     $arg5, $sf_arg5(%rbx)
+
+    lea     $sf_state_x4(%rbx), $arg1 # start of x4 state on the stack frame
+    mov     $arg1, $sf_state_ptr(%rbx)
+
+    # Initialize the state array to zero
+    call    keccak_1600_init_state
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+
+    # Set up the absorb arguments: state, four input pointers, input length
+    mov     $sf_state_ptr(%rbx), $arg1
+    mov     $arg6, $arg2
+___
+$code .= <<___ if ($win64);
+    # xlate prologue handles up to six arguments. For one-shot x4 wrappers
+    # (10 args), the remaining four stay in Win64 stack slots.
+    mov     64(%rbp), $arg3 # arg7 from stack
+    mov     72(%rbp), $arg4 # arg8 from stack
+    mov     80(%rbp), $arg5 # arg9 from stack
+    mov     88(%rbp), $arg6 # arg10 from stack
+___
+$code .= <<___ if (!$win64);
+    mov     16(%rbp), $arg3 # arg7 from stack
+    mov     24(%rbp), $arg4 # arg8 from stack
+    mov     32(%rbp), $arg5 # arg9 from stack
+    mov     40(%rbp), $arg6 # arg10 from stack
+___
+$code.=<<___;
+    # Internal entry avoids Win64 xlate prologue argument remapping.
+___
+$code .= call_internal("SHA3_shake128_x4_inc_absorb_avx512vl_internal");
+$code.=<<___;
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    call    .L_SHA3_shake128_x4_inc_finalize_avx512vl
+
+    # squeeze
+    # arguments: the four saved output pointers, the output length, the state
+    mov     $sf_arg1(%rbx), $arg1
+    mov     $sf_arg2(%rbx), $arg2
+    mov     $sf_arg3(%rbx), $arg3
+    mov     $sf_arg4(%rbx), $arg4
+    mov     $sf_arg5(%rbx), $arg5
+    mov     $sf_state_ptr(%rbx), $arg6
+___
+$code .= call_internal("SHA3_shake128_x4_inc_squeeze_avx512vl_internal");
+$code.=<<___;
+
+    # Clear the temporary buffer
+    # (25 rows of 32 bytes, then the trailing qword: 808 bytes in total)
+    lea     $sf_state_x4(%rbx), %r9
+    vpxorq      %ymm31, %ymm31, %ymm31
+    vmovdqu64   %ymm31, 32*0(%r9)
+    vmovdqu64   %ymm31, 32*1(%r9)
+    vmovdqu64   %ymm31, 32*2(%r9)
+    vmovdqu64   %ymm31, 32*3(%r9)
+    vmovdqu64   %ymm31, 32*4(%r9)
+    vmovdqu64   %ymm31, 32*5(%r9)
+    vmovdqu64   %ymm31, 32*6(%r9)
+    vmovdqu64   %ymm31, 32*7(%r9)
+    vmovdqu64   %ymm31, 32*8(%r9)
+    vmovdqu64   %ymm31, 32*9(%r9)
+    vmovdqu64   %ymm31, 32*10(%r9)
+    vmovdqu64   %ymm31, 32*11(%r9)
+    vmovdqu64   %ymm31, 32*12(%r9)
+    vmovdqu64   %ymm31, 32*13(%r9)
+    vmovdqu64   %ymm31, 32*14(%r9)
+    vmovdqu64   %ymm31, 32*15(%r9)
+    vmovdqu64   %ymm31, 32*16(%r9)
+    vmovdqu64   %ymm31, 32*17(%r9)
+    vmovdqu64   %ymm31, 32*18(%r9)
+    vmovdqu64   %ymm31, 32*19(%r9)
+    vmovdqu64   %ymm31, 32*20(%r9)
+    vmovdqu64   %ymm31, 32*21(%r9)
+    vmovdqu64   %ymm31, 32*22(%r9)
+    vmovdqu64   %ymm31, 32*23(%r9)
+    vmovdqu64   %ymm31, 32*24(%r9)
+    vmovq       %xmm31, 32*25(%r9)
+
+.Lshake128_x4_epilogue:
+___
+# Win64 only: restore xmm6-xmm15. The save area sits above the local frame,
+# so it is addressed at frame-size + offset from the current rsp.
+$code .= <<___ if ($win64);
+    vmovups $sf_size+0(%rsp),   %xmm6
+    vmovups $sf_size+16(%rsp),  %xmm7
+    vmovups $sf_size+32(%rsp),  %xmm8
+    vmovups $sf_size+48(%rsp),  %xmm9
+    vmovups $sf_size+64(%rsp),  %xmm10
+    vmovups $sf_size+80(%rsp),  %xmm11
+    vmovups $sf_size+96(%rsp),  %xmm12
+    vmovups $sf_size+112(%rsp), %xmm13
+    vmovups $sf_size+128(%rsp), %xmm14
+    vmovups $sf_size+144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+    add     \$$sf_size, %rsp
+    pop     %rbx
+.cfi_pop        %rbx
+    pop     %rbp
+.cfi_pop        %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_avx512vl,.-SHA3_shake128_x4_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for absorb entry. It establishes xlate-compatible
+# unwind state and then jumps to the function entry after the prologue.
+# This is required for internal calls since the xlate ABI conversion
+# is already done in the caller function.
+# Arguments are expected to be already in arg1..arg6 (rdi, rsi, rdx, rcx,
+# r8, r9); none of them is changed here.
+.type   SHA3_shake128_x4_inc_absorb_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+    # rax takes the entry rsp; rdi/rsi go into the 8(rsp)/16(rsp) slots,
+    # presumably the slots the xlate-generated epilogue reloads them from
+    # (TODO confirm against x86_64-xlate.pl)
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake128_x4_inc_absorb_avx512vl
+.LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+.size   SHA3_shake128_x4_inc_absorb_avx512vl_internal,.-SHA3_shake128_x4_inc_absorb_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake128_x4_inc_absorb_avx512vl
+# Absorb input data into 4 parallel SHAKE128 states
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes; qword 100 holds the block offset s[100])
+#   arg2 (rsi): pointer to lane 0 input data
+#   arg3 (rdx): pointer to lane 1 input data
+#   arg4 (rcx): pointer to lane 2 input data
+#   arg5 (r8):  pointer to lane 3 input data
+#   arg6 (r9):  input length in bytes (must be same for all lanes)
+# Returns: void
+# Note: Input is XORed into state and Keccak permutation is applied for each rate-sized block
+.globl  SHA3_shake128_x4_inc_absorb_avx512vl
+.type   SHA3_shake128_x4_inc_absorb_avx512vl,\@function,6
+.align  32
+SHA3_shake128_x4_inc_absorb_avx512vl:
+.L_SHA3_shake128_x4_inc_absorb_avx512vl:
+.cfi_startproc
+        push    %rbp
+.cfi_push       %rbp
+        push    %rbx
+.cfi_push       %rbx
+        push    %r12
+.cfi_push       %r12
+        push    %r13
+.cfi_push       %r13
+        push    %r14
+.cfi_push       %r14
+        push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake128_absorb_body:
+    # check for partially processed block
+    mov     8*100($arg1), %r14 # %r14 = s[100], bytes already absorbed into the current block
+    or      %r14, %r14 # s[100] == 0?
+    je      .Lshake128_absorb_main_loop_start
+
+    # process remaining bytes if message long enough
+    mov     \$168, %r12 # SHAKE128_RATE = 168
+    sub     %r14, %r12  # %r12 = room left in the current block
+
+    cmp     %r12, $arg6 # mlen <= room: the input fits in the current block
+    jbe     .Lshake128_absorb_skip_permute
+
+    sub     %r12, $arg6
+    mov     $arg6, %r11 # preserve remaining length across helper calls
+
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10                # %r10 = state
+    call    keccak_1600_partial_add_x4 # arg2-arg5 are updated
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake128_absorb_partial_block_done
+
+.Lshake128_absorb_skip_permute:
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10
+    mov     $arg6, %r12
+    mov     $arg6, %r11 # preserve input length across helper call
+    call    keccak_1600_partial_add_x4
+
+    lea     (%r11,%r14), %r15
+    mov     %r15, 8*100($arg1) # s[100] += inlen
+
+    cmp     \$168, %r15 # block still not full? then there is nothing to permute
+    jb      .Lshake128_absorb_exit
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake128_absorb_exit
+
+.Lshake128_absorb_main_loop_start:
+    call    keccak_1600_load_state_x4
+    mov     $arg6, %r11 # full input length when no prior partial block
+
+.Lshake128_absorb_partial_block_done:
+    xor     %r12, %r12  # zero message offset
+
+    # Process the input message in blocks
+.align  32
+.Lshake128_absorb_while_loop:
+    cmp     \$168, %r11 # compare mlen to SHAKE128_RATE
+    jb      .Lshake128_absorb_while_loop_done
+
+    # Inline absorb_bytes_x4 for SHAKE128_RATE (168 bytes = 21 ymm registers)
+___
+
+# Generate absorb code for SHAKE128 rate (168 bytes): one qword per lane per register
+for (my $i = 0; $i < 21; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vmovq       $offset($arg2,%r12), %xmm31
+        vpinsrq     \$1, $offset($arg3,%r12), %xmm31, %xmm31
+        vmovq       $offset($arg4,%r12), %xmm30
+        vpinsrq     \$1, $offset($arg5,%r12), %xmm30, %xmm30
+        vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+        vpxorq      %ymm31, %ymm$i, %ymm$i
+___
+}
+
+$code.=<<___;
+    sub     \$168, %r11         # Subtract the rate from the remaining length
+    add     \$168, %r12         # Adjust offset to next block
+    call    keccak_1600_permute # Perform the Keccak permutation
+
+    jmp     .Lshake128_absorb_while_loop
+
+.align  32
+.Lshake128_absorb_while_loop_done:
+    call    keccak_1600_save_state_x4
+
+    mov     %r11, 8*100($arg1) # s[100] = leftover bytes (below SHAKE128_RATE)
+    or      %r11, %r11
+    jz      .Lshake128_absorb_exit
+
+    movq    \$0, 8*100($arg1) # s[100] = 0 during the helper call; rewritten below
+
+    # r10/state, arg2-arg5/input, r12/length
+    mov     $arg1, %r10
+    add     %r12, $arg2
+    add     %r12, $arg3
+    add     %r12, $arg4
+    add     %r12, $arg5
+    mov     %r11, %r12
+    call    keccak_1600_partial_add_x4
+
+    mov     %r11, 8*100($arg1) # update s[100]
+
+.Lshake128_absorb_exit:
+    # Clear sensitive registers (ymm16-ymm31 here, ymm0-ymm15 via vzeroall below)
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake128_absorb_epilogue:
+    vzeroall # must run before the Win64 xmm6-xmm15 restore below
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop     %r15
+.cfi_pop        %r15
+    pop     %r14
+.cfi_pop        %r14
+    pop     %r13
+.cfi_pop        %r13
+    pop     %r12
+.cfi_pop        %r12
+    pop     %rbx
+.cfi_pop        %rbx
+    pop     %rbp
+.cfi_pop        %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_inc_absorb_avx512vl,.-SHA3_shake128_x4_inc_absorb_avx512vl
+
+
+# SHA3_shake128_x4_inc_finalize_avx512vl
+# Finalize absorption phase for 4 parallel SHAKE-128 states
+# XORs the end-of-message pad and the terminator byte into the saved state, then clears s[100]
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+# Returns: void
+# Note: After this call, state is ready for squeezing output
+.globl  SHA3_shake128_x4_inc_finalize_avx512vl
+.type   SHA3_shake128_x4_inc_finalize_avx512vl,\@function,1
+.align  32
+SHA3_shake128_x4_inc_finalize_avx512vl:
+.L_SHA3_shake128_x4_inc_finalize_avx512vl:
+.cfi_startproc
+    mov         8*100($arg1), %r11 # load state offset from s[100]
+    mov         %r11, %r10
+    and         \$~7, %r10d        # qword-aligned byte offset (selects the 32-byte state row)
+    and         \$7, %r11d         # byte position inside that qword
+
+    # add EOM byte right after the message
+    vmovdqu32   ($arg1,%r10,4), %ymm31 # row address = state + 32 * (offset / 8)
+    lea         shake_msg_pad_x4(%rip), %r9
+    sub         %r11, %r9 # presumably slides the pad table so the pad byte lines up - TODO confirm table layout
+    vmovdqu32   (%r9), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, ($arg1,%r10,4)
+
+    # add terminating byte at offset equal to rate - 1 (SHAKE128_RATE = 168)
+    vmovdqu32   640($arg1), %ymm31 # 168*4 - 32 = 672 - 32 = 640
+    vmovdqa32   shake_terminator_byte_x4(%rip), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, 640($arg1)
+
+    movq        \$0, 8*100($arg1) # clear s[100]
+    vpxorq      %ymm31, %ymm31, %ymm31 # scrub the register that held state data
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_inc_finalize_avx512vl,.-SHA3_shake128_x4_inc_finalize_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for the squeeze entry. It copies rsp to rax and spills the
+# arg1/arg2 registers to 8(rsp)/16(rsp), then jumps to the label placed right
+# after the public entry point. It is used for internal calls, where the
+# arguments are already in the registers the function body expects.
+.type   SHA3_shake128_x4_inc_squeeze_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake128_x4_inc_squeeze_avx512vl
+.LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+.size   SHA3_shake128_x4_inc_squeeze_avx512vl_internal,.-SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake128_x4_inc_squeeze_avx512vl
+# Squeeze output from 4 parallel SHAKE128 states
+# Arguments:
+#   arg1 (rdi): pointer to lane 0 output buffer
+#   arg2 (rsi): pointer to lane 1 output buffer
+#   arg3 (rdx): pointer to lane 2 output buffer
+#   arg4 (rcx): pointer to lane 3 output buffer
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to state context (808 bytes)
+# Returns: void
+# Note: Can be called multiple times; s[100] carries the unread byte count between calls
+.globl  SHA3_shake128_x4_inc_squeeze_avx512vl
+.type   SHA3_shake128_x4_inc_squeeze_avx512vl,\@function,6
+.align  32
+SHA3_shake128_x4_inc_squeeze_avx512vl:
+.L_SHA3_shake128_x4_inc_squeeze_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    push    %rbx
+.cfi_push       %rbx
+    push    %r12
+.cfi_push       %r12
+    push    %r13
+.cfi_push       %r13
+    push    %r14
+.cfi_push       %r14
+    push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake128_squeeze_body:
+    or      $arg5, $arg5
+    jz      .Lshake128_squeeze_done
+
+    # check for partially processed block
+    mov     8*100($arg6), %r15 # %r15 = s[100], unread bytes left in the current block
+    or      %r15, %r15
+    jnz     .Lshake128_squeeze_no_init_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r14, $arg1
+
+    xor     %rbp, %rbp
+    jmp     .Lshake128_squeeze_loop
+
+.align  32
+.Lshake128_squeeze_no_init_permute:
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length = min(unread bytes, outlen), r11 - offset
+    mov     $arg6, %r10
+    mov     $arg6, %r14 # preserve state pointer across extract helper
+
+    mov     %r15, %r12
+    cmp     %r15, $arg5
+    cmovnae $arg5, %r12 # %r12 = min(unread bytes, outlen)
+
+    sub     %r12, $arg5 # outlen -= length
+
+    mov     \$168, %r11d # SHAKE128_RATE
+    sub     %r15, %r11   # offset of the first unread byte in the block
+
+    sub     %r12, %r15         # unread bytes -= length
+    mov     %r15, 8*100($arg6) # update s[100]
+
+    call    keccak_1600_extract_bytes_x4
+    mov     %r14, $arg6        # restore state pointer after helper clobbers
+
+    or      %r15, %r15
+    jnz     .Lshake128_squeeze_done # block not exhausted, so outlen was fully served
+
+    mov     $arg1, %r13 # preserve arg1
+    mov     %r14, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r13, $arg1
+    xor     %rbp, %rbp
+
+.align  32
+.Lshake128_squeeze_loop:
+    cmp     \$168, $arg5 # outlen >= SHAKE128_RATE? (a full block is still wanted)
+    jb      .Lshake128_squeeze_final_extract
+
+    call    keccak_1600_permute
+
+    # Extract SHAKE128 rate bytes (168 bytes = 21 x 8 bytes) inline
+___
+
+# Generate extract code for SHAKE128 rate (168 bytes = 21 ymm registers)
+for (my $i = 0; $i < 21; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vextracti64x2 \$1, %ymm$i, %xmm31
+        vmovq       %xmm$i, $offset($arg1,%rbp)
+        vpextrq     \$1, %xmm$i, $offset($arg2,%rbp)
+        vmovq       %xmm31, $offset($arg3,%rbp)
+        vpextrq     \$1, %xmm31, $offset($arg4,%rbp)
+___
+}
+
+$code.=<<___;
+    add     \$168, %rbp  # dst offset += SHAKE128_RATE
+    sub     \$168, $arg5 # outlen -= SHAKE128_RATE
+    jmp     .Lshake128_squeeze_loop
+
+.align  32
+.Lshake128_squeeze_final_extract:
+    or      $arg5, $arg5
+    jz      .Lshake128_squeeze_no_end_permute
+
+    # update output pointers
+    add     %rbp, $arg1
+    add     %rbp, $arg2
+    add     %rbp, $arg3
+    add     %rbp, $arg4
+
+    mov     \$168, %r15d       # SHAKE128_RATE
+    sub     $arg5, %r15
+    mov     %r15, 8*100($arg6) # s[100] = bytes left unread after this call
+
+    call    keccak_1600_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+    mov     %r14, $arg1
+
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length, r11 - offset = 0
+    mov     $arg6, %r10
+    mov     $arg5, %r12
+    xor     %r11, %r11
+    call    keccak_1600_extract_bytes_x4
+
+    jmp     .Lshake128_squeeze_done
+
+.Lshake128_squeeze_no_end_permute:
+    movq    \$0, 8*100($arg6) # s[100] = 0
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+.Lshake128_squeeze_done:
+    # Clear sensitive registers (ymm16-ymm31 here, ymm0-ymm15 via vzeroall below)
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake128_squeeze_epilogue:
+    vzeroall # must run before the Win64 xmm6-xmm15 restore below
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop    %r15
+    pop %r14
+.cfi_pop    %r14
+    pop %r13
+.cfi_pop    %r13
+    pop %r12
+.cfi_pop    %r12
+    pop %rbx
+.cfi_pop    %rbx
+    pop %rbp
+.cfi_pop    %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake128_x4_inc_squeeze_avx512vl,.-SHA3_shake128_x4_inc_squeeze_avx512vl
+
+
+# SHAKE256 x4 multi-buffer functions
+# These functions process 4 independent SHAKE256 streams in parallel using AVX-512VL
+# State layout: 25 ymm-sized rows (32 bytes each, one 64-bit word per stream) + 1 qword = 808 bytes per context
+# Rate: 136 bytes for SHAKE256
+
+# SHA3_shake256_x4_avx512vl
+# One-shot SHAKE-256 x4 function: init + absorb + finalize + squeeze
+# Arguments:
+#   arg1 (rdi): pointer to output lane 0
+#   arg2 (rsi): pointer to output lane 1
+#   arg3 (rdx): pointer to output lane 2
+#   arg4 (rcx): pointer to output lane 3
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to input lane 0
+#   [stack+0]:  pointer to input lane 1
+#   [stack+8]:  pointer to input lane 2
+#   [stack+16]: pointer to input lane 3
+#   [stack+24]: input length in bytes (must be same for all lanes)
+# Returns: void
+.globl  SHA3_shake256_x4_avx512vl
+.type   SHA3_shake256_x4_avx512vl,\@function,10
+.align  32
+SHA3_shake256_x4_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    mov     %rsp, %rbp # rbp is the base used below to reach the stack-passed arguments
+    push    %rbx
+.cfi_push       %rbx
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+    sub     \$$sf_size, %rsp
+    mov     %rsp, %rbx # rbx = base of the local frame (saved args, state pointer, x4 state)
+
+.Lshake256_x4_body:
+    mov     $arg1, $sf_arg1(%rbx)
+    mov     $arg2, $sf_arg2(%rbx)
+    mov     $arg3, $sf_arg3(%rbx)
+    mov     $arg4, $sf_arg4(%rbx)
+    mov     $arg5, $sf_arg5(%rbx)
+
+    lea     $sf_state_x4(%rbx), $arg1 # start of x4 state on the stack frame
+    mov     $arg1, $sf_state_ptr(%rbx)
+
+    # Initialize the state array to zero
+    call    keccak_1600_init_state
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    mov     $arg6, $arg2
+___
+$code .= <<___ if ($win64);
+    # xlate prologue handles up to six arguments. For one-shot x4 wrappers
+    # (10 args), the remaining four stay in Win64 stack slots.
+    mov     64(%rbp), $arg3 # arg7 from stack
+    mov     72(%rbp), $arg4 # arg8 from stack
+    mov     80(%rbp), $arg5 # arg9 from stack
+    mov     88(%rbp), $arg6 # arg10 from stack
+___
+$code .= <<___ if (!$win64);
+    mov     16(%rbp), $arg3 # arg7 from stack
+    mov     24(%rbp), $arg4 # arg8 from stack
+    mov     32(%rbp), $arg5 # arg9 from stack
+    mov     40(%rbp), $arg6 # arg10 from stack
+___
+$code.=<<___;
+    # Internal entry avoids Win64 xlate prologue argument remapping.
+___
+$code .= call_internal("SHA3_shake256_x4_inc_absorb_avx512vl_internal");
+$code.=<<___;
+
+    mov     $sf_state_ptr(%rbx), $arg1
+    call    .L_SHA3_shake256_x4_inc_finalize_avx512vl
+
+    # squeeze: reload the output pointers and length saved at entry
+    mov     $sf_arg1(%rbx), $arg1
+    mov     $sf_arg2(%rbx), $arg2
+    mov     $sf_arg3(%rbx), $arg3
+    mov     $sf_arg4(%rbx), $arg4
+    mov     $sf_arg5(%rbx), $arg5
+    mov     $sf_state_ptr(%rbx), $arg6
+___
+$code .= call_internal("SHA3_shake256_x4_inc_squeeze_avx512vl_internal");
+$code.=<<___;
+
+    # Clear the temporary buffer (25 x 32 bytes of state + the 8-byte s[100] = 808 bytes)
+    lea     $sf_state_x4(%rbx), %r9
+    vpxorq      %ymm31, %ymm31, %ymm31
+    vmovdqu64   %ymm31, 32*0(%r9)
+    vmovdqu64   %ymm31, 32*1(%r9)
+    vmovdqu64   %ymm31, 32*2(%r9)
+    vmovdqu64   %ymm31, 32*3(%r9)
+    vmovdqu64   %ymm31, 32*4(%r9)
+    vmovdqu64   %ymm31, 32*5(%r9)
+    vmovdqu64   %ymm31, 32*6(%r9)
+    vmovdqu64   %ymm31, 32*7(%r9)
+    vmovdqu64   %ymm31, 32*8(%r9)
+    vmovdqu64   %ymm31, 32*9(%r9)
+    vmovdqu64   %ymm31, 32*10(%r9)
+    vmovdqu64   %ymm31, 32*11(%r9)
+    vmovdqu64   %ymm31, 32*12(%r9)
+    vmovdqu64   %ymm31, 32*13(%r9)
+    vmovdqu64   %ymm31, 32*14(%r9)
+    vmovdqu64   %ymm31, 32*15(%r9)
+    vmovdqu64   %ymm31, 32*16(%r9)
+    vmovdqu64   %ymm31, 32*17(%r9)
+    vmovdqu64   %ymm31, 32*18(%r9)
+    vmovdqu64   %ymm31, 32*19(%r9)
+    vmovdqu64   %ymm31, 32*20(%r9)
+    vmovdqu64   %ymm31, 32*21(%r9)
+    vmovdqu64   %ymm31, 32*22(%r9)
+    vmovdqu64   %ymm31, 32*23(%r9)
+    vmovdqu64   %ymm31, 32*24(%r9)
+    vmovq       %xmm31, 32*25(%r9)
+
+.Lshake256_x4_epilogue:
+___
+$code .= <<___ if ($win64);
+    vmovups $sf_size+0(%rsp),   %xmm6
+    vmovups $sf_size+16(%rsp),  %xmm7
+    vmovups $sf_size+32(%rsp),  %xmm8
+    vmovups $sf_size+48(%rsp),  %xmm9
+    vmovups $sf_size+64(%rsp),  %xmm10
+    vmovups $sf_size+80(%rsp),  %xmm11
+    vmovups $sf_size+96(%rsp),  %xmm12
+    vmovups $sf_size+112(%rsp), %xmm13
+    vmovups $sf_size+128(%rsp), %xmm14
+    vmovups $sf_size+144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+    add     \$$sf_size, %rsp
+    pop     %rbx
+.cfi_pop        %rbx
+    pop     %rbp
+.cfi_pop        %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_avx512vl,.-SHA3_shake256_x4_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for the absorb entry. It copies rsp to rax and spills the
+# arg1/arg2 registers to 8(rsp)/16(rsp), then jumps to the label placed right
+# after the public entry point. It is used for internal calls, where the
+# arguments are already in the registers the function body expects.
+.type   SHA3_shake256_x4_inc_absorb_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake256_x4_inc_absorb_avx512vl
+.LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+.size   SHA3_shake256_x4_inc_absorb_avx512vl_internal,.-SHA3_shake256_x4_inc_absorb_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake256_x4_inc_absorb_avx512vl
+# Absorb input data into 4 parallel SHAKE256 states
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes; qword 100 holds the block offset s[100])
+#   arg2 (rsi): pointer to lane 0 input data
+#   arg3 (rdx): pointer to lane 1 input data
+#   arg4 (rcx): pointer to lane 2 input data
+#   arg5 (r8):  pointer to lane 3 input data
+#   arg6 (r9):  input length in bytes (must be same for all lanes)
+# Returns: void
+# Note: Input is XORed into state and Keccak permutation is applied for each rate-sized block
+.globl  SHA3_shake256_x4_inc_absorb_avx512vl
+.type   SHA3_shake256_x4_inc_absorb_avx512vl,\@function,6
+.align  32
+SHA3_shake256_x4_inc_absorb_avx512vl:
+.L_SHA3_shake256_x4_inc_absorb_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    push    %rbx
+.cfi_push       %rbx
+    push    %r12
+.cfi_push       %r12
+    push    %r13
+.cfi_push       %r13
+    push    %r14
+.cfi_push       %r14
+    push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake256_absorb_body:
+    # check for partially processed block
+    mov     8*100($arg1), %r14 # %r14 = s[100], bytes already absorbed into the current block
+    or      %r14, %r14 # s[100] == 0?
+    je      .Lshake256_absorb_main_loop_start
+
+    # process remaining bytes if message long enough
+    mov     \$136, %r12 # SHAKE256_RATE = 136
+    sub     %r14, %r12  # %r12 = room left in the current block
+
+    cmp     %r12, $arg6 # mlen <= room: the input fits in the current block
+    jbe     .Lshake256_absorb_skip_permute
+
+    sub     %r12, $arg6
+    mov     $arg6, %r11 # preserve remaining length across helper calls
+
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10                # %r10 = state
+    call    keccak_1600_partial_add_x4 # arg2-arg5 are updated
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake256_absorb_partial_block_done
+
+.Lshake256_absorb_skip_permute:
+    # r10/state, arg2-arg5/inputs, r12/length
+    mov     $arg1, %r10
+    mov     $arg6, %r12
+    mov     $arg6, %r11 # preserve input length across helper call
+    call    keccak_1600_partial_add_x4
+
+    lea     (%r11,%r14), %r15
+    mov     %r15, 8*100($arg1) # s[100] += inlen
+
+    cmp     \$136, %r15 # block still not full? then there is nothing to permute
+    jb      .Lshake256_absorb_exit
+
+    call    keccak_1600_load_state_x4
+
+    call    keccak_1600_permute
+
+    call    keccak_1600_save_state_x4
+
+    movq    \$0, 8*100($arg1) # clear s[100]
+    jmp     .Lshake256_absorb_exit
+
+.Lshake256_absorb_main_loop_start:
+    call    keccak_1600_load_state_x4
+    mov     $arg6, %r11 # full input length when no prior partial block
+
+.Lshake256_absorb_partial_block_done:
+    xor     %r12, %r12  # zero message offset
+
+    # Process the input message in blocks
+.align  32
+.Lshake256_absorb_while_loop:
+    cmp     \$136, %r11 # compare mlen to SHAKE256_RATE
+    jb      .Lshake256_absorb_while_loop_done
+
+    # Inline absorb_bytes_x4 for SHAKE256_RATE (136 bytes = 17 ymm registers)
+___
+
+# Generate absorb code for SHAKE256 rate (136 bytes): one qword per lane per register
+for (my $i = 0; $i < 17; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vmovq       $offset($arg2,%r12), %xmm31
+        vpinsrq     \$1, $offset($arg3,%r12), %xmm31, %xmm31
+        vmovq       $offset($arg4,%r12), %xmm30
+        vpinsrq     \$1, $offset($arg5,%r12), %xmm30, %xmm30
+        vinserti32x4 \$1, %xmm30, %ymm31, %ymm31
+        vpxorq      %ymm31, %ymm$i, %ymm$i
+___
+}
+
+$code.=<<___;
+    sub     \$136, %r11         # Subtract the rate from the remaining length
+    add     \$136, %r12         # Adjust offset to next block
+    call    keccak_1600_permute # Perform the Keccak permutation
+
+    jmp     .Lshake256_absorb_while_loop
+
+.align  32
+.Lshake256_absorb_while_loop_done:
+    call    keccak_1600_save_state_x4
+
+    mov     %r11, 8*100($arg1) # s[100] = leftover bytes (below SHAKE256_RATE)
+    or      %r11, %r11
+    jz      .Lshake256_absorb_exit
+
+    movq    \$0, 8*100($arg1) # s[100] = 0 during the helper call; rewritten below
+
+    # r10/state, arg2-arg5/input, r12/length
+    mov     $arg1, %r10
+    add     %r12, $arg2
+    add     %r12, $arg3
+    add     %r12, $arg4
+    add     %r12, $arg5
+    mov     %r11, %r12
+    call    keccak_1600_partial_add_x4
+
+    mov     %r11, 8*100($arg1) # update s[100]
+
+.Lshake256_absorb_exit:
+    # Clear sensitive registers (ymm16-ymm31 here, ymm0-ymm15 via vzeroall below)
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake256_absorb_epilogue:
+    vzeroall # must run before the Win64 restore: it zeroes ymm0-ymm15, including callee-saved xmm6-xmm15
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop    %r15
+    pop %r14
+.cfi_pop    %r14
+    pop %r13
+.cfi_pop    %r13
+    pop %r12
+.cfi_pop    %r12
+    pop %rbx
+.cfi_pop    %rbx
+    pop %rbp
+.cfi_pop    %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_inc_absorb_avx512vl,.-SHA3_shake256_x4_inc_absorb_avx512vl
+
+
+# SHA3_shake256_x4_inc_finalize_avx512vl
+# Finalize absorption phase for 4 parallel SHAKE-256 states
+# XORs the end-of-message pad and the terminator byte into the saved state, then clears s[100]
+# Arguments:
+#   arg1 (rdi): pointer to state context (808 bytes)
+# Returns: void
+# Note: After this call, state is ready for squeezing output
+.globl  SHA3_shake256_x4_inc_finalize_avx512vl
+.type   SHA3_shake256_x4_inc_finalize_avx512vl,\@function,1
+.align  32
+SHA3_shake256_x4_inc_finalize_avx512vl:
+.L_SHA3_shake256_x4_inc_finalize_avx512vl:
+.cfi_startproc
+    mov     8*100($arg1), %r11 # load state offset from s[100]
+    mov     %r11, %r10
+    and     \$~7, %r10d        # qword-aligned byte offset (selects the 32-byte state row)
+    and     \$7, %r11d         # byte position inside that qword
+
+    # add EOM byte right after the message
+    vmovdqu32   ($arg1,%r10,4), %ymm31 # row address = state + 32 * (offset / 8)
+    lea         shake_msg_pad_x4(%rip), %r9
+    sub         %r11, %r9 # presumably slides the pad table so the pad byte lines up - TODO confirm table layout
+    vmovdqu32   (%r9), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, ($arg1,%r10,4)
+
+    # add terminating byte at offset equal to rate - 1 (SHAKE256_RATE = 136)
+    vmovdqu32   512($arg1), %ymm31 # 136*4 - 32 = 544 - 32 = 512
+    vmovdqa32   shake_terminator_byte_x4(%rip), %ymm30
+    vpxorq      %ymm30, %ymm31, %ymm31
+    vmovdqu32   %ymm31, 512($arg1)
+
+    movq        \$0, 8*100($arg1) # clear s[100]
+    vpxorq      %ymm31, %ymm31, %ymm31 # scrub the register that held state data
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_inc_finalize_avx512vl,.-SHA3_shake256_x4_inc_finalize_avx512vl
+
+___
+
+$code .= <<___ if ($win64);
+# Internal Win64 shim for the squeeze entry. It copies rsp to rax and spills the
+# arg1/arg2 registers to 8(rsp)/16(rsp), then jumps to the label placed right
+# after the public entry point. It is used for internal calls, where the
+# arguments are already in the registers the function body expects.
+.type   SHA3_shake256_x4_inc_squeeze_avx512vl_internal,\@abi-omnipotent
+.align  32
+.LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+    mov     %rsp, %rax
+    mov     $arg1, 8(%rsp)
+    mov     $arg2, 16(%rsp)
+    jmp     .L_SHA3_shake256_x4_inc_squeeze_avx512vl
+.LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+.size   SHA3_shake256_x4_inc_squeeze_avx512vl_internal,.-SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+___
+$code.=<<___;
+
+# SHA3_shake256_x4_inc_squeeze_avx512vl
+# Squeeze output from 4 parallel SHAKE256 states
+# Arguments:
+#   arg1 (rdi): pointer to lane 0 output buffer
+#   arg2 (rsi): pointer to lane 1 output buffer
+#   arg3 (rdx): pointer to lane 2 output buffer
+#   arg4 (rcx): pointer to lane 3 output buffer
+#   arg5 (r8):  output length in bytes (must be same for all lanes)
+#   arg6 (r9):  pointer to state context (808 bytes)
+# Returns: void
+# Note: Can be called multiple times; s[100] carries the unread byte count between calls
+.globl  SHA3_shake256_x4_inc_squeeze_avx512vl
+.type   SHA3_shake256_x4_inc_squeeze_avx512vl,\@function,6
+.align  32
+SHA3_shake256_x4_inc_squeeze_avx512vl:
+.L_SHA3_shake256_x4_inc_squeeze_avx512vl:
+.cfi_startproc
+    push    %rbp
+.cfi_push       %rbp
+    push    %rbx
+.cfi_push       %rbx
+    push    %r12
+.cfi_push       %r12
+    push    %r13
+.cfi_push       %r13
+    push    %r14
+.cfi_push       %r14
+    push    %r15
+.cfi_push       %r15
+___
+$code .= <<___ if ($win64);
+    sub     \$160, %rsp
+    vmovups %xmm6,   0(%rsp)
+    vmovups %xmm7,   16(%rsp)
+    vmovups %xmm8,   32(%rsp)
+    vmovups %xmm9,   48(%rsp)
+    vmovups %xmm10,  64(%rsp)
+    vmovups %xmm11,  80(%rsp)
+    vmovups %xmm12,  96(%rsp)
+    vmovups %xmm13,  112(%rsp)
+    vmovups %xmm14,  128(%rsp)
+    vmovups %xmm15,  144(%rsp)
+___
+$code.=<<___;
+
+.Lshake256_squeeze_body:
+    or      $arg5, $arg5
+    jz      .Lshake256_squeeze_done
+
+    # check for partially processed block
+    mov     8*100($arg6), %r15 # %r15 = s[100], unread bytes left in the current block
+    or      %r15, %r15
+    jnz     .Lshake256_squeeze_no_init_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r14, $arg1
+
+    xor     %rbp, %rbp
+    jmp     .Lshake256_squeeze_loop
+
+.align  32
+.Lshake256_squeeze_no_init_permute:
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length = min(unread bytes, outlen), r11 - offset
+    mov     $arg6, %r10
+    mov     $arg6, %r14 # preserve state pointer across extract helper
+
+    mov     %r15, %r12
+    cmp     %r15, $arg5
+    cmovnae $arg5, %r12 # %r12 = min(unread bytes, outlen)
+
+    sub     %r12, $arg5 # outlen -= length
+
+    mov     \$136, %r11d # SHAKE256_RATE
+    sub     %r15, %r11   # offset of the first unread byte in the block
+
+    sub     %r12, %r15         # unread bytes -= length
+    mov     %r15, 8*100($arg6) # update s[100]
+
+    call    keccak_1600_extract_bytes_x4
+    mov     %r14, $arg6        # restore state pointer after helper clobbers
+
+    or      %r15, %r15
+    jnz     .Lshake256_squeeze_done # block not exhausted, so outlen was fully served
+
+    mov     $arg1, %r13 # preserve arg1
+    mov     %r14, $arg1
+    call    keccak_1600_load_state_x4
+
+    mov     %r13, $arg1
+    xor     %rbp, %rbp
+
+.align  32
+.Lshake256_squeeze_loop:
+    cmp     \$136, $arg5 # outlen >= SHAKE256_RATE? (a full block is still wanted)
+    jb      .Lshake256_squeeze_final_extract
+
+    call    keccak_1600_permute
+
+    # Extract SHAKE256 rate bytes (136 bytes = 17 x 8 bytes) inline
+___
+
+# Generate extract code for SHAKE256 rate (136 bytes = 17 ymm registers)
+for (my $i = 0; $i < 17; $i++) {
+    my $offset = $i * 8;
+    $code.=<<___;
+        vextracti64x2 \$1, %ymm$i, %xmm31
+        vmovq       %xmm$i, $offset($arg1,%rbp)
+        vpextrq     \$1, %xmm$i, $offset($arg2,%rbp)
+        vmovq       %xmm31, $offset($arg3,%rbp)
+        vpextrq     \$1, %xmm31, $offset($arg4,%rbp)
+___
+}
+
+$code.=<<___;
+    add     \$136, %rbp  # dst offset += SHAKE256_RATE
+    sub     \$136, $arg5 # outlen -= SHAKE256_RATE
+    jmp     .Lshake256_squeeze_loop
+
+.align  32
+.Lshake256_squeeze_final_extract:
+    or      $arg5, $arg5
+    jz      .Lshake256_squeeze_no_end_permute
+
+    # update output pointers
+    add     %rbp, $arg1
+    add     %rbp, $arg2
+    add     %rbp, $arg3
+    add     %rbp, $arg4
+
+    mov     \$136, %r15d       # SHAKE256_RATE
+    sub     $arg5, %r15
+    mov     %r15, 8*100($arg6) # s[100] = bytes left unread after this call
+
+    call    keccak_1600_permute
+
+    mov     $arg1, %r14
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+    mov     %r14, $arg1
+
+    # extract bytes: r10 - state/src, arg1-arg4 - output/dst, r12 - length, r11 - offset = 0
+    mov     $arg6, %r10
+    mov     $arg5, %r12
+    xor     %r11, %r11
+    call    keccak_1600_extract_bytes_x4
+
+    jmp     .Lshake256_squeeze_done
+
+.Lshake256_squeeze_no_end_permute:
+    movq    \$0, 8*100($arg6) # s[100] = 0
+    mov     $arg6, $arg1
+    call    keccak_1600_save_state_x4
+
+.Lshake256_squeeze_done:
+    # Clear sensitive registers (ymm16-ymm31 here, ymm0-ymm15 via vzeroall below)
+    vpxorq      %xmm16, %xmm16, %xmm16
+    vmovdqa64   %ymm16, %ymm17
+    vmovdqa64   %ymm16, %ymm18
+    vmovdqa64   %ymm16, %ymm19
+    vmovdqa64   %ymm16, %ymm20
+    vmovdqa64   %ymm16, %ymm21
+    vmovdqa64   %ymm16, %ymm22
+    vmovdqa64   %ymm16, %ymm23
+    vmovdqa64   %ymm16, %ymm24
+    vmovdqa64   %ymm16, %ymm25
+    vmovdqa64   %ymm16, %ymm26
+    vmovdqa64   %ymm16, %ymm27
+    vmovdqa64   %ymm16, %ymm28
+    vmovdqa64   %ymm16, %ymm29
+    vmovdqa64   %ymm16, %ymm30
+    vmovdqa64   %ymm16, %ymm31
+.Lshake256_squeeze_epilogue:
+    vzeroall # must run before the Win64 xmm6-xmm15 restore below
+___
+$code .= <<___ if ($win64);
+    vmovups 0(%rsp),   %xmm6
+    vmovups 16(%rsp),  %xmm7
+    vmovups 32(%rsp),  %xmm8
+    vmovups 48(%rsp),  %xmm9
+    vmovups 64(%rsp),  %xmm10
+    vmovups 80(%rsp),  %xmm11
+    vmovups 96(%rsp),  %xmm12
+    vmovups 112(%rsp), %xmm13
+    vmovups 128(%rsp), %xmm14
+    vmovups 144(%rsp), %xmm15
+    add     \$160, %rsp
+___
+$code.=<<___;
+
+    pop %r15
+.cfi_pop    %r15
+    pop %r14
+.cfi_pop    %r14
+    pop %r13
+.cfi_pop    %r13
+    pop %r12
+.cfi_pop    %r12
+    pop %rbx
+.cfi_pop    %rbx
+    pop %rbp
+.cfi_pop    %rbp
+    ret
+.cfi_endproc
+.size   SHA3_shake256_x4_inc_squeeze_avx512vl,.-SHA3_shake256_x4_inc_squeeze_avx512vl
+___
+
+if ($win64) {
+my $context = "%r8";
+my $disp    = "%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type   keccak_se_handler,\@abi-omnipotent
+.align  16
+keccak_se_handler:              # Win64 exception handler: context record in r8, dispatcher context in r9
+    push    %rsi                # save every register modified below; restored in reverse order before ret
+    push    %rdi
+    push    %rbx
+    push    %rbp
+    push    %r12
+    push    %r13
+    push    %r14
+    push    %r15
+    pushfq
+    sub     \$64, %rsp          # scratch area; 32..56(%rsp) carry the stack arguments of the call below
+
+    mov     120($context), %rax # context->Rax = original %rsp from xlate prologue
+    mov     248($context), %rbx # context->Rip
+
+    mov     8($disp), %rsi  # disp->ImageBase
+    mov     56($disp), %r11 # disp->HandlerData
+
+    mov     0(%r11), %r10d # HandlerData[0]: body label (rva)
+    lea     (%rsi,%r10), %r10 # ImageBase + rva = address of the body label
+    cmp     %r10, %rbx     # Rip < body?
+    jb      .Lkeccak_in_prologue # yes: rax (context->Rax) is taken as the original rsp
+
+    mov     4(%r11), %r10d # HandlerData[1]: epilogue label (rva)
+    lea     (%rsi,%r10), %r10 # ImageBase + rva = address of the epilogue label
+    cmp     %r10, %rbx     # Rip >= epilogue?
+    jae     .Lkeccak_in_epilogue
+
+    # In function body:
+    # HandlerData[2]: delta from context->Rsp(body) to original %rsp
+    # HandlerData[3]: offset of XMM6 save area from context->Rsp(body), -1 if none
+    # HandlerData[4]: number of saved non-volatiles in stack frame layout (2 or 6)
+    # HandlerData[5]: delta from context->Rsp(epilogue) to original %rsp
+    mov     152($context), %rdx # body rsp
+    mov     8(%r11), %r10d      # HandlerData[2]
+    lea     (%rdx,%r10), %rax   # original rsp
+    jmp     .Lkeccak_restore_body_or_epilogue
+
+.Lkeccak_in_epilogue:
+    mov     152($context), %rdx # epilogue rsp
+    mov     20(%r11), %r10d     # HandlerData[5]
+    lea     (%rdx,%r10), %rax   # original rsp
+
+.Lkeccak_restore_body_or_epilogue: # rax = original rsp, rdx = context->Rsp at the fault
+    mov     8(%rax), %rcx       # xlate shadow save of original rdi
+    mov     16(%rax), %rsi      # xlate shadow save of original rsi
+    mov     %rax, 152($context) # context->Rsp = original rsp
+    mov     %rsi, 168($context) # context->Rsi
+    mov     %rcx, 176($context) # context->Rdi
+
+    mov     16(%r11), %r10d # HandlerData[4]: gpr save count
+    cmp     \$6, %r10d
+    jne     .Lkeccak_restore_two # any other count: only rbp/rbx are recovered
+
+    mov     -24(%rax), %r12     # r12-r15 are read from the slots below rbp/rbx
+    mov     -32(%rax), %r13
+    mov     -40(%rax), %r14
+    mov     -48(%rax), %r15
+    mov     %r12, 216($context) # context->R12
+    mov     %r13, 224($context) # context->R13
+    mov     %r14, 232($context) # context->R14
+    mov     %r15, 240($context) # context->R15
+
+.Lkeccak_restore_two:           # executed for both the 2- and the 6-register layout
+    mov     -8(%rax), %rbp      # first slot below the original rsp
+    mov     -16(%rax), %rbx     # second slot below the original rsp
+    mov     %rbp, 160($context) # context->Rbp
+    mov     %rbx, 144($context) # context->Rbx
+
+    mov     12(%r11), %r10d # HandlerData[3]: xmm save offset from body rsp
+    cmp     \$-1, %r10d
+    je      .Lkeccak_in_prologue # -1: no XMM save area to copy
+
+    lea     (%rdx,%r10), %rsi   # source = xmm save area (relative to the rsp loaded into rdx above)
+    lea     512($context), %rdi # &context->Xmm6
+    mov     \$20, %ecx          # 10 XMM * 2 qwords
+    .long   0xa548f3fc          # cld; rep movsq
+
+.Lkeccak_in_prologue:           # common tail; also the direct target for faults before the body label
+    mov     8(%rax), %rcx       # shadow-saved rdi (stored again on the body/epilogue path)
+    mov     16(%rax), %rdx      # shadow-saved rsi
+    mov     %rcx, 176($context) # context->Rdi
+    mov     %rdx, 168($context) # context->Rsi
+    mov     %rax, 152($context) # context->Rsp = original rsp
+
+    mov     40($disp), %rdi # disp->ContextRecord
+    mov     $context, %rsi  # source = the patched context record
+    mov     \$154, %ecx     # sizeof(CONTEXT)/8
+    .long   0xa548f3fc      # cld; rep movsq
+
+    mov     $disp, %rsi    # keep the dispatcher context; r8/r9 are reloaded as call arguments
+    xor     %rcx, %rcx     # UNW_FLAG_NHANDLER
+    mov     8(%rsi), %rdx  # disp->ImageBase
+    mov     0(%rsi), %r8   # disp->ControlPc
+    mov     16(%rsi), %r9  # disp->FunctionEntry
+    mov     40(%rsi), %r10 # disp->ContextRecord
+    lea     56(%rsi), %r11 # &disp->HandlerData
+    lea     24(%rsi), %r12 # &disp->EstablisherFrame
+    mov     %r10, 32(%rsp) # stack arguments follow the 32-byte register home area
+    mov     %r11, 40(%rsp)
+    mov     %r12, 48(%rsp)
+    mov     %rcx, 56(%rsp) # last stack argument = 0 (rcx was zeroed above)
+    call    *__imp_RtlVirtualUnwind(%rip)
+
+    mov     \$1, %eax # ExceptionContinueSearch
+    add     \$64, %rsp
+    popfq
+    pop     %r15
+    pop     %r14
+    pop     %r13
+    pop     %r12
+    pop     %rbp
+    pop     %rbx
+    pop     %rdi
+    pop     %rsi
+    ret
+.size   keccak_se_handler,.-keccak_se_handler
+
+.section    .pdata             # function table: one (begin, end, unwind info) rva triple per covered entry
+.align  4                      # every entry below is three 4-byte rva fields
+    .rva    .LSEH_begin_SHA3_shake128_x4_avx512vl
+    .rva    .LSEH_end_SHA3_shake128_x4_avx512vl
+    .rva    .LSEH_info_SHA3_shake128_x4_avx512vl
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_absorb_avx512vl
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_absorb_avx512vl
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake128_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_end_SHA3_shake128_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_begin_SHA3_shake256_x4_avx512vl
+    .rva    .LSEH_end_SHA3_shake256_x4_avx512vl
+    .rva    .LSEH_info_SHA3_shake256_x4_avx512vl
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_absorb_avx512vl
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_absorb_avx512vl
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl_internal
+    .rva    .LSEH_begin_SHA3_shake256_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_end_SHA3_shake256_x4_inc_squeeze_avx512vl
+    .rva    .LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl
+
+.section    .xdata             # unwind info records referenced by the .LSEH_info_* rvas in .pdata
+.align  8
+.LSEH_info_SHA3_shake128_x4_avx512vl:
+    .byte   9,0,0,0            # UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER; prologue size, code count, frame register all 0
+    .rva    keccak_se_handler  # handler invoked for faults inside the function
+    .rva    .Lshake128_x4_body,.Lshake128_x4_epilogue # HandlerData[0..1]: body and epilogue labels
+    .long   1032,856,2,1032    # HandlerData[2..5]: body rsp delta, XMM6 save offset, saved gpr count, epilogue rsp delta
+.LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_absorb_body,.Lshake128_absorb_epilogue
+    .long   208,0,6,208        # 6 saved gprs; XMM6 save area at offset 0 from the body rsp
+.LSEH_info_SHA3_shake128_x4_inc_absorb_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_absorb_body,.Lshake128_absorb_epilogue # NOTE(review): same labels as the public entry above - confirm against the function definitions
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_squeeze_body,.Lshake128_squeeze_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake128_x4_inc_squeeze_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake128_squeeze_body,.Lshake128_squeeze_epilogue # NOTE(review): shared with the public entry - confirm
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_x4_body,.Lshake256_x4_epilogue
+    .long   1032,856,2,1032    # same layout values as the SHAKE-128 one-shot record
+.LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_absorb_body,.Lshake256_absorb_epilogue
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_inc_absorb_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_absorb_body,.Lshake256_absorb_epilogue # NOTE(review): shared with the public entry - confirm
+    .long   208,0,6,208
+.LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_squeeze_body,.Lshake256_squeeze_epilogue
+    .long   208,0,6,208        # 208 = 6 pushed gprs (48 bytes) + 160-byte XMM6-15 save area
+.LSEH_info_SHA3_shake256_x4_inc_squeeze_avx512vl_internal:
+    .byte   9,0,0,0
+    .rva    keccak_se_handler
+    .rva    .Lshake256_squeeze_body,.Lshake256_squeeze_epilogue # NOTE(review): shared with the public entry - confirm
+    .long   208,0,6,208
+___
+}
+
+$code.=<<___;
+
+.section .rodata align=128
+.align  128
+.type   iotas,\@object
+iotas:                         # the 24 Keccak-f[1600] round constants, one qword per round
+    .quad   0x0000000000000001
+    .quad   0x0000000000008082
+    .quad   0x800000000000808a
+    .quad   0x8000000080008000
+    .quad   0x000000000000808b
+    .quad   0x0000000080000001
+    .quad   0x8000000080008081
+    .quad   0x8000000000008009
+    .quad   0x000000000000008a
+    .quad   0x0000000000000088
+    .quad   0x0000000080008009
+    .quad   0x000000008000000a
+    .quad   0x000000008000808b
+    .quad   0x800000000000008b
+    .quad   0x8000000000008089
+    .quad   0x8000000000008003
+    .quad   0x8000000000008002
+    .quad   0x8000000000000080
+    .quad   0x000000000000800a
+    .quad   0x800000008000000a
+    .quad   0x8000000080008081
+    .quad   0x8000000000008080
+    .quad   0x0000000080000001
+    .quad   0x8000000080008008
+.size   iotas,.-iotas
+
+.align  8
+byte_kmask_0_to_7:             # entry i (0..7) has its low i bits set
+    .byte   0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f
+
+.align  32
+shake_terminator_byte_x4:      # four qwords, each with 0x80 in its most significant byte
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+    .byte   0, 0, 0, 0, 0, 0, 0, 0x80
+
+.align  8
+    .byte   0, 0, 0, 0, 0, 0, 0, 0    # zero qword directly before shake_msg_pad_x4; presumably for reads at a negative offset - confirm with its users
+shake_msg_pad_x4:              # four qwords, each with the SHAKE pad byte 0x1F in its least significant byte
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+    .byte   0x1F, 0, 0, 0, 0, 0, 0, 0
+
+.asciz  "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+}}} else {{{
+
+# When AVX512VL is not available, output stub functions.
+# The capable function returns 0; the operation symbols are still defined, but only as stubs that trap (ud2) if called, so callers must use the C fallback.
+
+$code .= <<___;
+.text
+
+.globl  SHA3_avx512vl_capable
+.type   SHA3_avx512vl_capable,\@abi-omnipotent
+SHA3_avx512vl_capable:         # stub variant: always reports the AVX512VL path as unavailable
+    xor     %eax, %eax         # return value 0
+    ret
+.size   SHA3_avx512vl_capable, .-SHA3_avx512vl_capable
+
+.globl  SHA3_shake128_x4_inc_absorb_avx512vl
+.globl  SHA3_shake256_x4_inc_absorb_avx512vl
+.globl  SHA3_shake128_x4_inc_finalize_avx512vl
+.globl  SHA3_shake256_x4_inc_finalize_avx512vl
+.globl  SHA3_shake128_x4_inc_squeeze_avx512vl
+.globl  SHA3_shake256_x4_inc_squeeze_avx512vl
+.globl  SHA3_shake128_x4_avx512vl
+.globl  SHA3_shake256_x4_avx512vl
+.type   SHA3_shake128_x4_inc_absorb_avx512vl,\@abi-omnipotent # .type/.size are emitted for this first symbol only
+SHA3_shake128_x4_inc_absorb_avx512vl:   # all eight labels alias the single stub below
+SHA3_shake256_x4_inc_absorb_avx512vl:
+SHA3_shake128_x4_inc_finalize_avx512vl:
+SHA3_shake256_x4_inc_finalize_avx512vl:
+SHA3_shake128_x4_inc_squeeze_avx512vl:
+SHA3_shake256_x4_inc_squeeze_avx512vl:
+SHA3_shake128_x4_avx512vl:
+SHA3_shake256_x4_avx512vl:
+    .byte   0x0f,0x0b # ud2: trap if any of the operation stubs is ever called
+    ret               # follows the trap; not reached in normal flow
+.size   SHA3_shake128_x4_inc_absorb_avx512vl, .-SHA3_shake128_x4_inc_absorb_avx512vl
+___
+}}}
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sha/build.info b/crypto/sha/build.info
index 457ac8d06ab7b..fd192a66dda4c 100644
--- a/crypto/sha/build.info
+++ b/crypto/sha/build.info
@@ -65,7 +65,7 @@ ENDIF
 $KECCAK1600ASM=keccak1600.c
 IF[{- !$disabled{asm} -}]
   $KECCAK1600ASM_x86=
-  $KECCAK1600ASM_x86_64=keccak1600-x86_64.s
+  $KECCAK1600ASM_x86_64=keccak1600-x86_64.s keccak1600x4-avx512vl.s
 
   $KECCAK1600ASM_s390x=keccak1600-s390x.S
 
@@ -83,8 +83,8 @@ IF[{- !$disabled{asm} -}]
 ENDIF
 
 $COMMON=sha1dgst.c sha256.c sha512.c sha3.c sha3_encode.c $SHA1ASM $KECCAK1600ASM
-SOURCE[../../libcrypto]=$COMMON sha1_one.c
-SOURCE[../../providers/libfips.a]= $COMMON
+SOURCE[../../libcrypto]=$COMMON sha1_one.c sha3_x4.c
+SOURCE[../../providers/libfips.a]= $COMMON sha3_x4.c
 
 # Implementations are now spread across several libraries, so the defines
 # need to be applied to all affected libraries and modules.
@@ -198,4 +198,8 @@ GENERATE[keccak1600-avx512vl.S]=asm/keccak1600-avx512vl.pl
 GENERATE[keccak1600-mmx.S]=asm/keccak1600-mmx.pl
 GENERATE[keccak1600p8-ppc.S]=asm/keccak1600p8-ppc.pl
 
+# keccak1600x4-avx512vl.s supports multi-squeeze
+# Currently only used in ML-DSA on x86_64 with AVX-512VL support
+GENERATE[keccak1600x4-avx512vl.s]=asm/keccak1600x4-avx512vl.pl
+
 GENERATE[sha1-thumb.S]=asm/sha1-thumb.pl
diff --git a/crypto/sha/sha3_x4.c b/crypto/sha/sha3_x4.c
new file mode 100644
index 0000000000000..1d993c326c0a5
--- /dev/null
+++ b/crypto/sha/sha3_x4.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * SHAKE x4 multi-buffer implementation for AVX-512VL
+ *
+ * This file provides incremental API wrappers around the AVX-512VL
+ * assembly implementations for processing 4 SHAKE instances in parallel.
+ *
+ * Callers should check SHA3_avx512vl_capable() before calling.
+ */
+
+#include "internal/sha3.h"
+#include <string.h>
+
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+
+/* External assembly function declarations */
+extern void SHA3_shake128_x4_inc_absorb_avx512vl(
+    uint64_t *state,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+extern void SHA3_shake256_x4_inc_absorb_avx512vl(
+    uint64_t *state,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+extern void SHA3_shake128_x4_inc_finalize_avx512vl(uint64_t *state);
+extern void SHA3_shake256_x4_inc_finalize_avx512vl(uint64_t *state);
+
+extern void SHA3_shake128_x4_inc_squeeze_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    uint64_t *state);
+
+extern void SHA3_shake256_x4_inc_squeeze_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    uint64_t *state);
+
+/* One-shot assembly function declarations */
+extern void SHA3_shake128_x4_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+extern void SHA3_shake256_x4_avx512vl(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+/*
+ * SHAKE-128 x4 Implementation
+ */
+
+void ossl_sha3_shake128_x4_inc_init(KECCAK1600_X4_CTX *ctx)
+{
+    ctx->finalized = 0; /* fresh context: still in the absorbing phase */
+    ctx->rate = (1600 - 2 * 128) / 8; /* 168-byte SHAKE-128 rate */
+    memset(ctx->A, 0, sizeof(ctx->A)); /* clear all lanes and the counter */
+}
+
+void ossl_sha3_shake128_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen)
+{
+    /*
+     * Input is accepted only while the context is still absorbing; after
+     * finalization the call is silently ignored.
+     */
+    if (!ctx->finalized)
+        SHA3_shake128_x4_inc_absorb_avx512vl(ctx->A,
+            in0, in1, in2, in3, inlen);
+}
+
+void ossl_sha3_shake128_x4_inc_finalize(KECCAK1600_X4_CTX *ctx)
+{
+    const int pending = !ctx->finalized;
+
+    if (pending) { /* finalize at most once; repeat calls are no-ops */
+        SHA3_shake128_x4_inc_finalize_avx512vl(ctx->A);
+        ctx->finalized = 1;
+    }
+}
+
+void ossl_sha3_shake128_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx)
+{
+    uint64_t *state = ctx->A;
+
+    /* A squeeze on a context that is still absorbing finalizes it first. */
+    if (ctx->finalized == 0)
+        ossl_sha3_shake128_x4_inc_finalize(ctx);
+    SHA3_shake128_x4_inc_squeeze_avx512vl(out0, out1, out2, out3,
+        outlen, state);
+}
+
+/*
+ * SHAKE-256 x4 Implementation
+ */
+
+void ossl_sha3_shake256_x4_inc_init(KECCAK1600_X4_CTX *ctx)
+{
+    ctx->finalized = 0; /* fresh context: still in the absorbing phase */
+    ctx->rate = (1600 - 2 * 256) / 8; /* 136-byte SHAKE-256 rate */
+    memset(ctx->A, 0, sizeof(ctx->A)); /* clear all lanes and the counter */
+}
+
+void ossl_sha3_shake256_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen)
+{
+    /*
+     * Input is accepted only while the context is still absorbing; after
+     * finalization the call is silently ignored.
+     */
+    if (!ctx->finalized)
+        SHA3_shake256_x4_inc_absorb_avx512vl(ctx->A,
+            in0, in1, in2, in3, inlen);
+}
+
+void ossl_sha3_shake256_x4_inc_finalize(KECCAK1600_X4_CTX *ctx)
+{
+    const int pending = !ctx->finalized;
+
+    if (pending) { /* finalize at most once; repeat calls are no-ops */
+        SHA3_shake256_x4_inc_finalize_avx512vl(ctx->A);
+        ctx->finalized = 1;
+    }
+}
+
+void ossl_sha3_shake256_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx)
+{
+    uint64_t *state = ctx->A;
+
+    /* A squeeze on a context that is still absorbing finalizes it first. */
+    if (ctx->finalized == 0)
+        ossl_sha3_shake256_x4_inc_finalize(ctx);
+    SHA3_shake256_x4_inc_squeeze_avx512vl(out0, out1, out2, out3,
+        outlen, state);
+}
+
+/*
+ * Single-call wrapper APIs
+ */
+
+void ossl_sha3_shake128_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen, /* one length shared by all four outputs */
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen) /* one length shared by all four inputs */
+{
+    SHA3_shake128_x4_avx512vl(out0, out1, out2, out3, outlen,
+        in0, in1, in2, in3, inlen); /* arguments forwarded unchanged */
+}
+
+void ossl_sha3_shake256_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen, /* one length shared by all four outputs */
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen) /* one length shared by all four inputs */
+{
+    SHA3_shake256_x4_avx512vl(out0, out1, out2, out3, outlen,
+        in0, in1, in2, in3, inlen); /* arguments forwarded unchanged */
+}
+
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
diff --git a/include/internal/sha3.h b/include/internal/sha3.h
index f91d00a74f838..82a7ec158b1a7 100644
--- a/include/internal/sha3.h
+++ b/include/internal/sha3.h
@@ -65,4 +65,75 @@ int ossl_shake_squeeze_default(KECCAK1600_CTX *ctx, unsigned char *out, size_t o
 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
     size_t r);
 
+/* Multi-buffer (x4) Keccak-f[1600] context and API */
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+
+/* Runtime capability check for AVX512VL */
+int SHA3_avx512vl_capable(void);
+
+/* Context for 4-way parallel SHAKE operations (one state for all lanes) */
+typedef struct {
+    /* A[0..99]: 4 interleaved Keccak states (800 bytes); A[100]: 8 bytes
+       holding the number of already absorbed or not yet squeezed bytes
+       (the assembly routines store it as s[100]) */
+    uint64_t A[(25 * 4) + 1];
+    size_t rate; /* Rate in bytes: 168 (SHAKE-128) or 136 (SHAKE-256) */
+    unsigned finalized; /* 1 once *_inc_finalize has run, 0 before that */
+} KECCAK1600_X4_CTX;
+
+/* SHAKE-128 x4 incremental API */
+void ossl_sha3_shake128_x4_inc_init(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake128_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+void ossl_sha3_shake128_x4_inc_finalize(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake128_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx);
+
+/* SHAKE-256 x4 incremental API */
+void ossl_sha3_shake256_x4_inc_init(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake256_x4_inc_absorb(
+    KECCAK1600_X4_CTX *ctx,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+void ossl_sha3_shake256_x4_inc_finalize(KECCAK1600_X4_CTX *ctx);
+
+void ossl_sha3_shake256_x4_inc_squeeze(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    KECCAK1600_X4_CTX *ctx);
+
+/* Single-call SHAKE x4 APIs (wrapper functions) */
+void ossl_sha3_shake128_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+void ossl_sha3_shake256_x4(
+    void *out0, void *out1,
+    void *out2, void *out3,
+    size_t outlen,
+    const void *in0, const void *in1,
+    const void *in2, const void *in3,
+    size_t inlen);
+
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
+
 #endif /* OSSL_INTERNAL_SHA3_H */
diff --git a/test/build.info b/test/build.info
index f599b3aff8c61..d6a36ba9ba035 100644
--- a/test/build.info
+++ b/test/build.info
@@ -915,6 +915,8 @@ IF[{- !$disabled{tests} -}]
       PROGRAMS{noinst}=cmactest
     ENDIF
 
+    PROGRAMS{noinst}=sha3_x4_internal_test
+
     SOURCE[poly1305_internal_test]=poly1305_internal_test.c
     INCLUDE[poly1305_internal_test]=.. ../include ../apps/include
     DEPEND[poly1305_internal_test]=../libcrypto.a libtestutil.a
@@ -923,6 +925,10 @@ IF[{- !$disabled{tests} -}]
     INCLUDE[chacha_internal_test]=.. ../include ../apps/include
     DEPEND[chacha_internal_test]=../libcrypto.a libtestutil.a
 
+    SOURCE[sha3_x4_internal_test]=sha3_x4_internal_test.c
+    INCLUDE[sha3_x4_internal_test]=.. ../include ../apps/include
+    DEPEND[sha3_x4_internal_test]=../libcrypto.a libtestutil.a
+
     SOURCE[asn1_internal_test]=asn1_internal_test.c
     INCLUDE[asn1_internal_test]=.. ../include ../apps/include
     DEPEND[asn1_internal_test]=../libcrypto.a libtestutil.a
diff --git a/test/recipes/03-test_sha3_x4_internal.t b/test/recipes/03-test_sha3_x4_internal.t
new file mode 100644
index 0000000000000..9e5793aaf3cd3
--- /dev/null
+++ b/test/recipes/03-test_sha3_x4_internal.t
@@ -0,0 +1,16 @@
+#! /usr/bin/env perl
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+use strict;
+use OpenSSL::Test;
+use OpenSSL::Test::Simple;
+
+setup("test_sha3_x4_internal"); # recipe name handed to the test framework
+
+simple_test("test_sha3_x4_internal", "sha3_x4_internal_test"); # presumably runs that test program as the only test - see OpenSSL::Test::Simple
diff --git a/test/sha3_x4_internal_test.c b/test/sha3_x4_internal_test.c
new file mode 100644
index 0000000000000..e387b6f51ae46
--- /dev/null
+++ b/test/sha3_x4_internal_test.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Internal cross-validation tests for the SHAKE x4 multi-buffer API.
+ *
+ * Each test computes SHAKE-128 or SHAKE-256 on four independent inputs
+ * using the x4 (AVX-512VL) path and compares every lane's output to the
+ * equivalent result produced by the scalar ossl_sha3_* API.
+ *
+ * Tests cover:
+ *   - Single-call (ossl_sha3_shake{128,256}_x4) for many (inlen, outlen) pairs
+ *   - Incremental init/absorb/squeeze for the same (inlen, outlen) pairs
+ *   - Multi-absorb: input split in two at each tested input size
+ *   - Multi-squeeze: output produced in two successive squeeze calls
+ */
+
+#include <string.h>
+#include "testutil.h"
+
+/*
+ * KECCAK1600_ASM is only added to the library compilation flags by the build
+ * system, not to test binaries.  Since the x4 declarations in internal/sha3.h
+ * are guarded by that macro, we define it here before the include so that the
+ * KECCAK1600_X4_CTX type and function prototypes are visible.  The symbols
+ * themselves live in libcrypto.a which is always compiled with the flag set.
+ * We additionally gate all x4 code on x86_64 (GCC/Clang: __x86_64__,
+ * MSVC: _M_AMD64/_M_X64) and !OPENSSL_NO_ASM so that the test still
+ * compiles on other platforms or in no-asm builds.
+ */
+#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+#ifndef KECCAK1600_ASM
+#define KECCAK1600_ASM
+#endif
+#endif
+#include "internal/sha3.h"
+
+/*
+ * A single deterministic 1024-byte message.  Each of the four lanes receives
+ * a different slice of this buffer, with lane base pointers spaced 64 bytes
+ * apart, so their inputs are distinct yet entirely self-contained.
+ */
+#define MSG_BUF_SIZE 1024
+#define LANE_STRIDE 64 /* byte offset between lane base pointers */
+#define NUM_LANES 4
+
+static unsigned char msg[MSG_BUF_SIZE];
+
+/* Maximum output length used in this file – must fit chunk1 + chunk2. */
+#define MAX_OUT 640
+
+#if defined(KECCAK1600_ASM)                                                               \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) \
+    && !defined(OPENSSL_NO_ASM)
+
+/*
+ * Input lengths exercising: empty, tiny, sub-block, block boundary ±1,
+ * multiple blocks and a longer message for SHAKE-128 (rate=168) and
+ * SHAKE-256 (rate=136).
+ */
+static const size_t input_sizes[] = {
+    0, 1, 17, 100, 135, 136, 137, 168, 169, 200, 400
+};
+#define NUM_INPUT_SIZES (sizeof(input_sizes) / sizeof(input_sizes[0]))
+
+/* Output lengths chosen to straddle rate boundaries for both variants. */
+static const size_t output_sizes[] = {
+    16, 32, 64, 136, 168, 256, 512
+};
+#define NUM_OUTPUT_SIZES (sizeof(output_sizes) / sizeof(output_sizes[0]))
+
+/* Helper functions */
+
+/*
+ * Reference digest: scalar SHAKE-|bitlen| (128 or 256) of in[0..inlen),
+ * written to out[0..outlen).  Returns 1 on success, 0 on failure.
+ */
+static int scalar_shake(const unsigned int bitlen,
+    const unsigned char *in, const size_t inlen,
+    unsigned char *out, const size_t outlen)
+{
+    KECCAK1600_CTX ref;
+
+    if (ossl_sha3_init(&ref, 0x1f, bitlen) == 0)
+        return 0;
+    /* The init call leaves the method table unset; install the defaults. */
+    ref.meth.squeeze = ossl_shake_squeeze_default;
+    ref.meth.final = ossl_sha3_final_default;
+    ref.meth.absorb = ossl_sha3_absorb_default;
+    return !ossl_sha3_absorb(&ref, in, inlen) ? 0
+        : ossl_sha3_squeeze(&ref, out, outlen) != 0;
+}
+
+/* Split n = inlen_idx * NUM_OUTPUT_SIZES + outlen_idx into its two lengths. */
+static void decode_idx(const int n, size_t *inlen, size_t *outlen)
+{
+    const int row = n / (int)NUM_OUTPUT_SIZES; /* index into input_sizes */
+    const int col = n % (int)NUM_OUTPUT_SIZES; /* index into output_sizes */
+
+    *inlen = input_sizes[row];
+    *outlen = output_sizes[col];
+}
+
+/* One-shot tests */
+
+static int test_shake_x4_oneshot(const unsigned int bitlen, const int n)
+{
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    unsigned char x4_out[NUM_LANES][MAX_OUT];
+    const unsigned char *in[NUM_LANES];
+    size_t inlen, outlen;
+    int i;
+
+    decode_idx(n, &inlen, &outlen);
+
+    /* Lane i reads its input starting i * LANE_STRIDE bytes into msg. */
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = &msg[i * LANE_STRIDE];
+
+    /* Reject sizes that would overrun msg or the output buffers. */
+    if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE)
+        || !TEST_size_t_le(outlen, MAX_OUT))
+        return 0;
+
+    /* Candidate: all four lanes in one call to the x4 routine. */
+    if (bitlen != 128)
+        ossl_sha3_shake256_x4(x4_out[0], x4_out[1], x4_out[2], x4_out[3],
+            outlen,
+            in[0], in[1], in[2], in[3], inlen);
+    else
+        ossl_sha3_shake128_x4(x4_out[0], x4_out[1], x4_out[2], x4_out[3],
+            outlen,
+            in[0], in[1], in[2], in[3], inlen);
+
+    /* Reference: every lane hashed on its own by the scalar code. */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], outlen)))
+            return 0;
+
+    /* Each candidate lane must equal its reference. */
+    for (i = 0; i < NUM_LANES; i++) {
+        if (TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen))
+            continue;
+        TEST_info("SHAKE-%u x4 oneshot lane %d: inlen=%zu outlen=%zu",
+            bitlen, i, inlen, outlen);
+        return 0;
+    }
+    return 1;
+}
+
+static int test_shake128_x4_oneshot(const int n)
+{
+    return test_shake_x4_oneshot(128, n); /* SHAKE-128, case n of decode_idx */
+}
+
+static int test_shake256_x4_oneshot(const int n)
+{
+    return test_shake_x4_oneshot(256, n); /* SHAKE-256, case n of decode_idx */
+}
+
+/* Incremental (init / absorb / finalize / squeeze) tests */
+
+static int test_shake_x4_incremental(const unsigned int bitlen, const int n)
+{
+    size_t inlen, outlen;
+    const unsigned char *in[NUM_LANES];
+    unsigned char x4_out[NUM_LANES][MAX_OUT];
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    KECCAK1600_X4_CTX ctx;
+    int i;
+
+    decode_idx(n, &inlen, &outlen);
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = msg + i * LANE_STRIDE;
+
+    if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE)
+        || !TEST_size_t_le(outlen, MAX_OUT)) /* x4_out/ref_out capacity */
+        return 0;
+
+    /* x4 incremental: init, one absorb, explicit finalize, one squeeze */
+    if (bitlen == 128) {
+        ossl_sha3_shake128_x4_inc_init(&ctx);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake128_x4_inc_finalize(&ctx);
+        ossl_sha3_shake128_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    } else {
+        ossl_sha3_shake256_x4_inc_init(&ctx);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake256_x4_inc_finalize(&ctx);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    }
+
+    /* scalar reference: each lane hashed independently */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], outlen)))
+            return 0;
+
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen)) {
+            TEST_info("SHAKE-%u x4 incremental lane %d: inlen=%zu outlen=%zu",
+                bitlen, i, inlen, outlen);
+            return 0;
+        }
+    return 1;
+}
+
+static int test_shake128_x4_incremental(const int n)
+{
+    return test_shake_x4_incremental(128, n); /* SHAKE-128, case n */
+}
+
+static int test_shake256_x4_incremental(const int n)
+{
+    return test_shake_x4_incremental(256, n); /* SHAKE-256, case n */
+}
+
+/* Multi-absorb tests */
+
+/*
+ * Absorb one message in two calls and compare with a scalar reference that
+ * absorbs it in one go.  The first call takes input_sizes[n] bytes, giving
+ * sub-block, block-aligned and multi-block split points.
+ *
+ * The message length is the last (largest) entry of input_sizes[], so every
+ * split length taken from that table is usable.
+ */
+static int test_shake_x4_multi_absorb(const unsigned int bitlen, const int n)
+{
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    unsigned char x4_out[NUM_LANES][MAX_OUT];
+    const unsigned char *in[NUM_LANES];
+    KECCAK1600_X4_CTX ctx;
+    const size_t outlen = 64; /* every case squeezes the same amount */
+    const size_t total = input_sizes[NUM_INPUT_SIZES - 1];
+    const size_t head = input_sizes[n]; /* bytes given to the first absorb */
+    int i;
+
+    if (head > total)
+        return 1; /* split point beyond the message: nothing to test */
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = &msg[i * LANE_STRIDE];
+
+    if (!TEST_size_t_le(total + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE))
+        return 0;
+
+    /* x4 path: absorb head, then the tail; the squeeze finalizes implicitly */
+    if (bitlen != 128) {
+        ossl_sha3_shake256_x4_inc_init(&ctx);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx,
+            in[0], in[1], in[2], in[3], head);
+        ossl_sha3_shake256_x4_inc_absorb(&ctx,
+            in[0] + head, in[1] + head, in[2] + head, in[3] + head,
+            total - head);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    } else {
+        ossl_sha3_shake128_x4_inc_init(&ctx);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx,
+            in[0], in[1], in[2], in[3], head);
+        ossl_sha3_shake128_x4_inc_absorb(&ctx,
+            in[0] + head, in[1] + head, in[2] + head, in[3] + head,
+            total - head);
+        ossl_sha3_shake128_x4_inc_squeeze(x4_out[0], x4_out[1],
+            x4_out[2], x4_out[3], outlen, &ctx);
+    }
+
+    /* reference path: the whole message in a single scalar absorb */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], total, ref_out[i], outlen)))
+            return 0;
+
+    for (i = 0; i < NUM_LANES; i++) {
+        if (TEST_mem_eq(x4_out[i], outlen, ref_out[i], outlen))
+            continue;
+        TEST_info("SHAKE-%u x4 multi-absorb lane %d: total=%zu split=%zu",
+            bitlen, i, total, head);
+        return 0;
+    }
+    return 1;
+}
+
+static int test_shake128_x4_multi_absorb(const int n)
+{
+    return test_shake_x4_multi_absorb(128, n); /* SHAKE-128; n passed as is */
+}
+
+static int test_shake256_x4_multi_absorb(const int n)
+{
+    return test_shake_x4_multi_absorb(256, n); /* SHAKE-256; n passed as is */
+}
+
+/* Multi-squeeze tests */
+
+/*
+ * Multi-squeeze check for the 4-way SHAKE-<bitlen> incremental API.
+ *
+ * A fixed 200-byte message per lane is absorbed once and the output is then
+ * squeezed in two calls: output_sizes[n] bytes followed by 64 bytes.  Both
+ * pieces of every lane must match the corresponding part of one
+ * scalar_shake() output of the combined length.  Returns 1 on success, else 0.
+ */
+static int test_shake_x4_multi_squeeze(const unsigned int bitlen, const int n)
+{
+    const size_t inlen = 200; /* same message length for every case */
+    const size_t chunk1 = output_sizes[n];
+    const size_t chunk2 = 64;
+    const size_t total = chunk1 + chunk2;
+    const unsigned char *in[NUM_LANES];
+    unsigned char x4_a[NUM_LANES][MAX_OUT]; /* output of the 1st squeeze */
+    unsigned char x4_b[NUM_LANES][MAX_OUT]; /* output of the 2nd squeeze */
+    unsigned char ref_out[NUM_LANES][MAX_OUT];
+    KECCAK1600_X4_CTX state;
+    int i;
+
+    if (!TEST_size_t_le(total, MAX_OUT))
+        return 0;
+    if (!TEST_size_t_le(inlen + (NUM_LANES - 1) * LANE_STRIDE, MSG_BUF_SIZE))
+        return 0;
+
+    for (i = 0; i < NUM_LANES; i++)
+        in[i] = msg + i * LANE_STRIDE;
+
+    /* x4 path: absorb once, then squeeze twice from the same context */
+    if (bitlen == 128) {
+        ossl_sha3_shake128_x4_inc_init(&state);
+        ossl_sha3_shake128_x4_inc_absorb(&state, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake128_x4_inc_squeeze(x4_a[0], x4_a[1], x4_a[2], x4_a[3],
+            chunk1, &state);
+        ossl_sha3_shake128_x4_inc_squeeze(x4_b[0], x4_b[1], x4_b[2], x4_b[3],
+            chunk2, &state);
+    } else {
+        ossl_sha3_shake256_x4_inc_init(&state);
+        ossl_sha3_shake256_x4_inc_absorb(&state, in[0], in[1], in[2], in[3],
+            inlen);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_a[0], x4_a[1], x4_a[2], x4_a[3],
+            chunk1, &state);
+        ossl_sha3_shake256_x4_inc_squeeze(x4_b[0], x4_b[1], x4_b[2], x4_b[3],
+            chunk2, &state);
+    }
+
+    /* scalar reference output of `total` bytes per lane */
+    for (i = 0; i < NUM_LANES; i++)
+        if (!TEST_true(scalar_shake(bitlen, in[i], inlen, ref_out[i], total)))
+            return 0;
+
+    /* 1st piece against the reference head, 2nd against what follows it */
+    for (i = 0; i < NUM_LANES; i++) {
+        if (!TEST_mem_eq(x4_a[i], chunk1, ref_out[i], chunk1)) {
+            TEST_info("SHAKE-%u x4 multi-squeeze lane %d chunk1: "
+                      "inlen=%zu chunk1=%zu chunk2=%zu",
+                bitlen, i, inlen, chunk1, chunk2);
+            return 0;
+        }
+        if (!TEST_mem_eq(x4_b[i], chunk2, ref_out[i] + chunk1, chunk2)) {
+            TEST_info("SHAKE-%u x4 multi-squeeze lane %d chunk2: "
+                      "inlen=%zu chunk1=%zu chunk2=%zu",
+                bitlen, i, inlen, chunk1, chunk2);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int test_shake128_x4_multi_squeeze(const int n)
+{
+    return test_shake_x4_multi_squeeze(128, n); /* SHAKE-128; n passed as is */
+}
+
+static int test_shake256_x4_multi_squeeze(const int n)
+{
+    return test_shake_x4_multi_squeeze(256, n); /* SHAKE-256; n passed as is */
+}
+
+#endif /* KECCAK1600_ASM && x86_64 && !OPENSSL_NO_ASM */
+
+/*
+ * Test entry point: fills msg[] and registers the SHAKE x4 test cases, or
+ * returns TEST_skip() when the build lacks the x4 code or
+ * SHA3_avx512vl_capable() reports false.
+ */
+int setup_tests(void)
+{
+    size_t i;
+
+    /* Deterministic, non-constant byte pattern for the lane messages. */
+    for (i = 0; i < MSG_BUF_SIZE; i++)
+        msg[i] = (unsigned char)(251 * i + 17);
+
+#ifdef OPENSSL_CPUID_OBJ
+    OPENSSL_cpuid_setup();
+#endif
+
+#if defined(KECCAK1600_ASM) && !defined(OPENSSL_NO_ASM) \
+    && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) \
+        || defined(_M_X64))
+    if (!SHA3_avx512vl_capable())
+        return TEST_skip("AVX-512VL not available; skipping SHAKE x4 tests");
+
+    /* one-shot, incremental: NUM_INPUT_SIZES * NUM_OUTPUT_SIZES cases each */
+    ADD_ALL_TESTS(test_shake128_x4_oneshot,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+    ADD_ALL_TESTS(test_shake256_x4_oneshot,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+    ADD_ALL_TESTS(test_shake128_x4_incremental,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+    ADD_ALL_TESTS(test_shake256_x4_incremental,
+        (int)(NUM_INPUT_SIZES * NUM_OUTPUT_SIZES));
+    /* multi-absorb: one case per input size; multi-squeeze: per output size */
+    ADD_ALL_TESTS(test_shake128_x4_multi_absorb, (int)NUM_INPUT_SIZES);
+    ADD_ALL_TESTS(test_shake256_x4_multi_absorb, (int)NUM_INPUT_SIZES);
+    ADD_ALL_TESTS(test_shake128_x4_multi_squeeze, (int)NUM_OUTPUT_SIZES);
+    ADD_ALL_TESTS(test_shake256_x4_multi_squeeze, (int)NUM_OUTPUT_SIZES);
+    return 1;
+#else
+    return TEST_skip("SHAKE x4 API not available in this build");
+#endif
+}