diff --git a/.gitignore b/.gitignore
index 24f4ec9..0668edc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,5 @@ Network Trash Folder
 Temporary Items
 .apdisk
 .vscode/settings.json
+mlkem_native/test/test_mlkem768
+*.o
diff --git a/licenses.md b/licenses.md
index ce64699..98bd5dc 100644
--- a/licenses.md
+++ b/licenses.md
@@ -19,6 +19,17 @@ at your option.
 https://github.com/trustcrypto/libraries/blob/master/mbedtls-2.4.0/apache-2.0.txt
 https://www.apache.org/licenses/LICENSE-2.0
 
+## mlkem-native (ML-KEM / FIPS 203)
+https://github.com/pq-code-package/mlkem-native
+
+Licensed under your choice of:
+
+Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+ISC License (https://opensource.org/licenses/ISC)
+MIT License (https://opensource.org/licenses/MIT)
+
+Post-Quantum Cryptography Alliance, a project of the Linux Foundation.
+
 ## Base64, Sha1, Sha256
 
 https://github.com/B-Con/crypto-algorithms
diff --git a/mlkem_native/mlkem_native.c b/mlkem_native/mlkem_native.c
new file mode 100644
index 0000000..a00697e
--- /dev/null
+++ b/mlkem_native/mlkem_native.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mlkem-native repository.
+ *          Do not modify it directly.
+ */
+
+/******************************************************************************
+ *
+ * Single compilation unit (SCU) for fixed-level build of mlkem-native
+ *
+ * This compilation unit bundles together all source files for a build
+ * of mlkem-native for a fixed security level (MLKEM-512/768/1024).
+ *
+ * # API
+ *
+ * The API exposed by this file is described in mlkem_native.h.
+ *
+ * # Multi-level build
+ *
+ * If you want an SCU build of mlkem-native with support for multiple security
+ * levels, you need to include this file multiple times, and set
+ * MLK_CONFIG_MULTILEVEL_WITH_SHARED and MLK_CONFIG_MULTILEVEL_NO_SHARED
+ * appropriately. This is exemplified in examples/monolithic_build_multilevel
+ * and examples/monolithic_build_multilevel_native.
+ *
+ * # Configuration
+ *
+ * The following options from the mlkem-native configuration are relevant:
+ *
+ * - MLK_CONFIG_FIPS202_CUSTOM_HEADER
+ *   Set this option if you use a custom FIPS202 implementation.
+ *
+ * - MLK_CONFIG_USE_NATIVE_BACKEND_ARITH
+ *   Set this option if you want to include the native arithmetic backends
+ *   in your build.
+ *
+ * - MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *   Set this option if you want to include the native FIPS202 backends
+ *   in your build.
+ *
+ * - MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
+ *   Set this option if you want to keep the directives defined in
+ *   level-independent headers. This is needed for a multi-level build.
+ */
+
+/* If parts of the mlkem-native source tree are not used,
+ * consider reducing this file via `unifdef`.
+ *
+ * Example:
+ * ```bash
+ * unifdef -UMLK_CONFIG_USE_NATIVE_BACKEND_ARITH mlkem_native.c
+ * ```
+ */
+
+#include "src/common.h"
+
+#include "src/compress.c"
+#include "src/debug.c"
+#include "src/indcpa.c"
+#include "src/kem.c"
+#include "src/poly.c"
+#include "src/poly_k.c"
+#include "src/sampling.c"
+#include "src/verify.c"
+
+#if !defined(MLK_CONFIG_FIPS202_CUSTOM_HEADER)
+#include "src/fips202/fips202.c"
+#include "src/fips202/fips202x4.c"
+#include "src/fips202/keccakf1600.c"
+#endif
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLK_SYS_AARCH64)
+#include "src/native/aarch64/src/aarch64_zetas.c"
+#include "src/native/aarch64/src/rej_uniform_table.c"
+#endif
+#if defined(MLK_SYS_X86_64)
+#include "src/native/x86_64/src/compress_consts.c"
+#include "src/native/x86_64/src/consts.c"
+#include "src/native/x86_64/src/rej_uniform_table.c"
+#endif
+#if defined(MLK_SYS_RISCV64)
+#include "src/native/riscv64/src/rv64v_debug.c"
+#include "src/native/riscv64/src/rv64v_poly.c"
+#endif
+#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLK_SYS_AARCH64)
+#include "src/fips202/native/aarch64/src/keccakf1600_round_constants.c"
+#endif
+#if defined(MLK_SYS_X86_64)
+#include "src/fips202/native/x86_64/src/keccakf1600_constants.c"
+#endif
+#if defined(MLK_SYS_ARMV81M_MVE)
+#include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c"
+#include "src/fips202/native/armv81m/src/keccakf1600_round_constants.c"
+#endif
+#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+
+/* Macro #undef's
+ *
+ * The following undefines macros from headers
+ * included by the source files imported above.
+ *
+ * This is to allow building and linking multiple builds
+ * of mlkem-native for varying parameter sets through concatenation
+ * of this file, as if the files had been compiled separately.
+ * If this is not relevant to you, you may remove the following.
+ */
+
+/*
+ * Undefine macros from MLK_CONFIG_PARAMETER_SET-specific files
+ */
+/* mlkem/mlkem_native.h */
+#undef CRYPTO_BYTES
+#undef CRYPTO_CIPHERTEXTBYTES
+#undef CRYPTO_PUBLICKEYBYTES
+#undef CRYPTO_SECRETKEYBYTES
+#undef CRYPTO_SYMBYTES
+#undef MLKEM1024_BYTES
+#undef MLKEM1024_CIPHERTEXTBYTES
+#undef MLKEM1024_PUBLICKEYBYTES
+#undef MLKEM1024_SECRETKEYBYTES
+#undef MLKEM1024_SYMBYTES
+#undef MLKEM512_BYTES
+#undef MLKEM512_CIPHERTEXTBYTES
+#undef MLKEM512_PUBLICKEYBYTES
+#undef MLKEM512_SECRETKEYBYTES
+#undef MLKEM512_SYMBYTES
+#undef MLKEM768_BYTES
+#undef MLKEM768_CIPHERTEXTBYTES
+#undef MLKEM768_PUBLICKEYBYTES
+#undef MLKEM768_SECRETKEYBYTES
+#undef MLKEM768_SYMBYTES
+#undef MLKEM_BYTES
+#undef MLKEM_CIPHERTEXTBYTES
+#undef MLKEM_CIPHERTEXTBYTES_
+#undef MLKEM_PUBLICKEYBYTES
+#undef MLKEM_PUBLICKEYBYTES_
+#undef MLKEM_SECRETKEYBYTES
+#undef MLKEM_SECRETKEYBYTES_
+#undef MLKEM_SYMBYTES
+#undef MLK_API_CONCAT
+#undef MLK_API_CONCAT_
+#undef MLK_API_CONCAT_UNDERSCORE
+#undef MLK_API_LEGACY_CONFIG
+#undef MLK_API_MUST_CHECK_RETURN_VALUE
+#undef MLK_API_NAMESPACE
+#undef MLK_API_QUALIFIER
+#undef MLK_CONFIG_API_CONSTANTS_ONLY
+#undef MLK_CONFIG_API_NAMESPACE_PREFIX
+#undef MLK_CONFIG_API_NO_SUPERCOP
+#undef MLK_CONFIG_API_PARAMETER_SET
+#undef MLK_CONFIG_API_QUALIFIER
+#undef MLK_ERR_FAIL
+#undef MLK_ERR_OUT_OF_MEMORY
+#undef MLK_ERR_RNG_FAIL
+#undef MLK_H
+#undef MLK_MAX3_
+#undef MLK_TOTAL_ALLOC_1024
+#undef MLK_TOTAL_ALLOC_1024_DECAPS
+#undef MLK_TOTAL_ALLOC_1024_ENCAPS
+#undef MLK_TOTAL_ALLOC_1024_KEYPAIR
+#undef MLK_TOTAL_ALLOC_1024_KEYPAIR_NO_PCT
+#undef MLK_TOTAL_ALLOC_1024_KEYPAIR_PCT
+#undef MLK_TOTAL_ALLOC_512
+#undef MLK_TOTAL_ALLOC_512_DECAPS
+#undef MLK_TOTAL_ALLOC_512_ENCAPS
+#undef MLK_TOTAL_ALLOC_512_KEYPAIR
+#undef MLK_TOTAL_ALLOC_512_KEYPAIR_NO_PCT
+#undef MLK_TOTAL_ALLOC_512_KEYPAIR_PCT
+#undef MLK_TOTAL_ALLOC_768
+#undef MLK_TOTAL_ALLOC_768_DECAPS
+#undef MLK_TOTAL_ALLOC_768_ENCAPS
+#undef MLK_TOTAL_ALLOC_768_KEYPAIR
+#undef MLK_TOTAL_ALLOC_768_KEYPAIR_NO_PCT
+#undef MLK_TOTAL_ALLOC_768_KEYPAIR_PCT
+#undef crypto_kem_check_pk
+#undef crypto_kem_check_sk
+#undef crypto_kem_dec
+#undef crypto_kem_enc
+#undef crypto_kem_enc_derand
+#undef crypto_kem_keypair
+#undef crypto_kem_keypair_derand
+/* mlkem/src/common.h */
+#undef MLK_ADD_PARAM_SET
+#undef MLK_ALLOC
+#undef MLK_APPLY
+#undef MLK_ASM_FN_SIZE
+#undef MLK_ASM_FN_SYMBOL
+#undef MLK_ASM_NAMESPACE
+#undef MLK_BUILD_INTERNAL
+#undef MLK_COMMON_H
+#undef MLK_CONCAT
+#undef MLK_CONCAT_
+#undef MLK_CONTEXT_PARAMETERS_0
+#undef MLK_CONTEXT_PARAMETERS_1
+#undef MLK_CONTEXT_PARAMETERS_2
+#undef MLK_CONTEXT_PARAMETERS_3
+#undef MLK_CONTEXT_PARAMETERS_4
+#undef MLK_EMPTY_CU
+#undef MLK_ERR_FAIL
+#undef MLK_ERR_OUT_OF_MEMORY
+#undef MLK_ERR_RNG_FAIL
+#undef MLK_EXTERNAL_API
+#undef MLK_FIPS202X4_HEADER_FILE
+#undef MLK_FIPS202_HEADER_FILE
+#undef MLK_FREE
+#undef MLK_INTERNAL_API
+#undef MLK_NAMESPACE
+#undef MLK_NAMESPACE_K
+#undef MLK_NAMESPACE_PREFIX
+#undef MLK_NAMESPACE_PREFIX_K
+#undef mlk_memcpy
+#undef mlk_memset
+/* mlkem/src/indcpa.h */
+#undef MLK_INDCPA_H
+#undef mlk_gen_matrix
+#undef mlk_indcpa_dec
+#undef mlk_indcpa_enc
+#undef mlk_indcpa_keypair_derand
+/* mlkem/src/kem.h */
+#undef MLK_KEM_H
+#undef mlk_kem_check_pk
+#undef mlk_kem_check_sk
+#undef mlk_kem_dec
+#undef mlk_kem_enc
+#undef mlk_kem_enc_derand
+#undef mlk_kem_keypair
+#undef mlk_kem_keypair_derand
+/* mlkem/src/params.h */
+#undef MLKEM_DU
+#undef MLKEM_DV
+#undef MLKEM_ETA1
+#undef MLKEM_ETA2
+#undef MLKEM_INDCCA_CIPHERTEXTBYTES
+#undef MLKEM_INDCCA_PUBLICKEYBYTES
+#undef MLKEM_INDCCA_SECRETKEYBYTES
+#undef MLKEM_INDCPA_BYTES
+#undef MLKEM_INDCPA_MSGBYTES
+#undef MLKEM_INDCPA_PUBLICKEYBYTES
+#undef MLKEM_INDCPA_SECRETKEYBYTES
+#undef MLKEM_K
+#undef MLKEM_N
+#undef MLKEM_POLYBYTES
+#undef MLKEM_POLYCOMPRESSEDBYTES_D10
+#undef MLKEM_POLYCOMPRESSEDBYTES_D11
+#undef MLKEM_POLYCOMPRESSEDBYTES_D4
+#undef MLKEM_POLYCOMPRESSEDBYTES_D5
+#undef MLKEM_POLYCOMPRESSEDBYTES_DU
+#undef MLKEM_POLYCOMPRESSEDBYTES_DV
+#undef MLKEM_POLYVECBYTES
+#undef MLKEM_POLYVECCOMPRESSEDBYTES_DU
+#undef MLKEM_Q
+#undef MLKEM_Q_HALF
+#undef MLKEM_SSBYTES
+#undef MLKEM_SYMBYTES
+#undef MLKEM_UINT12_LIMIT
+#undef MLK_PARAMS_H
+/* mlkem/src/poly_k.h */
+#undef MLK_POLY_K_H
+#undef mlk_poly_compress_du
+#undef mlk_poly_compress_dv
+#undef mlk_poly_decompress_du
+#undef mlk_poly_decompress_dv
+#undef mlk_poly_getnoise_eta1122_4x
+#undef mlk_poly_getnoise_eta1_4x
+#undef mlk_poly_getnoise_eta2
+#undef mlk_poly_getnoise_eta2_4x
+#undef mlk_polymat
+#undef mlk_polyvec
+#undef mlk_polyvec_add
+#undef mlk_polyvec_basemul_acc_montgomery_cached
+#undef mlk_polyvec_compress_du
+#undef mlk_polyvec_decompress_du
+#undef mlk_polyvec_frombytes
+#undef mlk_polyvec_invntt_tomont
+#undef mlk_polyvec_mulcache
+#undef mlk_polyvec_mulcache_compute
+#undef mlk_polyvec_ntt
+#undef mlk_polyvec_reduce
+#undef mlk_polyvec_tobytes
+#undef mlk_polyvec_tomont
+
+#if !defined(MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS)
+/*
+ * Undefine macros from MLK_CONFIG_PARAMETER_SET-generic files
+ */
+/* mlkem/src/compress.h */
+#undef MLK_COMPRESS_H
+#undef mlk_poly_compress_d10
+#undef mlk_poly_compress_d11
+#undef mlk_poly_compress_d4
+#undef mlk_poly_compress_d5
+#undef mlk_poly_decompress_d10
+#undef mlk_poly_decompress_d11
+#undef mlk_poly_decompress_d4
+#undef mlk_poly_decompress_d5
+#undef mlk_poly_frombytes
+#undef mlk_poly_frommsg
+#undef mlk_poly_tobytes
+#undef mlk_poly_tomsg
+/* mlkem/src/debug.h */
+#undef MLK_DEBUG_H
+#undef mlk_assert
+#undef mlk_assert_abs_bound
+#undef mlk_assert_abs_bound_2d
+#undef mlk_assert_bound
+#undef mlk_assert_bound_2d
+#undef mlk_debug_check_assert
+#undef mlk_debug_check_bounds
+/* mlkem/src/poly.h */
+#undef MLK_INVNTT_BOUND
+#undef MLK_NTT_BOUND
+#undef MLK_POLY_H
+#undef mlk_poly_add
+#undef mlk_poly_invntt_tomont
+#undef mlk_poly_mulcache_compute
+#undef mlk_poly_ntt
+#undef mlk_poly_reduce
+#undef mlk_poly_sub
+#undef mlk_poly_tomont
+/* mlkem/src/randombytes.h */
+#undef MLK_RANDOMBYTES_H
+/* mlkem/src/sampling.h */
+#undef MLK_SAMPLING_H
+#undef mlk_poly_cbd2
+#undef mlk_poly_cbd3
+#undef mlk_poly_rej_uniform
+#undef mlk_poly_rej_uniform_x4
+/* mlkem/src/symmetric.h */
+#undef MLK_SYMMETRIC_H
+#undef MLK_XOF_RATE
+#undef mlk_hash_g
+#undef mlk_hash_h
+#undef mlk_hash_j
+#undef mlk_prf_eta
+#undef mlk_prf_eta1
+#undef mlk_prf_eta1_x4
+#undef mlk_prf_eta2
+#undef mlk_xof_absorb
+#undef mlk_xof_ctx
+#undef mlk_xof_init
+#undef mlk_xof_release
+#undef mlk_xof_squeezeblocks
+#undef mlk_xof_x4_absorb
+#undef mlk_xof_x4_ctx
+#undef mlk_xof_x4_init
+#undef mlk_xof_x4_release
+#undef mlk_xof_x4_squeezeblocks
+/* mlkem/src/sys.h */
+#undef MLK_ALIGN
+#undef MLK_ALIGN_UP
+#undef MLK_ALWAYS_INLINE
+#undef MLK_CET_ENDBR
+#undef MLK_CT_TESTING_DECLASSIFY
+#undef MLK_CT_TESTING_SECRET
+#undef MLK_DEFAULT_ALIGN
+#undef MLK_HAVE_INLINE_ASM
+#undef MLK_INLINE
+#undef MLK_MUST_CHECK_RETURN_VALUE
+#undef MLK_RESTRICT
+#undef MLK_STATIC_TESTABLE
+#undef MLK_SYS_AARCH64
+#undef MLK_SYS_AARCH64_EB
+#undef MLK_SYS_APPLE
+#undef MLK_SYS_ARMV81M_MVE
+#undef MLK_SYS_BIG_ENDIAN
+#undef MLK_SYS_H
+#undef MLK_SYS_LINUX
+#undef MLK_SYS_LITTLE_ENDIAN
+#undef MLK_SYS_PPC64LE
+#undef MLK_SYS_RISCV32
+#undef MLK_SYS_RISCV64
+#undef MLK_SYS_RISCV64_RVV
+#undef MLK_SYS_WINDOWS
+#undef MLK_SYS_X86_64
+#undef MLK_SYS_X86_64_AVX2
+/* mlkem/src/verify.h */
+#undef MLK_USE_ASM_VALUE_BARRIER
+#undef MLK_VERIFY_H
+#undef mlk_ct_opt_blocker_u64
+/* mlkem/src/cbmc.h */
+#undef MLK_CBMC_H
+#undef __contract__
+#undef __loop__
+
+#if !defined(MLK_CONFIG_FIPS202_CUSTOM_HEADER)
+/*
+ * Undefine macros from FIPS-202 files
+ */
+/* mlkem/src/fips202/fips202.h */
+#undef FIPS202_X4_DEFAULT_IMPLEMENTATION
+#undef MLK_FIPS202_FIPS202_H
+#undef SHA3_256_HASHBYTES
+#undef SHA3_256_RATE
+#undef SHA3_384_RATE
+#undef SHA3_512_HASHBYTES
+#undef SHA3_512_RATE
+#undef SHAKE128_RATE
+#undef SHAKE256_RATE
+#undef mlk_sha3_256
+#undef mlk_sha3_512
+#undef mlk_shake128_absorb_once
+#undef mlk_shake128_init
+#undef mlk_shake128_release
+#undef mlk_shake128_squeezeblocks
+#undef mlk_shake256
+/* mlkem/src/fips202/fips202x4.h */
+#undef MLK_FIPS202_FIPS202X4_H
+#undef mlk_shake128x4_absorb_once
+#undef mlk_shake128x4_init
+#undef mlk_shake128x4_release
+#undef mlk_shake128x4_squeezeblocks
+#undef mlk_shake256x4
+/* mlkem/src/fips202/keccakf1600.h */
+#undef MLK_FIPS202_KECCAKF1600_H
+#undef MLK_KECCAK_LANES
+#undef MLK_KECCAK_WAY
+#undef mlk_keccakf1600_extract_bytes
+#undef mlk_keccakf1600_permute
+#undef mlk_keccakf1600_xor_bytes
+#undef mlk_keccakf1600x4_extract_bytes
+#undef mlk_keccakf1600x4_permute
+#undef mlk_keccakf1600x4_xor_bytes
+#endif /* !MLK_CONFIG_FIPS202_CUSTOM_HEADER */
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+/* mlkem/src/fips202/native/api.h */
+#undef MLK_FIPS202_NATIVE_API_H
+#undef MLK_NATIVE_FUNC_FALLBACK
+#undef MLK_NATIVE_FUNC_SUCCESS
+/* mlkem/src/fips202/native/auto.h */
+#undef MLK_FIPS202_NATIVE_AUTO_H
+#if defined(MLK_SYS_AARCH64)
+/*
+ * Undefine macros from native code (FIPS202, AArch64)
+ */
+/* mlkem/src/fips202/native/aarch64/auto.h */
+#undef MLK_FIPS202_NATIVE_AARCH64_AUTO_H
+/* mlkem/src/fips202/native/aarch64/src/fips202_native_aarch64.h */
+#undef MLK_FIPS202_NATIVE_AARCH64_SRC_FIPS202_NATIVE_AARCH64_H
+#undef mlk_keccak_f1600_x1_scalar_asm
+#undef mlk_keccak_f1600_x1_v84a_asm
+#undef mlk_keccak_f1600_x2_v84a_asm
+#undef mlk_keccak_f1600_x4_v8a_scalar_hybrid_asm
+#undef mlk_keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm
+#undef mlk_keccakf1600_round_constants
+/* mlkem/src/fips202/native/aarch64/x1_scalar.h */
+#undef MLK_FIPS202_AARCH64_NEED_X1_SCALAR
+#undef MLK_FIPS202_NATIVE_AARCH64_X1_SCALAR_H
+#undef MLK_USE_FIPS202_X1_NATIVE
+/* mlkem/src/fips202/native/aarch64/x1_v84a.h */
+#undef MLK_FIPS202_AARCH64_NEED_X1_V84A
+#undef MLK_FIPS202_NATIVE_AARCH64_X1_V84A_H
+#undef MLK_USE_FIPS202_X1_NATIVE
+/* mlkem/src/fips202/native/aarch64/x2_v84a.h */
+#undef MLK_FIPS202_AARCH64_NEED_X2_V84A
+#undef MLK_FIPS202_NATIVE_AARCH64_X2_V84A_H
+#undef MLK_USE_FIPS202_X4_NATIVE
+/* mlkem/src/fips202/native/aarch64/x4_v8a_scalar.h */
+#undef MLK_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID
+#undef MLK_FIPS202_NATIVE_AARCH64_X4_V8A_SCALAR_H
+#undef MLK_USE_FIPS202_X4_NATIVE
+/* mlkem/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h */
+#undef MLK_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID
+#undef MLK_FIPS202_NATIVE_AARCH64_X4_V8A_V84A_SCALAR_H
+#undef MLK_USE_FIPS202_X4_NATIVE
+#endif /* MLK_SYS_AARCH64 */
+#if defined(MLK_SYS_X86_64)
+/*
+ * Undefine macros from native code (FIPS202, x86_64)
+ */
+/* mlkem/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h */
+#undef MLK_FIPS202_NATIVE_X86_64_KECCAK_F1600_X4_AVX2_H
+#undef MLK_FIPS202_X86_64_NEED_X4_AVX2
+#undef MLK_USE_FIPS202_X4_NATIVE
+/* mlkem/src/fips202/native/x86_64/src/fips202_native_x86_64.h */
+#undef MLK_FIPS202_NATIVE_X86_64_SRC_FIPS202_NATIVE_X86_64_H
+#undef mlk_keccak_f1600_x4_avx2
+#undef mlk_keccak_rho56
+#undef mlk_keccak_rho8
+#undef mlk_keccakf1600_round_constants
+#endif /* MLK_SYS_X86_64 */
+#if defined(MLK_SYS_ARMV81M_MVE)
+/*
+ * Undefine macros from native code (FIPS202, Armv8.1-M)
+ */
+/* mlkem/src/fips202/native/armv81m/mve.h */
+#undef MLK_FIPS202_ARMV81M_NEED_X4
+#undef MLK_FIPS202_NATIVE_ARMV81M
+#undef MLK_FIPS202_NATIVE_ARMV81M_MVE_H
+#undef MLK_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE
+#undef MLK_USE_FIPS202_X4_NATIVE
+#undef MLK_USE_FIPS202_X4_XOR_BYTES_NATIVE
+#undef mlk_keccak_f1600_x4_native_impl
+#undef mlk_keccak_f1600_x4_state_extract_bytes
+#undef mlk_keccak_f1600_x4_state_xor_bytes
+/* mlkem/src/fips202/native/armv81m/src/fips202_native_armv81m.h */
+#undef MLK_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H
+#undef mlk_keccak_f1600_x4_mve_asm
+#undef mlk_keccak_f1600_x4_state_extract_bytes_asm
+#undef mlk_keccak_f1600_x4_state_xor_bytes_asm
+#undef mlk_keccakf1600_round_constants
+#endif /* MLK_SYS_ARMV81M_MVE */
+#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
+/* mlkem/src/native/api.h */
+#undef MLK_INVNTT_BOUND
+#undef MLK_NATIVE_API_H
+#undef MLK_NATIVE_FUNC_FALLBACK
+#undef MLK_NATIVE_FUNC_SUCCESS
+#undef MLK_NTT_BOUND
+/* mlkem/src/native/meta.h */
+#undef MLK_NATIVE_META_H
+#if defined(MLK_SYS_AARCH64)
+/*
+ * Undefine macros from native code (Arith, AArch64)
+ */
+/* mlkem/src/native/aarch64/meta.h */
+#undef MLK_ARITH_BACKEND_AARCH64
+#undef MLK_NATIVE_AARCH64_META_H
+#undef MLK_USE_NATIVE_INTT
+#undef MLK_USE_NATIVE_NTT
+#undef MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
+#undef MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
+#undef MLK_USE_NATIVE_POLY_REDUCE
+#undef MLK_USE_NATIVE_POLY_TOBYTES
+#undef MLK_USE_NATIVE_POLY_TOMONT
+#undef MLK_USE_NATIVE_REJ_UNIFORM
+/* mlkem/src/native/aarch64/src/arith_native_aarch64.h */
+#undef MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
+#undef mlk_aarch64_invntt_zetas_layer12345
+#undef mlk_aarch64_invntt_zetas_layer67
+#undef mlk_aarch64_ntt_zetas_layer12345
+#undef mlk_aarch64_ntt_zetas_layer67
+#undef mlk_aarch64_zetas_mulcache_native
+#undef mlk_aarch64_zetas_mulcache_twisted_native
+#undef mlk_intt_asm
+#undef mlk_ntt_asm
+#undef mlk_poly_mulcache_compute_asm
+#undef mlk_poly_reduce_asm
+#undef mlk_poly_tobytes_asm
+#undef mlk_poly_tomont_asm
+#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k3
+#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k4
+#undef mlk_rej_uniform_asm
+#undef mlk_rej_uniform_table
+#endif /* MLK_SYS_AARCH64 */
+#if defined(MLK_SYS_X86_64)
+/*
+ * Undefine macros from native code (Arith, X86_64)
+ */
+/* mlkem/src/native/x86_64/meta.h */
+#undef MLK_ARITH_BACKEND_X86_64_DEFAULT
+#undef MLK_NATIVE_X86_64_META_H
+#undef MLK_USE_NATIVE_INTT
+#undef MLK_USE_NATIVE_NTT
+#undef MLK_USE_NATIVE_NTT_CUSTOM_ORDER
+#undef MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
+#undef MLK_USE_NATIVE_POLY_COMPRESS_D10
+#undef MLK_USE_NATIVE_POLY_COMPRESS_D11
+#undef MLK_USE_NATIVE_POLY_COMPRESS_D4
+#undef MLK_USE_NATIVE_POLY_COMPRESS_D5
+#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D10
+#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D11
+#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D4
+#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D5
+#undef MLK_USE_NATIVE_POLY_FROMBYTES
+#undef MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
+#undef MLK_USE_NATIVE_POLY_REDUCE
+#undef MLK_USE_NATIVE_POLY_TOBYTES
+#undef MLK_USE_NATIVE_POLY_TOMONT
+#undef MLK_USE_NATIVE_REJ_UNIFORM
+/* mlkem/src/native/x86_64/src/arith_native_x86_64.h */
+#undef MLK_AVX2_REJ_UNIFORM_BUFLEN
+#undef MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
+#undef mlk_invntt_avx2
+#undef mlk_ntt_avx2
+#undef mlk_nttfrombytes_avx2
+#undef mlk_ntttobytes_avx2
+#undef mlk_nttunpack_avx2
+#undef mlk_poly_compress_d10_avx2
+#undef mlk_poly_compress_d11_avx2
+#undef mlk_poly_compress_d4_avx2
+#undef mlk_poly_compress_d5_avx2
+#undef mlk_poly_decompress_d10_avx2
+#undef mlk_poly_decompress_d11_avx2
+#undef mlk_poly_decompress_d4_avx2
+#undef mlk_poly_decompress_d5_avx2
+#undef mlk_poly_mulcache_compute_avx2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k3
+#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k4
+#undef mlk_reduce_avx2
+#undef mlk_rej_uniform_asm
+#undef mlk_rej_uniform_table
+#undef mlk_tomont_avx2
+/* mlkem/src/native/x86_64/src/compress_consts.h */
+#undef MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H
+#undef mlk_compress_d10_data
+#undef mlk_compress_d11_data
+#undef mlk_compress_d4_data
+#undef mlk_compress_d5_data
+#undef mlk_decompress_d10_data
+#undef mlk_decompress_d11_data
+#undef mlk_decompress_d4_data
+#undef mlk_decompress_d5_data
+/* mlkem/src/native/x86_64/src/consts.h */
+#undef MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES
+#undef MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB
+#undef MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD
+#undef MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP
+#undef MLK_NATIVE_X86_64_SRC_CONSTS_H
+#undef mlk_qdata
+#endif /* MLK_SYS_X86_64 */
+#if defined(MLK_SYS_RISCV64)
+/*
+ * Undefine macros from native code (Arith, RISC-V 64)
+ */
+/* mlkem/src/native/riscv64/meta.h */
+#undef MLK_ARITH_BACKEND_RISCV64
+#undef MLK_NATIVE_RISCV64_META_H
+#undef MLK_USE_NATIVE_INTT
+#undef MLK_USE_NATIVE_NTT
+#undef MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
+#undef MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
+#undef MLK_USE_NATIVE_POLY_REDUCE
+#undef MLK_USE_NATIVE_POLY_TOMONT
+#undef MLK_USE_NATIVE_REJ_UNIFORM
+/* mlkem/src/native/riscv64/src/arith_native_riscv64.h */
+#undef MLK_NATIVE_RISCV64_SRC_ARITH_NATIVE_RISCV64_H
+#undef mlk_rv64v_poly_add
+#undef mlk_rv64v_poly_basemul_mont_add_k2
+#undef mlk_rv64v_poly_basemul_mont_add_k3
+#undef mlk_rv64v_poly_basemul_mont_add_k4
+#undef mlk_rv64v_poly_invntt_tomont
+#undef mlk_rv64v_poly_ntt
+#undef mlk_rv64v_poly_reduce
+#undef mlk_rv64v_poly_sub
+#undef mlk_rv64v_poly_tomont
+#undef mlk_rv64v_rej_uniform
+/* mlkem/src/native/riscv64/src/rv64v_debug.h */
+#undef MLK_NATIVE_RISCV64_SRC_RV64V_DEBUG_H
+#undef mlk_assert_abs_bound_int16m1
+#undef mlk_assert_abs_bound_int16m2
+#undef mlk_assert_bound_int16m1
+#undef mlk_assert_bound_int16m2
+#undef mlk_debug_check_bounds_int16m1
+#undef mlk_debug_check_bounds_int16m2
+#endif /* MLK_SYS_RISCV64 */
+#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */
+#endif /* !MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */
diff --git a/mlkem_native/mlkem_native.h b/mlkem_native/mlkem_native.h
new file mode 100644
index 0000000..302ca3f
--- /dev/null
+++ b/mlkem_native/mlkem_native.h
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#ifndef MLK_H
+#define MLK_H
+
+/*
+ * Public API for mlkem-native.
+ *
+ * This header defines the public API of a single build of mlkem-native.
+ *
+ * Make sure the configuration file is in the include path
+ * (this is "mlkem_native_config.h" by default, or MLK_CONFIG_FILE if defined).
+ *
+ * # Multi-level builds
+ *
+ * This header specifies a build of mlkem-native for a fixed security level.
+ * If you need multiple security levels, leave the security level unspecified
+ * in the configuration file and include this header multiple times, setting
+ * MLK_CONFIG_PARAMETER_SET accordingly for each, and #undef'ing the MLK_H
+ * guard to allow multiple inclusions.
+ *
+ * # Legacy configuration (deprecated)
+ *
+ * Instead of providing the config file used for the build, you can
+ * alternatively set the following configuration options prior to
+ * including this header.
+ *
+ * This method of configuration is deprecated.
+ * It will be removed in mlkem-native-v2.
+ *
+ * - MLK_CONFIG_API_PARAMETER_SET [required]
+ *
+ *   The parameter set used for the build; 512, 768, or 1024.
+ *
+ * - MLK_CONFIG_API_NAMESPACE_PREFIX [required]
+ *
+ *   The namespace prefix used for the build.
+ *
+ *   NOTE:
+ *   For a multi-level build, you must include the 512/768/1024 suffixes
+ *   in MLK_CONFIG_API_NAMESPACE_PREFIX.
+ *
+ * - MLK_CONFIG_API_NO_SUPERCOP [optional]
+ *
+ *   By default, this header will also expose the mlkem-native API in the
+ *   SUPERCOP naming convention crypto_kem_xxx. If you don't want/need this,
+ *   set MLK_CONFIG_API_NO_SUPERCOP. You must set this for a multi-level build.
+ *
+ * - MLK_CONFIG_API_CONSTANTS_ONLY [optional]
+ *
+ *   If you don't want this header to expose any function declarations,
+ *   but only constants for the sizes of key material, set
+ *   MLK_CONFIG_API_CONSTANTS_ONLY. In this case, you don't need to set
+ *   MLK_CONFIG_API_PARAMETER_SET or MLK_CONFIG_API_NAMESPACE_PREFIX,
+ *   nor include a configuration.
+ *
+ * - MLK_CONFIG_API_QUALIFIER [optional]
+ *
+ *   Qualifier to apply to external API.
+ *
+ ******************************************************************************/
+
+/******************************* Key sizes ************************************/
+
+/* Sizes of cryptographic material, per parameter set */
+/* See mlkem/common.h for the arithmetic expressions giving rise to these */
+/* check-magic: off */
+#define MLKEM512_SECRETKEYBYTES 1632
+#define MLKEM512_PUBLICKEYBYTES 800
+#define MLKEM512_CIPHERTEXTBYTES 768
+
+#define MLKEM768_SECRETKEYBYTES 2400
+#define MLKEM768_PUBLICKEYBYTES 1184
+#define MLKEM768_CIPHERTEXTBYTES 1088
+
+#define MLKEM1024_SECRETKEYBYTES 3168
+#define MLKEM1024_PUBLICKEYBYTES 1568
+#define MLKEM1024_CIPHERTEXTBYTES 1568
+/* check-magic: on */
+
+/* Size of randomness coins in bytes (level-independent) */
+#define MLKEM_SYMBYTES 32
+#define MLKEM512_SYMBYTES MLKEM_SYMBYTES
+#define MLKEM768_SYMBYTES MLKEM_SYMBYTES
+#define MLKEM1024_SYMBYTES MLKEM_SYMBYTES
+/* Size of shared secret in bytes (level-independent) */
+#define MLKEM_BYTES 32
+#define MLKEM512_BYTES MLKEM_BYTES
+#define MLKEM768_BYTES MLKEM_BYTES
+#define MLKEM1024_BYTES MLKEM_BYTES
+
+/* Sizes of cryptographic material, as a function of LVL=512,768,1024 */
+#define MLKEM_SECRETKEYBYTES_(LVL) MLKEM##LVL##_SECRETKEYBYTES
+#define MLKEM_PUBLICKEYBYTES_(LVL) MLKEM##LVL##_PUBLICKEYBYTES
+#define MLKEM_CIPHERTEXTBYTES_(LVL) MLKEM##LVL##_CIPHERTEXTBYTES
+#define MLKEM_SECRETKEYBYTES(LVL) MLKEM_SECRETKEYBYTES_(LVL)
+#define MLKEM_PUBLICKEYBYTES(LVL) MLKEM_PUBLICKEYBYTES_(LVL)
+#define MLKEM_CIPHERTEXTBYTES(LVL) MLKEM_CIPHERTEXTBYTES_(LVL)
+
+/****************************** Error codes ***********************************/
+
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An RNG failure occurred. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
+
+/****************************** Function API **********************************/
+
+#define MLK_API_CONCAT_(x, y) x##y
+#define MLK_API_CONCAT(x, y) MLK_API_CONCAT_(x, y)
+#define MLK_API_CONCAT_UNDERSCORE(x, y) MLK_API_CONCAT(MLK_API_CONCAT(x, _), y)
+
+#if !defined(MLK_CONFIG_API_PARAMETER_SET)
+/* Recommended configuration via same config file as used for the build. */
+
+/* For now, we derive the legacy API configuration MLK_CONFIG_API_XXX from
+ * the config file. In mlkem-native-v2, this will be removed and we will
+ * exclusively work with MLK_CONFIG_XXX. */
+
+/* You need to make sure the config file is in the include path. */
+#if defined(MLK_CONFIG_FILE)
+#include MLK_CONFIG_FILE
+#else
+#include "mlkem_native_config.h"
+#endif
+
+#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
+
+#if defined(MLK_CONFIG_MULTILEVEL_BUILD)
+#define MLK_CONFIG_API_NAMESPACE_PREFIX \
+  MLK_API_CONCAT(MLK_CONFIG_NAMESPACE_PREFIX, MLK_CONFIG_PARAMETER_SET)
+#else
+#define MLK_CONFIG_API_NAMESPACE_PREFIX MLK_CONFIG_NAMESPACE_PREFIX
+#endif
+
+#if defined(MLK_CONFIG_NO_SUPERCOP)
+#define MLK_CONFIG_API_NO_SUPERCOP
+#endif
+
+#if defined(MLK_CONFIG_CONSTANTS_ONLY)
+#define MLK_CONFIG_API_CONSTANTS_ONLY
+#endif
+
+#if defined(MLK_CONFIG_EXTERNAL_API_QUALIFIER)
+#define MLK_CONFIG_API_QUALIFIER MLK_CONFIG_EXTERNAL_API_QUALIFIER
+#endif
+
+#else /* !MLK_CONFIG_API_PARAMETER_SET */
+
+#define MLK_API_LEGACY_CONFIG
+
+#endif /* MLK_CONFIG_API_PARAMETER_SET */
+
+#define MLK_API_NAMESPACE(sym) \
+  MLK_API_CONCAT_UNDERSCORE(MLK_CONFIG_API_NAMESPACE_PREFIX, sym)
+
+#if defined(__GNUC__) || defined(__clang__) /* GNU-style attributes */
+#define MLK_API_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
+#else /* !(__GNUC__ || __clang__): expand to nothing */
+#define MLK_API_MUST_CHECK_RETURN_VALUE
+#endif /* !(__GNUC__ || __clang__) */
+
+#if defined(MLK_CONFIG_API_QUALIFIER)
+#define MLK_API_QUALIFIER MLK_CONFIG_API_QUALIFIER
+#else
+#define MLK_API_QUALIFIER
+#endif
+
+#if !defined(MLK_CONFIG_API_CONSTANTS_ONLY)
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/*************************************************
+ * Name:        crypto_kem_keypair_derand
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t pk[]: pointer to output public key, an array of
+ *                  MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
+ *              - uint8_t sk[]: pointer to output private key, an array of
+ *                  MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
+ *              - const uint8_t coins[]: pointer to input randomness, an array
+ *                  of 2*MLKEM_SYMBYTES uniformly random bytes.
+ *
+ * Returns:     - 0: On success
+ *              - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ *                  PCT failed.
+ *              - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *                  used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(keypair_derand)(
+    uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    const uint8_t coins[2 * MLKEM_SYMBYTES]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif /* MLK_CONFIG_CONTEXT_PARAMETER */
+);
+
+
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
+/*************************************************
+ * Name:        crypto_kem_keypair
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t pk[]: pointer to output public key, an array of
+ *                  MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
+ *              - uint8_t sk[]: pointer to output private key, an array of
+ *                  MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
+ *
+ * Returns:     - 0: On success
+ *              - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ *                  PCT failed.
+ *              - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *                  used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *              - MLK_ERR_RNG_FAIL: Random number generation failed.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(keypair)(
+    uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif /* MLK_CONFIG_CONTEXT_PARAMETER */
+);
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
+
+/*************************************************
+ * Name:        crypto_kem_enc_derand
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t ct[]: pointer to output cipher text, an array of
+ *                  MLKEM{512,768,1024}_CIPHERTEXTBYTES bytes.
+ *              - uint8_t ss[]: pointer to output shared secret, an array of
+ *                  MLKEM_BYTES bytes.
+ *              - const uint8_t pk[]: pointer to input public key, an array of
+ *                  MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
+ *              - const uint8_t coins[]: pointer to input randomness, an array
+ *                  of MLKEM_SYMBYTES bytes.
+ *
+ * Returns:     - 0: On success
+ *              - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ *                  for the public key fails.
+ *              - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *                  used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(enc_derand)(
+    uint8_t ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    uint8_t ss[MLKEM_BYTES],
+    const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    const uint8_t coins[MLKEM_SYMBYTES]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif /* MLK_CONFIG_CONTEXT_PARAMETER */
+);
+
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
+/*************************************************
+ * Name:        crypto_kem_enc
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text, an array of
+ *                 MLKEM{512,768,1024}_CIPHERTEXTBYTES bytes.
+ *              - uint8_t *ss: pointer to output shared secret, an array of
+ *                 MLKEM_BYTES bytes.
+ *              - const uint8_t *pk: pointer to input public key, an array of
+ *                 MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
+ *
+ * Returns: - 0 on success
+ *          - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ *              for the public key fails.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *          - MLK_ERR_RNG_FAIL: Random number generation failed.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(enc)(
+    uint8_t ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    uint8_t ss[MLKEM_BYTES],
+    const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif
+);
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
+
+/*************************************************
+ * Name:        crypto_kem_dec
+ *
+ * Description: Generates shared secret for given
+ *              cipher text and private key
+ *
+ * Arguments:   - uint8_t *ss: pointer to output shared secret, an array of
+ *                 MLKEM_BYTES bytes.
+ *              - const uint8_t *ct: pointer to input cipher text, an array of
+ *                 MLKEM{512,768,1024}_CIPHERTEXTBYTES bytes.
+ *              - const uint8_t *sk: pointer to input private key, an array of
+ *                 MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
+ *
+ * Returns: - 0 on success
+ *          - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ *              for the secret key fails.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(dec)(
+    uint8_t ss[MLKEM_BYTES],
+    const uint8_t ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)],
+    const uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif
+);
+
+
+/*************************************************
+ * Name:        crypto_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ *              i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments:   - const uint8_t *pk: pointer to input public key, an array of
+ *                 MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
+ *
+ * Returns: - 0 on success
+ *          - MLK_ERR_FAIL: If the modulus check failed.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(check_pk)(
+    const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif
+);
+
+/*************************************************
+ * Name:        crypto_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ *              i.e., ensures that
+ *              sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments:   - const uint8_t *sk: pointer to input private key, an array of
+ *                 MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
+ *
+ * Returns: - 0 on success
+ *          - MLK_ERR_FAIL: If the public key hash check failed.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+MLK_API_QUALIFIER
+MLK_API_MUST_CHECK_RETURN_VALUE
+int MLK_API_NAMESPACE(check_sk)(
+    const uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+    ,
+    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context
+#endif
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+/****************************** SUPERCOP API *********************************/
+
+#if !defined(MLK_CONFIG_API_NO_SUPERCOP)
+/* Export API in SUPERCOP naming scheme CRYPTO_xxx / crypto_kem_xxx */
+#define CRYPTO_SECRETKEYBYTES MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)
+#define CRYPTO_PUBLICKEYBYTES MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)
+#define CRYPTO_CIPHERTEXTBYTES \
+  MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)
+#define CRYPTO_SYMBYTES MLKEM_SYMBYTES
+#define CRYPTO_BYTES MLKEM_BYTES
+
+#define crypto_kem_keypair_derand MLK_API_NAMESPACE(keypair_derand)
+#define crypto_kem_keypair MLK_API_NAMESPACE(keypair)
+#define crypto_kem_enc_derand MLK_API_NAMESPACE(enc_derand)
+#define crypto_kem_enc MLK_API_NAMESPACE(enc)
+#define crypto_kem_dec MLK_API_NAMESPACE(dec)
+#define crypto_kem_check_pk MLK_API_NAMESPACE(check_pk)
+#define crypto_kem_check_sk MLK_API_NAMESPACE(check_sk)
+
+#else /* !MLK_CONFIG_API_NO_SUPERCOP */
+
+/* If the SUPERCOP API is not needed, we can undefine the various helper macros
+ * above. Otherwise, they are needed for lazy evaluation of crypto_kem_xxx. */
+#if !defined(MLK_API_LEGACY_CONFIG)
+#undef MLK_CONFIG_API_PARAMETER_SET
+#undef MLK_CONFIG_API_NAMESPACE_PREFIX
+#undef MLK_CONFIG_API_NO_SUPERCOP
+#undef MLK_CONFIG_API_CONSTANTS_ONLY
+#undef MLK_CONFIG_API_QUALIFIER
+#endif /* !MLK_API_LEGACY_CONFIG */
+
+#undef MLK_API_CONCAT
+#undef MLK_API_CONCAT_
+#undef MLK_API_CONCAT_UNDERSCORE
+#undef MLK_API_NAMESPACE
+#undef MLK_API_MUST_CHECK_RETURN_VALUE
+#undef MLK_API_QUALIFIER
+#undef MLK_API_LEGACY_CONFIG
+
+#endif /* MLK_CONFIG_API_NO_SUPERCOP */
+#endif /* !MLK_CONFIG_API_CONSTANTS_ONLY */
+
+
+/***************************** Memory Usage **********************************/
+
+/*
+ * By default mlkem-native performs all memory allocations on the stack.
+ * Alternatively, mlkem-native supports custom allocation of large structures
+ * through the `MLK_CONFIG_CUSTOM_ALLOC_FREE` configuration option.
+ * See mlkem_native_config.h for details.
+ *
+ * `MLK_TOTAL_ALLOC_{512,768,1024}_{KEYPAIR,ENCAPS,DECAPS}` indicates the
+ * maximum (accumulative) allocation via MLK_ALLOC for each parameter set and
+ * operation. Note that some stack allocation remains even when using custom
+ * allocators, so these values are lower than total stack usage with the default
+ * stack-only allocation.
+ *
+ * These constants may be used to implement custom allocations using a
+ * fixed-sized buffer and a simple allocator (e.g., bump allocator).
+ */
+/* check-magic: off */
+#define MLK_TOTAL_ALLOC_512_KEYPAIR_NO_PCT 5824
+#define MLK_TOTAL_ALLOC_512_KEYPAIR_PCT 10048
+#define MLK_TOTAL_ALLOC_512_ENCAPS 8384
+#define MLK_TOTAL_ALLOC_512_DECAPS 9152
+#define MLK_TOTAL_ALLOC_768_KEYPAIR_NO_PCT 10176
+#define MLK_TOTAL_ALLOC_768_KEYPAIR_PCT 15552
+#define MLK_TOTAL_ALLOC_768_ENCAPS 13248
+#define MLK_TOTAL_ALLOC_768_DECAPS 14336
+#define MLK_TOTAL_ALLOC_1024_KEYPAIR_NO_PCT 15552
+#define MLK_TOTAL_ALLOC_1024_KEYPAIR_PCT 22400
+#define MLK_TOTAL_ALLOC_1024_ENCAPS 19136
+#define MLK_TOTAL_ALLOC_1024_DECAPS 20704
+/* check-magic: on */
+
+/* MLK_TOTAL_ALLOC_*_KEYPAIR adapts based on MLK_CONFIG_KEYGEN_PCT; for legacy
+ * config the options are unknown, so the worst case (PCT enabled) is assumed.
+ * NOTE(review): in non-constants-only builds with MLK_CONFIG_API_NO_SUPERCOP,
+ * MLK_API_LEGACY_CONFIG is #undef'd above, so the legacy test cannot fire.
+ */
+#if defined(MLK_API_LEGACY_CONFIG) || defined(MLK_CONFIG_KEYGEN_PCT)
+#define MLK_TOTAL_ALLOC_512_KEYPAIR MLK_TOTAL_ALLOC_512_KEYPAIR_PCT
+#define MLK_TOTAL_ALLOC_768_KEYPAIR MLK_TOTAL_ALLOC_768_KEYPAIR_PCT
+#define MLK_TOTAL_ALLOC_1024_KEYPAIR MLK_TOTAL_ALLOC_1024_KEYPAIR_PCT
+#else
+#define MLK_TOTAL_ALLOC_512_KEYPAIR MLK_TOTAL_ALLOC_512_KEYPAIR_NO_PCT
+#define MLK_TOTAL_ALLOC_768_KEYPAIR MLK_TOTAL_ALLOC_768_KEYPAIR_NO_PCT
+#define MLK_TOTAL_ALLOC_1024_KEYPAIR MLK_TOTAL_ALLOC_1024_KEYPAIR_NO_PCT
+#endif
+/* Maximum of three values; each argument may be evaluated more than once. */
+#define MLK_MAX3_(a, b, c) \
+  ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? (b) : (c)))
+
+/*
+ * `MLK_TOTAL_ALLOC_{512,768,1024}` is the maximum across all operations for
+ * each parameter set.
+ */
+#define MLK_TOTAL_ALLOC_512                                          \
+  MLK_MAX3_(MLK_TOTAL_ALLOC_512_KEYPAIR, MLK_TOTAL_ALLOC_512_ENCAPS, \
+            MLK_TOTAL_ALLOC_512_DECAPS)
+#define MLK_TOTAL_ALLOC_768                                          \
+  MLK_MAX3_(MLK_TOTAL_ALLOC_768_KEYPAIR, MLK_TOTAL_ALLOC_768_ENCAPS, \
+            MLK_TOTAL_ALLOC_768_DECAPS)
+#define MLK_TOTAL_ALLOC_1024                                           \
+  MLK_MAX3_(MLK_TOTAL_ALLOC_1024_KEYPAIR, MLK_TOTAL_ALLOC_1024_ENCAPS, \
+            MLK_TOTAL_ALLOC_1024_DECAPS)
+
+#endif /* !MLK_H */
diff --git a/mlkem_native/mlkem_native_config.h b/mlkem_native/mlkem_native_config.h
new file mode 100644
index 0000000..19450f3
--- /dev/null
+++ b/mlkem_native/mlkem_native_config.h
@@ -0,0 +1,64 @@
+/*
+ * mlkem-native configuration for OnlyKey (NXP MK20DX256, Cortex-M4)
+ * ML-KEM-768 (FIPS 203), C-only portable backend
+ *
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLK_CONFIG_H
+#define MLK_CONFIG_H
+
+/* ML-KEM-768 (NIST Level 3) */
+#ifndef MLK_CONFIG_PARAMETER_SET
+#define MLK_CONFIG_PARAMETER_SET 768
+#endif
+
+/* Namespace prefix for symbols */
+#if !defined(MLK_CONFIG_NAMESPACE_PREFIX)
+#define MLK_CONFIG_NAMESPACE_PREFIX MLK_DEFAULT_NAMESPACE_PREFIX
+#endif
+
+/* No native assembly backends — Cortex-M4 not supported by existing
+ * AArch64/x86_64/RVV/Helium backends. Pure portable C. */
+
+/* Build-only options */
+#if defined(MLK_BUILD_INTERNAL)
+
+/*
+ * Custom randombytes wrapper
+ *
+ * OnlyKey's existing randombytes has signature:
+ *   void randombytes(unsigned char *x, unsigned long long xlen)
+ * mlkem-native expects:
+ *   int mlk_randombytes(uint8_t *out, size_t outlen) — returns 0 on success
+ *
+ * We bridge with a custom wrapper. Implement onlykey_mlkem_randombytes() in
+ * okcrypto.cpp using your preferred entropy source. NOTE(review): being a
+ * C++ file, its definition presumably needs extern "C" linkage — confirm.
+ */
+#define MLK_CONFIG_CUSTOM_RANDOMBYTES
+#if !defined(__ASSEMBLER__)
+#include <stdint.h>
+#include <stddef.h>
+#include "src/sys.h"
+
+extern int onlykey_mlkem_randombytes(uint8_t *out, size_t outlen);
+/* Forwards to the firmware hook and returns its status code unchanged. */
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
+{
+    return onlykey_mlkem_randombytes(out, outlen);
+}
+#endif /* !__ASSEMBLER__ */
+
+#endif /* MLK_BUILD_INTERNAL */
+
+/* Default namespace */
+#if MLK_CONFIG_PARAMETER_SET == 512
+#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512
+#elif MLK_CONFIG_PARAMETER_SET == 768
+#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768
+#elif MLK_CONFIG_PARAMETER_SET == 1024
+#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024
+#endif
+
+#endif /* !MLK_CONFIG_H */
diff --git a/mlkem_native/src/cbmc.h b/mlkem_native/src/cbmc.h
new file mode 100644
index 0000000..80e1a36
--- /dev/null
+++ b/mlkem_native/src/cbmc.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLK_CBMC_H
+#define MLK_CBMC_H
+/***************************************************
+ * Basic replacements for __CPROVER_XXX contracts
+ ***************************************************/
+#ifndef CBMC
+
+#define __contract__(x)
+#define __loop__(x)
+
+#else /* !CBMC */
+
+
+#define __contract__(x) x
+#define __loop__(x) x
+
+/* https://diffblue.github.io/cbmc/contracts-assigns.html */
+#define assigns(...) __CPROVER_assigns(__VA_ARGS__)
+
+/* https://diffblue.github.io/cbmc/contracts-requires-ensures.html */
+#define requires(...) __CPROVER_requires(__VA_ARGS__)
+#define ensures(...) __CPROVER_ensures(__VA_ARGS__)
+/* https://diffblue.github.io/cbmc/contracts-loops.html */
+#define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
+#define decreases(...) __CPROVER_decreases(__VA_ARGS__)
+/* cassert to avoid confusion with in-built assert */
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
+#define assume(...) __CPROVER_assume(__VA_ARGS__)
+
+/***************************************************
+ * Macros for "expression" forms that may appear
+ * _inside_ top-level contracts.
+ ***************************************************/
+
+/*
+ * function return value - useful inside ensures
+ * https://diffblue.github.io/cbmc/contracts-functions.html
+ */
+#define return_value (__CPROVER_return_value)
+
+/*
+ * assigns l-value targets
+ * https://diffblue.github.io/cbmc/contracts-assigns.html
+ */
+#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
+#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
+
+/*
+ * Pointer-related predicates
+ * https://diffblue.github.io/cbmc/contracts-memory-predicates.html
+ */
+#define memory_no_alias(...) __CPROVER_is_fresh(__VA_ARGS__)
+#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
+#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
+/*
+ * History variables
+ * https://diffblue.github.io/cbmc/contracts-history-variables.html
+ */
+#define old(...) __CPROVER_old(__VA_ARGS__)
+#define loop_entry(...) __CPROVER_loop_entry(__VA_ARGS__)
+
+/*
+ * Quantifiers
+ * Note that the range on qvar is half-open: qvar_lb <= qvar < qvar_ub
+ * https://diffblue.github.io/cbmc/contracts-quantifiers.html
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define forall(qvar, qvar_lb, qvar_ub, predicate)                 \
+  __CPROVER_forall                                                \
+  {                                                               \
+    unsigned qvar;                                                \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate)   \
+  }
+
+#define exists(qvar, qvar_lb, qvar_ub, predicate)               \
+  __CPROVER_exists                                              \
+  {                                                             \
+    unsigned qvar;                                              \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) && (predicate)  \
+  }
+/* clang-format on */
+
+/***************************************************
+ * Convenience macros for common contract patterns
+ ***************************************************/
+
+/*
+ * Boolean-value predicate that asserts that "all values of array_var are in
+ * range value_lb (inclusive) .. value_ub (exclusive)"
+ * Example:
+ *  array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)
+ * expands to
+ *  __CPROVER_forall { unsigned k; (0 <= k && k < MLKEM_N) ==> (
+ *  (0 <= a->coeffs[k]) && (a->coeffs[k] < MLKEM_Q)) }
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define CBMC_CONCAT_(left, right) left##right
+#define CBMC_CONCAT(left, right) CBMC_CONCAT_(left, right)
+
+#define array_bound_core(qvar, qvar_lb, qvar_ub, array_var,            \
+                         value_lb, value_ub)                           \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    unsigned qvar;                                                     \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&                 \
+         (((array_var)[(qvar)]) < (int)(value_ub)))                    \
+  }
+
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub)    \
+  array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb),      \
+      (qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var)        \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    unsigned qvar;                                                     \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
+    ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+  }
+
+#define array_unchanged(array_var, N) \
+    array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var)    \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    unsigned qvar;                                                     \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
+    ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+  }
+
+#define array_unchanged_u64(array_var, N) \
+    array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+/* clang-format on */
+
+/* Wrapper around array_bound operating on absolute values.
+ *
+ * The absolute value bound `k` is exclusive.
+ *
+ * Note that since the lower bound in array_bound is inclusive, we have to
+ * raise it by 1 here.
+ */
+#define array_abs_bound(arr, lb, ub, k) \
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
+
+#endif /* CBMC */
+
+#endif /* !MLK_CBMC_H */
diff --git a/mlkem_native/src/common.h b/mlkem_native/src/common.h
new file mode 100644
index 0000000..bc4e9ed
--- /dev/null
+++ b/mlkem_native/src/common.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_COMMON_H
+#define MLK_COMMON_H
+
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#endif
+
+#define MLK_BUILD_INTERNAL
+
+#if defined(MLK_CONFIG_FILE)
+#include MLK_CONFIG_FILE
+#else
+#include "mlkem_native_config.h"
+#endif
+
+#include "params.h"
+#include "sys.h"
+
+/* Internal and public API have external linkage by default, but
+ * this can be overwritten by the user, e.g. for single-CU builds. */
+#if !defined(MLK_CONFIG_INTERNAL_API_QUALIFIER)
+#define MLK_INTERNAL_API
+#else
+#define MLK_INTERNAL_API MLK_CONFIG_INTERNAL_API_QUALIFIER
+#endif
+
+#if !defined(MLK_CONFIG_EXTERNAL_API_QUALIFIER)
+#define MLK_EXTERNAL_API
+#else
+#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
+#endif
+
+#define MLK_CONCAT_(x1, x2) x1##x2
+#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
+
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
+#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
+#else
+#define MLK_ADD_PARAM_SET(s) s
+#endif
+
+#define MLK_NAMESPACE_PREFIX MLK_CONCAT(MLK_CONFIG_NAMESPACE_PREFIX, _)
+#define MLK_NAMESPACE_PREFIX_K \
+  MLK_CONCAT(MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX), _)
+
+/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
+ *
+ * If multiple parameter sets are used, functions depending on the parameter
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
+ *
+ * Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
+ * MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
+ */
+#define MLK_NAMESPACE(s) MLK_CONCAT(MLK_NAMESPACE_PREFIX, s)
+#define MLK_NAMESPACE_K(s) MLK_CONCAT(MLK_NAMESPACE_PREFIX_K, s)
+
+/* On Apple platforms, we need to emit leading underscore
+ * in front of assembly symbols. We thus introduce a separate
+ * namespace wrapper for ASM symbols. */
+#if !defined(__APPLE__)
+#define MLK_ASM_NAMESPACE(sym) MLK_NAMESPACE(sym)
+#else
+#define MLK_ASM_NAMESPACE(sym) MLK_CONCAT(_, MLK_NAMESPACE(sym))
+#endif
+
+/*
+ * On X86_64 if control-flow protections (CET) are enabled (through
+ * -fcf-protection=), we add an endbr64 instruction at every global function
+ * label.  See sys.h for more details
+ */
+#if defined(MLK_SYS_X86_64)
+#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+  .type MLK_ASM_NAMESPACE(sym), %function; \
+  MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && MLK_SYS_ARMV81M_MVE */
+#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+  .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
+#endif
+
+/* We aim to simplify the user's life by supporting builds where
+ * all source files are included, even those that are not needed.
+ * Those files are appropriately guarded and will be empty when unneeded.
+ * The following is to avoid compilers complaining about this. */
+#define MLK_EMPTY_CU(s) extern int MLK_NAMESPACE_K(empty_cu_##s);
+
+/* MLK_CONFIG_NO_ASM takes precedence over MLK_USE_NATIVE_XXX */
+#if defined(MLK_CONFIG_NO_ASM)
+#undef MLK_CONFIG_USE_NATIVE_BACKEND_ARITH
+#undef MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
+#endif
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLK_CONFIG_ARITH_BACKEND_FILE)
+#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, but MLK_CONFIG_ARITH_BACKEND_FILE is not.
+#endif
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLK_CONFIG_FIPS202_BACKEND_FILE)
+#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
+#endif
+
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
+#include MLK_CONFIG_ARITH_BACKEND_FILE
+/* Include to enforce consistency of API and implementation,
+ * and conduct sanity checks on the backend.
+ *
+ * Keep this _after_ the inclusion of the backend; otherwise,
+ * the sanity checks won't have an effect. */
+#if defined(MLK_CHECK_APIS) && !defined(__ASSEMBLER__)
+#include "native/api.h"
+#endif
+#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */
+
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+#include MLK_CONFIG_FIPS202_BACKEND_FILE
+/* Include to enforce consistency of API and implementation,
+ * and conduct sanity checks on the backend.
+ *
+ * Keep this _after_ the inclusion of the backend; otherwise,
+ * the sanity checks won't have an effect. */
+#if defined(MLK_CHECK_APIS) && !defined(__ASSEMBLER__)
+#include "fips202/native/api.h"
+#endif
+#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+
+#if !defined(MLK_CONFIG_FIPS202_CUSTOM_HEADER)
+#define MLK_FIPS202_HEADER_FILE "fips202/fips202.h"
+#else
+#define MLK_FIPS202_HEADER_FILE MLK_CONFIG_FIPS202_CUSTOM_HEADER
+#endif
+
+#if !defined(MLK_CONFIG_FIPS202X4_CUSTOM_HEADER)
+#define MLK_FIPS202X4_HEADER_FILE "fips202/fips202x4.h"
+#else
+#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
+#endif
+
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include <string.h>
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to a T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+    (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build.  If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+  (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+  (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+  (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+    defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+  MLK_ALIGN T mlk_alloc_##v[N];     \
+  T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context)                     \
+  do                                                   \
+  {                                                    \
+    mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+    (v) = NULL;                                        \
+  } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
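+/* The indirection makes MLK_CONTEXT_PARAMETERS_3 expand to an argument list
+ * before MLK_CUSTOM_ALLOC / MLK_CUSTOM_FREE is invoked on it; written
+ * directly, `f MLK_CONTEXT_PARAMETERS_3(...)` would leave `f` unexpanded. */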
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+  MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context)                                            \
+  do                                                                          \
+  {                                                                           \
+    if (v != NULL)                                                            \
+    {                                                                         \
+      mlk_zeroize(v, sizeof(T) * (N));                                        \
+      MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+      v = NULL;                                                               \
+    }                                                                         \
+  } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
+
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An rng failure occurred. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* !MLK_COMMON_H */
diff --git a/mlkem_native/src/compress.c b/mlkem_native/src/compress.c
new file mode 100644
index 0000000..50da36d
--- /dev/null
+++ b/mlkem_native/src/compress.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#include "common.h"
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+/* Portable C code: compress every coefficient of `a` to 4 bits using
+ * mlk_scalar_compress_d4() and pack the results into `r`.
+ * Reference: `poly_compress()` in the reference implementation @[REF],
+ *            for ML-KEM-{512,768}. In contrast to the reference, we assume
+ *            unsigned canonical coefficients here; the reference works with
+ *            coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
+{
+  unsigned i;
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+    /* Two 4-bit values per byte, the lower-indexed one in the low nibble.
+     * All t[i] are 4-bit wide, so the truncations do not alter the value. */
+    r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+    r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+    r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+    r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
+  }
+}
+
+MLK_INTERNAL_API
+void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
+                          const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+  int ret; /* status reported by the native implementation */
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  ret = mlk_poly_compress_d4_native(r, a->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_compress_d4_c(r, a);
+}
+
+/* Portable C code: compress every coefficient of `a` to 10 bits using
+ * mlk_scalar_compress_d10() and pack each group of 4 values into 5 bytes.
+ * Reference: Embedded into `polyvec_compress()` in the reference
+ *            implementation, for ML-KEM-{512,768}. In contrast to it, we
+ *            assume unsigned canonical coefficients here; the reference works
+ *            with coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
+{
+  unsigned j;
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = mlk_scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+    r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+    r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+    r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+    r[5 * j + 4] = (uint8_t)(t[3] >> 2);
+  }
+}
+
+MLK_INTERNAL_API
+void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
+                           const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+  int ret; /* status reported by the native implementation */
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  ret = mlk_poly_compress_d10_native(r, a->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_compress_d10_c(r, a);
+}
+
+/* Portable C code: each byte of `a` yields two coefficients (low nibble
+ * first). Reference: `poly_decompress()` in @[REF], for ML-KEM-{512,768}. */
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = mlk_scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = mlk_scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_decompress_d4(mlk_poly *r,
+                            const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+  int ret; /* status reported by the native implementation */
+  ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_decompress_d4_c(r, a);
+}
+
+/* Portable C code. Reference: Embedded into `polyvec_decompress()` in the
+ *            reference implementation, for ML-KEM-{512,768}. */
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+    /* Unpack 5 bytes into four 10-bit values (each masked with 0x3FF). */
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = mlk_scalar_decompress_d10(t[k]);
+    }
+  }
+
+  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_decompress_d10(mlk_poly *r,
+                             const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+  int ret; /* status reported by the native implementation */
+  ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_decompress_d10_c(r, a);
+}
+#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
+
+#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
+/* Portable C code: compress every coefficient of `a` to 5 bits using
+ * mlk_scalar_compress_d5() and pack each group of 8 values into 5 bytes.
+ * Reference: `poly_compress()` in the reference implementation @[REF],
+ *            for ML-KEM-1024. In contrast to the reference, we assume
+ *            unsigned canonical coefficients here; the reference works with
+ *            coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
+{
+  unsigned i;
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+    /* Pack the eight 5-bit values t[0..7] into 5 output bytes. */
+    r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+    r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+    r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+    r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+    r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
+  }
+}
+
+MLK_INTERNAL_API
+void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
+                          const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+  int ret; /* status reported by the native implementation */
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  ret = mlk_poly_compress_d5_native(r, a->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_compress_d5_c(r, a);
+}
+
+/* Portable C code: compress every coefficient of `a` to 11 bits using
+ * mlk_scalar_compress_d11() and pack each group of 8 values into 11 bytes.
+ * Reference: Embedded into `polyvec_compress()` in the reference
+ *            implementation, for ML-KEM-1024. In contrast to it, we assume
+ *            unsigned canonical coefficients here; the reference works with
+ *            coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
+{
+  unsigned j;
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = mlk_scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+    r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+    r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+    r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+    r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+    r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+    r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+    r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+    r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+    r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+    r[11 * j + 10] = (uint8_t)(t[7] >> 3);
+  }
+}
+
+MLK_INTERNAL_API
+void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
+                           const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+  int ret; /* status reported by the native implementation */
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  ret = mlk_poly_compress_d11_native(r, a->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_compress_d11_c(r, a);
+}
+
+/* Portable C code. Reference: `poly_decompress()` in the reference
+ *            implementation @[REF], for ML-KEM-1024. */
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5; /* first of the 5 input bytes of group i */
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = mlk_scalar_decompress_d5(t[j]);
+    }
+  }
+
+  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_decompress_d5(mlk_poly *r,
+                            const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+  int ret; /* status reported by the native implementation */
+  ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_decompress_d5_c(r, a);
+}
+
+/* Portable C code. Reference: Embedded into `polyvec_decompress()` in the
+ *            reference implementation, for ML-KEM-1024. */
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j]; /* 11 bytes hold eight 11-bit values */
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = mlk_scalar_decompress_d11(t[k]);
+    }
+  }
+
+  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_decompress_d11(mlk_poly *r,
+                             const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+  int ret; /* status reported by the native implementation */
+  ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_decompress_d11_c(r, a);
+}
+
+#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
+
+/* Portable C code: serializes `a` by packing each pair of coefficients into
+ * 3 bytes of `r`. Reference: `poly_tobytes()` in the reference implementation
+ * @[REF]. In contrast to the reference implementation, we assume unsigned
+ * canonical coefficients here; the reference implementation works with
+ * coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+                                            const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
+{
+  unsigned i;
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    /* The conversion to uint16_t is safe since we assume that
+     * the coefficients of `a` are non-negative. */
+    const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+    const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     *
+     * The conversion to uint8_t does not alter the value.
+     */
+    r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
+
+    /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+     * does not alter the value because t1 is 12-bit wide. */
+    r[3 * i + 2] = (uint8_t)(t1 >> 4);
+  }
+}
+
+MLK_INTERNAL_API
+void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+  int ret; /* status reported by the native implementation */
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  ret = mlk_poly_tobytes_native(r, a->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_tobytes_c(r, a);
+}
+
+/* Portable C code. Reference: `poly_frombytes()` in @[REF]. */
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+                                              const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_UINT12_LIMIT)))
+  { /* 3 input bytes hold two 12-bit coefficients */
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+    r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
+  }
+
+  /* Note that the coefficients are not canonical */
+  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+  int ret; /* status reported by the native implementation */
+  ret = mlk_poly_frombytes_native(r->coeffs, a);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native implementation reported success */
+  }
+#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+  /* Native code not enabled, or it did not report success: use C code. */
+  mlk_poly_frombytes_c(r, a);
+}
+
+/* Reference: `poly_frommsg()` in the reference implementation @[REF].
+ *            - We use a value barrier around the bit-selection mask to
+ *              reduce the risk of compiler-introduced branches.
+ *              The reference implementation contains the expression
+ *              `(msg[i] >> j) & 1` which the compiler can reason must
+ *              be either 0 or 1. */
+MLK_INTERNAL_API
+void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i; /* index of the message byte being expanded */
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j; /* bit position within msg[i]; fills coefficient 8 * i + j */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* mlk_ct_sel_int16(MLKEM_Q_HALF, 0, b) is `Decompress_1(b != 0)`
+       * as per @[FIPS203, Eq (4.8)]. */
+
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
+      r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
+    }
+  }
+  mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+/* Packs one bit per coefficient of `a` into msg, coefficient 8*i+j going to
+ * bit j of msg[i]; each bit is mlk_scalar_compress_d1() of the coefficient.
+ * Reference: `poly_tomsg()` in the reference implementation @[REF].
+ *            - In contrast to the reference implementation, we assume
+ *              unsigned canonical coefficients here; the reference works
+ *              with coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_INTERNAL_API
+void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
+{
+  unsigned i;
+  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= (uint8_t)(t << j);
+    }
+  }
+}
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(compress)
+
+#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mlkem_native/src/compress.h b/mlkem_native/src/compress.h
new file mode 100644
index 0000000..b16b088
--- /dev/null
+++ b/mlkem_native/src/compress.h
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#ifndef MLK_COMPRESS_H
+#define MLK_COMPRESS_H
+
+
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/************************************************************
+ * Name: mlk_scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ *
+ * Specification: Compress_1 from @[FIPS203, Eq (4.7)].
+ *
+ ************************************************************/
+
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
+__contract__(
+  requires(0 <= u && u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  /* Compute as follows:
+   * ```
+   * round(u * 2 / MLKEM_Q)
+   *   = round(u * 2 * (2^31 / MLKEM_Q) / 2^31)
+   *  ~= round(u * 2 * round(2^31 / MLKEM_Q) / 2^31)
+   * ```
+   */
+  /* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
+  uint32_t d0 = (uint32_t)u * 1290168;
+  /* Unsigned shifting by 31 positions leaves only the top bit. */
+  return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: mlk_scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ *
+ * Specification: Compress_4 from @[FIPS203, Eq (4.7)].
+ *
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/* Reference: Embedded into `poly_compress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
+__contract__(
+  requires(0 <= u && u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  /* Compute as follows:
+   * ```
+   * round(u * 16 / MLKEM_Q)
+   *   = round(u * 16 * (2^28 / MLKEM_Q) / 2^28)
+   *  ~= round(u * 16 * round(2^28 / MLKEM_Q) / 2^28)
+   * ```
+   */
+  /* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
+  uint32_t d0 = (uint32_t)u * 1290160;
+  /* The return value is < 16, so not altered by the conversion to uint8_t. */
+  return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: mlk_scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 16
+ *                 to be decompressed.
+ *
+ * Specification: Decompress_4 from @[FIPS203, Eq (4.8)].
+ *
+ ************************************************************/
+
+/* Reference: Embedded into `poly_decompress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+  /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+   * conversion to int16_t. */
+  return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
+
+/************************************************************
+ * Name: mlk_scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ *
+ * Specification: Compress_5 from @[FIPS203, Eq (4.7)].
+ *
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/* Reference: Embedded into `poly_compress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
+__contract__(
+  requires(0 <= u && u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  /* Compute as follows:
+   * ```
+   * round(u * 32 / MLKEM_Q)
+   *   = round(u * 32 * (2^27 / MLKEM_Q) / 2^27)
+   *  ~= round(u * 32 * round(2^27 / MLKEM_Q) / 2^27)
+   * ```
+   */
+  /* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
+  uint32_t d0 = (uint32_t)u * 1290176;
+  /* The return value is < 32, so not altered by the conversion to uint8_t. */
+  return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: mlk_scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 32
+ *                 to be decompressed.
+ *
+ * Specification: Decompress_5 from @[FIPS203, Eq (4.8)].
+ *
+ ************************************************************/
+
+/* Reference: Embedded into `poly_decompress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+  /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+   * conversion to int16_t. */
+  return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
+
+/************************************************************
+ * Name: mlk_scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ *
+ * Specification: Compress_10 from @[FIPS203, Eq (4.7)].
+ *
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX for large
+ * values of u. NOTE(review): it is done in uint64_t, where no wrap occurs.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/* Reference: Embedded into `polyvec_compress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
+__contract__(
+  requires(0 <= u && u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  /* Compute as follows:
+   * ```
+   * round(u * 1024 / MLKEM_Q)
+   *   = round(u * 1024 * (2^33 / MLKEM_Q) / 2^33)
+   *  ~= round(u * 1024 * round(2^33 / MLKEM_Q) / 2^33)
+   * ```
+   */
+  /* check-magic: 2642263040 == 2^10 * round(2^33 / MLKEM_Q) */
+  uint64_t d0 = (uint64_t)u * 2642263040;
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: mlk_scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ *
+ * Specification: Decompress_10 from @[FIPS203, Eq (4.8)].
+ *
+ ************************************************************/
+
+/* Reference: Embedded into `polyvec_decompress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+  /* round(u * q / 2^10): adding 2^9 before the shift rounds to nearest. */
+  const uint32_t biased = ((uint32_t)u * MLKEM_Q) + 512;
+  return (int16_t)(biased >> 10); /* in 0..MLKEM_Q-1, so the cast is exact */
+}
+
+/************************************************************
+ * Name: mlk_scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ *
+ * Specification: Compress_11 from @[FIPS203, Eq (4.7)].
+ *
+ ************************************************************/
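+/* The product in this routine exceeds UINT32_MAX for large values of u, so
+ * it is formed in uint64_t; it stays far below 2^64 and does not wrap.
+ * NOTE(review): the unsigned-overflow exemption below therefore looks
+ * unnecessary - confirm before removing it. */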
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/* Reference: Embedded into `polyvec_compress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
+__contract__(
+  requires(0 <= u && u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  /* Division-free rounding through a fixed-point reciprocal of MLKEM_Q:
+   * ```
+   * round(u * 2048 / MLKEM_Q)
+   *   = round(u * 2048 * (2^33 / MLKEM_Q) / 2^33)
+   *  ~= round(u * 2048 * round(2^33 / MLKEM_Q) / 2^33)
+   * ```
+   */
+  /* check-magic: 5284526080 == 2^11 * round(2^33 / MLKEM_Q) */
+  const uint64_t scaled = (uint64_t)u * 5284526080;
+  const uint64_t rounded = (scaled + ((uint64_t)1u << 32)) >> 33;
+  return (rounded & 0x7FF); /* keep the low 11 bits, i.e. reduce mod 2^11 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: mlk_scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ *
+ * Specification: Decompress_11 from @[FIPS203, Eq (4.8)].
+ *
+ ************************************************************/
+
+/* Reference: Embedded into `polyvec_decompress()` in the
+ *            reference implementation @[REF]. */
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+  /* round(u * q / 2^11): adding 2^10 before the shift rounds to nearest. */
+  const uint32_t biased = ((uint32_t)u * MLKEM_Q) + 1024;
+  return (int16_t)(biased >> 11); /* in 0..MLKEM_Q-1, so the cast is exact */
+}
+
+#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        mlk_poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const mlk_poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_4 (Compress_4 (a))`:
+ *                - ByteEncode_d: @[FIPS203, Algorithm 5],
+ *                - Compress_d: @[FIPS203, Eq (4.7)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `ByteEncode_{d_v} (Compress_{d_v} (v))` appears in
+ *                  @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L23],
+ *                  where `d_v=4` for ML-KEM-{512,768} @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
+                          const mlk_poly *a);
+
+#define mlk_poly_compress_d10 MLK_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        mlk_poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const mlk_poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_10 (Compress_10 (a))`:
+ *                - ByteEncode_d: @[FIPS203, Algorithm 5],
+ *                - Compress_d: @[FIPS203, Eq (4.7)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `ByteEncode_{d_u} (Compress_{d_u} (u))` appears in
+ *                  @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
+ *                  where `d_u=10` for ML-KEM-{512,768} @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
+                           const mlk_poly *a);
+
+#define mlk_poly_decompress_d4 MLK_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        mlk_poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of mlk_poly_compress_d4
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ * Specification: Implements `Decompress_4 (ByteDecode_4 (a))`:
+ *                - ByteDecode_d: @[FIPS203, Algorithm 6],
+ *                - Decompress_d: @[FIPS203, Eq (4.8)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `Decompress_{d_v} (ByteDecode_{d_v} (v))` appears in
+ *                  @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L4],
+ *                  where `d_v=4` for ML-KEM-{512,768} @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_decompress_d4(mlk_poly *r,
+                            const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define mlk_poly_decompress_d10 MLK_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        mlk_poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of mlk_poly_compress_d10
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ * Specification: Implements `Decompress_10 (ByteDecode_10 (a))`:
+ *                - ByteDecode_d: @[FIPS203, Algorithm 6],
+ *                - Decompress_d: @[FIPS203, Eq (4.8)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `Decompress_{d_u} (ByteDecode_{d_u} (u))` appears in
+ *                  @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3],
+ *                  where `d_u=10` for ML-KEM-{512,768} @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_decompress_d10(mlk_poly *r,
+                             const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
+
+#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
+#define mlk_poly_compress_d5 MLK_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        mlk_poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const mlk_poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_5 (Compress_5 (a))`:
+ *                - ByteEncode_d: @[FIPS203, Algorithm 5],
+ *                - Compress_d: @[FIPS203, Eq (4.7)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `ByteEncode_{d_v} (Compress_{d_v} (v))` appears in
+ *                  @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L23],
+ *                  where `d_v=5` for ML-KEM-1024 @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
+                          const mlk_poly *a);
+
+#define mlk_poly_compress_d11 MLK_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        mlk_poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const mlk_poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_11 (Compress_11 (a))`:
+ *                - ByteEncode_d: @[FIPS203, Algorithm 5],
+ *                - Compress_d: @[FIPS203, Eq (4.7)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `ByteEncode_{d_u} (Compress_{d_u} (u))` appears in
+ *                  @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
+ *                  where `d_u=11` for ML-KEM-1024 @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
+                           const mlk_poly *a);
+
+#define mlk_poly_decompress_d5 MLK_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        mlk_poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of mlk_poly_compress_d5
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ * Specification: Implements `Decompress_5 (ByteDecode_5 (a))`:
+ *                - ByteDecode_d: @[FIPS203, Algorithm 6],
+ *                - Decompress_d: @[FIPS203, Eq (4.8)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `Decompress_{d_v} (ByteDecode_{d_v} (v))` appears in
+ *                  @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L4],
+ *                  where `d_v=5` for ML-KEM-1024 @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_decompress_d5(mlk_poly *r,
+                            const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define mlk_poly_decompress_d11 MLK_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        mlk_poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of mlk_poly_compress_d11
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ * Specification: Implements `Decompress_11 (ByteDecode_11 (a))`:
+ *                - ByteDecode_d: @[FIPS203, Algorithm 6],
+ *                - Decompress_d: @[FIPS203, Eq (4.8)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `Decompress_{d_u} (ByteDecode_{d_u} (u))` appears in
+ *                  @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3],
+ *                  where `d_u=11` for ML-KEM-1024 @[FIPS203, Table 2].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_decompress_d11(mlk_poly *r,
+                             const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
+
+#define mlk_poly_tobytes MLK_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        mlk_poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *
+ * Specification: Implements ByteEncode_12 @[FIPS203, Algorithm 5].
+ *                Extended to vectors as per
+ *                @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYBYTES))
+);
+
+
+#define mlk_poly_frombytes MLK_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        mlk_poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ *
+ * Specification: Implements ByteDecode_12 @[FIPS203, Algorithm 6].
+ *                Extended to vectors as per
+ *                @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+);
+
+
+#define mlk_poly_frommsg MLK_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        mlk_poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ *
+ * Specification: Implements `Decompress_1 (ByteDecode_1 (a))`:
+ *                - ByteDecode_d: @[FIPS203, Algorithm 6],
+ *                - Decompress_d: @[FIPS203, Eq (4.8)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `Decompress_1 (ByteDecode_1 (w))` appears in
+ *                  @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L20].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define mlk_poly_tomsg MLK_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        mlk_poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const mlk_poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ *
+ * Specification: Implements `ByteEncode_1 (Compress_1 (a))`:
+ *                - ByteEncode_d: @[FIPS203, Algorithm 5],
+ *                - Compress_d: @[FIPS203, Eq (4.7)]
+ *                  Extended to vectors as per
+ *                  @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                - `ByteEncode_1 (Compress_1 (w))` appears in
+ *                  @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L7].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
+);
+
+#endif /* !MLK_COMPRESS_H */
diff --git a/mlkem_native/src/debug.c b/mlkem_native/src/debug.c
new file mode 100644
index 0000000..386f526
--- /dev/null
+++ b/mlkem_native/src/debug.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLK_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlk_debug_check_assert(const char *file, int line, const int val)
+{
+  if (val == 0) /* the assertion holds iff val is non-zero */
+  {
+    fprintf(stderr, MLK_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1); /* a failed assertion terminates the process */
+  }
+}
+
+void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                            unsigned len, int lower_bound_exclusive,
+                            int upper_bound_exclusive)
+{
+  unsigned idx;
+  int violations = 0;
+  /* Scan the whole array so that every offending entry gets reported. */
+  for (idx = 0; idx < len; idx++)
+  {
+    const int cur = (int)ptr[idx];
+    if (cur <= lower_bound_exclusive || cur >= upper_bound_exclusive)
+    {
+      fprintf(
+          stderr,
+          MLK_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, idx, cur, lower_bound_exclusive,
+          upper_bound_exclusive);
+      violations = 1;
+    }
+  }
+  if (violations != 0)
+  {
+    exit(1); /* Abort only after all violations have been printed. */
+  }
+}
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED && MLKEM_DEBUG */
+
+MLK_EMPTY_CU(debug)
+
+#endif /* !(!MLK_CONFIG_MULTILEVEL_NO_SHARED && MLKEM_DEBUG) */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef MLK_DEBUG_ERROR_HEADER
diff --git a/mlkem_native/src/debug.h b/mlkem_native/src/debug.h
new file mode 100644
index 0000000..47c864b
--- /dev/null
+++ b/mlkem_native/src/debug.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_DEBUG_H
+#define MLK_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+
+/*************************************************
+ * Name:        mlk_debug_check_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the assertion fails.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlk_debug_check_assert MLK_NAMESPACE(mlkem_debug_assert)
+void mlk_debug_check_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlk_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlk_debug_check_bounds MLK_NAMESPACE(mlkem_debug_check_bounds)
+void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                            unsigned len, int lower_bound_exclusive,
+                            int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define mlk_assert(val) mlk_debug_check_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as mlk_poly or mlk_polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define mlk_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlk_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                         (value_lb) - 1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define mlk_assert_abs_bound(ptr, len, value_abs_bd) \
+  mlk_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define mlk_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  mlk_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define mlk_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  mlk_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+#include "cbmc.h"
+
+#define mlk_assert(val) cassert(val)
+
+#define mlk_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define mlk_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub)              \
+  cassert(forall(kN, 0, (M),                                            \
+                 array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                    \
+  cassert(forall(kN, 0, (M),                                                \
+                 array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* !MLKEM_DEBUG && CBMC */
+
+#define mlk_assert(val) \
+  do                    \
+  {                     \
+  } while (0)
+#define mlk_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+#define mlk_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                 \
+  {                                                  \
+  } while (0)
+
+#define mlk_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+#define mlk_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                           \
+  {                                                            \
+  } while (0)
+
+
+#endif /* !MLKEM_DEBUG && !CBMC */
+#endif /* !MLK_DEBUG_H */
diff --git a/mlkem_native/src/fips202/fips202.c b/mlkem_native/src/fips202/fips202.c
new file mode 100644
index 0000000..4751efb
--- /dev/null
+++ b/mlkem_native/src/fips202/fips202.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [mupq]
+ *   Common files for pqm4, pqm3, pqriscv
+ *   Kannwischer, Petri, Rijneveld, Schwabe, Stoffelen
+ *   https://github.com/mupq/mupq
+ *
+ * - [supercop]
+ *   SUPERCOP benchmarking framework
+ *   Daniel J. Bernstein
+ *   http://bench.cr.yp.to/supercop.html
+ *
+ * - [tweetfips]
+ *   'tweetfips202' FIPS202 implementation
+ *   Van Assche, Bernstein, Schwabe
+ *   https://keccak.team/2015/tweetfips202.html
+ */
+
+/* Based on the CC0 implementation from @[mupq] and the public domain
+ * implementation @[supercop, crypto_hash/keccakc512/simple/]
+ * by Ronny Van Keer, and the public domain @[tweetfips] implementation. */
+
+#include "../common.h"
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+
+#include "../verify.h"
+#include "fips202.h"
+#include "keccakf1600.h"
+
+/*************************************************
+ * Name:        mlk_keccak_absorb_once
+ *
+ * Description: Absorb step of Keccak;
+ *              non-incremental, starts by zeroing the state.
+ *
+ *              WARNING: Must only be called once.
+ *
+ * Arguments:   - uint64_t *s:       pointer to (uninitialized) output Keccak
+ *                                   state
+ *              - unsigned r:        rate in bytes (e.g., 168 for SHAKE128)
+ *              - const uint8_t *m:  pointer to input to be absorbed into s
+ *              - size_t mlen:       length of input in bytes
+ *              - uint8_t p:         domain-separation byte for different
+ *                                   Keccak-derived functions
+ **************************************************/
+static void mlk_keccak_absorb_once(uint64_t *s, unsigned r, const uint8_t *m,
+                                   size_t mlen, uint8_t p)
+__contract__(
+    requires(mlen <= MLK_MAX_BUFFER_SIZE)
+    requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES)
+    requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    requires(memory_no_alias(m, mlen))
+    assigns(memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES)))
+{
+  /* Initialize state: clear all 25 lanes of s */
+  size_t i;
+  for (i = 0; i < 25; ++i)
+  __loop__(invariant(i <= 25))
+  {
+    s[i] = 0;
+  }
+
+  while (mlen >= r) /* absorb each complete block of r bytes */
+  __loop__(
+    assigns(mlen, m, memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    invariant(mlen <= loop_entry(mlen))
+    invariant(m == loop_entry(m) + (loop_entry(mlen) - mlen)))
+  {
+    mlk_keccakf1600_xor_bytes(s, m, 0, r);
+    mlk_keccakf1600_permute(s);
+    mlen -= r;
+    m += r;
+  }
+
+  /* At this point, mlen < r, so the truncations to unsigned are safe below. */
+
+  if (mlen > 0) /* absorb the remaining partial block, if any */
+  {
+    mlk_keccakf1600_xor_bytes(s, m, 0, (unsigned int)mlen);
+  }
+
+  if (mlen == r - 1)
+  {
+    p |= 128; /* p sits at offset r - 1 here: merge the 0x80 bit into it */
+    mlk_keccakf1600_xor_bytes(s, &p, (unsigned int)mlen, 1);
+  }
+  else
+  {
+    mlk_keccakf1600_xor_bytes(s, &p, (unsigned int)mlen, 1);
+    p = 128; /* otherwise 0x80 is applied on its own at offset r - 1 */
+    mlk_keccakf1600_xor_bytes(s, &p, r - 1, 1);
+  }
+}
+
+/*************************************************
+ * Name:        mlk_keccak_squeezeblocks
+ *
+ * Description: Block-level Keccak squeeze; s is permuted before each block
+ *
+ * Arguments:   - uint8_t *h: pointer to output buffer of nblocks * r bytes
+ *              - size_t nblocks: number of blocks to be squeezed
+ *              - uint64_t *s: pointer to input/output state
+ *              - unsigned r: rate in bytes (e.g., 168 for SHAKE128)
+ **************************************************/
+static void mlk_keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s,
+                                     unsigned r)
+__contract__(
+    requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES)
+    requires(nblocks <= 8 /* somewhat arbitrary bound */)
+    requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    requires(memory_no_alias(h, nblocks * r))
+    assigns(memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    assigns(memory_slice(h, nblocks * r)))
+{
+  while (nblocks > 0)
+  __loop__(
+    assigns(h, nblocks,
+      memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES),
+      memory_slice(h, nblocks * r))
+    invariant(nblocks <= loop_entry(nblocks) &&
+      h == loop_entry(h) + r * (loop_entry(nblocks) - nblocks)))
+  {
+    mlk_keccakf1600_permute(s); /* permute first, then read the block out */
+    mlk_keccakf1600_extract_bytes(s, h, 0, r);
+    h += r; /* advance to where the next block is written */
+    nblocks--;
+  }
+}
+
+/*************************************************
+ * Name:        mlk_keccak_squeeze_once
+ *
+ * Description: Keccak squeeze with byte granularity: writes outlen bytes,
+ *              permuting the state before each chunk of at most r bytes.
+ *              WARNING: This must only be called once.
+ *
+ * Arguments:   - uint8_t *h: pointer to output bytes
+ *              - size_t outlen: number of bytes to be squeezed
+ *              - uint64_t *s: pointer to Keccak state
+ *              - unsigned r: rate in bytes (e.g., 168 for SHAKE128)
+ **************************************************/
+static void mlk_keccak_squeeze_once(uint8_t *h, size_t outlen, uint64_t *s,
+                                    unsigned r)
+__contract__(
+    requires(outlen <= MLK_MAX_BUFFER_SIZE)
+    requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES)
+    requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    requires(memory_no_alias(h, outlen))
+    assigns(memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    assigns(memory_slice(h, outlen)))
+{
+  size_t chunk;
+  while (outlen > 0)
+  __loop__(
+    assigns(chunk, h, outlen,
+      memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES),
+      memory_slice(h, outlen))
+    invariant(outlen <= loop_entry(outlen) &&
+      h == loop_entry(h) + (loop_entry(outlen) - outlen)))
+  {
+    /* Every chunk is read from a freshly permuted state. */
+    mlk_keccakf1600_permute(s);
+
+    /* Take a full block of r bytes unless fewer bytes remain; chunk <= r
+     * in both cases, so narrowing it to unsigned below loses nothing. */
+    chunk = r;
+    if (outlen < r)
+    {
+      chunk = outlen;
+    }
+    mlk_keccakf1600_extract_bytes(s, h, 0, (unsigned int)chunk);
+    h += chunk;
+    outlen -= chunk;
+  }
+}
+
+void mlk_shake128_absorb_once(mlk_shake128ctx *state, const uint8_t *input,
+                              size_t inlen)
+{ /* One-shot absorb: rate SHAKE128_RATE, domain-separation byte 0x1F */
+  mlk_keccak_absorb_once(state->ctx, SHAKE128_RATE, input, inlen, 0x1F);
+}
+
+void mlk_shake128_squeezeblocks(uint8_t *output, size_t nblocks,
+                                mlk_shake128ctx *state)
+{ /* Writes nblocks blocks of SHAKE128_RATE bytes each to output */
+  mlk_keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE128_RATE);
+}
+
+void mlk_shake128_init(mlk_shake128ctx *state) { (void)state; } /* no-op */
+void mlk_shake128_release(mlk_shake128ctx *state)
+{
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(state, sizeof(mlk_shake128ctx)); /* covers the whole context */
+}
+
+typedef mlk_shake128ctx mlk_shake256ctx;
+void mlk_shake256(uint8_t *output, size_t outlen, const uint8_t *input,
+                  size_t inlen)
+{
+  mlk_shake256ctx xof;
+  /* One-shot absorb of the input with domain-separation byte 0x1F */
+  mlk_keccak_absorb_once(xof.ctx, SHAKE256_RATE, input, inlen, 0x1F);
+  /* Produce the requested outlen bytes of output */
+  mlk_keccak_squeeze_once(output, outlen, xof.ctx, SHAKE256_RATE);
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(&xof, sizeof(xof));
+}
+
+void mlk_sha3_256(uint8_t *output, const uint8_t *input, size_t inlen)
+{
+  uint64_t keccak_state[25];
+  /* One-shot absorb of the input with domain-separation byte 0x06 */
+  mlk_keccak_absorb_once(keccak_state, SHA3_256_RATE, input, inlen, 0x06);
+  /* Write the 32-byte digest to output */
+  mlk_keccak_squeeze_once(output, 32, keccak_state, SHA3_256_RATE);
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(keccak_state, sizeof(keccak_state));
+}
+
+void mlk_sha3_512(uint8_t *output, const uint8_t *input, size_t inlen)
+{
+  uint64_t keccak_state[25];
+  /* One-shot absorb of the input with domain-separation byte 0x06 */
+  mlk_keccak_absorb_once(keccak_state, SHA3_512_RATE, input, inlen, 0x06);
+  /* Write the 64-byte digest to output */
+  mlk_keccak_squeeze_once(output, 64, keccak_state, SHA3_512_RATE);
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(keccak_state, sizeof(keccak_state));
+}
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(fips202)
+
+#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mlkem_native/src/fips202/fips202.h b/mlkem_native/src/fips202/fips202.h
new file mode 100644
index 0000000..9ebc158
--- /dev/null
+++ b/mlkem_native/src/fips202/fips202.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_FIPS202_FIPS202_H
+#define MLK_FIPS202_FIPS202_H
+
+#include "../cbmc.h"
+#include "../common.h"
+
+#define SHAKE128_RATE 168
+#define SHAKE256_RATE 136
+#define SHA3_256_RATE 136
+#define SHA3_384_RATE 104
+#define SHA3_512_RATE 72
+
+/* Context for non-incremental API */
+typedef struct
+{
+  uint64_t ctx[25];
+} MLK_ALIGN mlk_shake128ctx;
+
+#define mlk_shake128_absorb_once MLK_NAMESPACE(shake128_absorb_once)
+/*************************************************
+ * Name:        mlk_shake128_absorb_once
+ *
+ * Description: One-shot absorb step of the SHAKE128 XOF.
+ *
+ *              For call-sites (in mlkem-native):
+ *              - This function MUST ONLY be called straight after
+ *                mlk_shake128_init().
+ *              - This function MUST ONLY be called once.
+ *
+ *              Consequently, for providers of custom FIPS202 code
+ *              to be used with mlkem-native:
+ *              - You may assume that the input context is
+ *                freshly initialized via mlk_shake128_init().
+ *              - You may assume that this function is
+ *                called exactly once.
+ *
+ * Arguments:   - mlk_shake128ctx *state:   pointer to SHAKE128 context
+ *              - const uint8_t *input: pointer to input to be absorbed into
+ *                                      the state
+ *              - size_t inlen:         length of input in bytes
+ **************************************************/
+void mlk_shake128_absorb_once(mlk_shake128ctx *state, const uint8_t *input,
+                              size_t inlen)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(state, sizeof(mlk_shake128ctx)))
+  requires(memory_no_alias(input, inlen))
+  assigns(memory_slice(state, sizeof(mlk_shake128ctx)))
+);
+
+#define mlk_shake128_squeezeblocks MLK_NAMESPACE(shake128_squeezeblocks)
+/*************************************************
+ * Name:        mlk_shake128_squeezeblocks
+ *
+ * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of
+ *              SHAKE128_RATE bytes each. Modifies the state. Can be called
+ *              multiple times to keep squeezing, i.e., is incremental.
+ *
+ * Arguments:   - uint8_t *output:     pointer to output blocks
+ *              - size_t nblocks:      number of blocks to be squeezed (written
+ *                                     to output)
+ *              - mlk_shake128ctx *state:  pointer to in/output Keccak state
+ **************************************************/
+void mlk_shake128_squeezeblocks(uint8_t *output, size_t nblocks,
+                                mlk_shake128ctx *state)
+__contract__(
+  requires(nblocks <= 8 /* somewhat arbitrary bound */)
+  requires(memory_no_alias(state, sizeof(mlk_shake128ctx)))
+  requires(memory_no_alias(output, nblocks * SHAKE128_RATE))
+  assigns(memory_slice(output, nblocks * SHAKE128_RATE), memory_slice(state, sizeof(mlk_shake128ctx)))
+);
+
+#define mlk_shake128_init MLK_NAMESPACE(shake128_init)
+void mlk_shake128_init(mlk_shake128ctx *state); /* Initializes a context; mlk_shake128_absorb_once() must only be called straight after this (see its call-site rules). */
+
+#define mlk_shake128_release MLK_NAMESPACE(shake128_release)
+void mlk_shake128_release(mlk_shake128ctx *state); /* Releases a context once it is no longer needed; the bundled fips202.c implementation wipes it with mlk_zeroize(). */
+
+/* One-stop SHAKE256 call. Aliasing between input and
+ * output is not permitted */
+#define mlk_shake256 MLK_NAMESPACE(shake256)
+/*************************************************
+ * Name:        mlk_shake256
+ *
+ * Description: SHAKE256 XOF with non-incremental API
+ *
+ * Arguments:   - uint8_t *output:      pointer to output
+ *              - size_t outlen:        requested output length in bytes
+ *              - const uint8_t *input: pointer to input
+ *              - size_t inlen:         length of input in bytes
+ **************************************************/
+void mlk_shake256(uint8_t *output, size_t outlen, const uint8_t *input,
+                  size_t inlen)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(outlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(input, inlen))
+  requires(memory_no_alias(output, outlen))
+  assigns(memory_slice(output, outlen))
+);
+
+/* One-stop SHA3_256 call. Aliasing between input and
+ * output is not permitted */
+#define SHA3_256_HASHBYTES 32
+#define mlk_sha3_256 MLK_NAMESPACE(sha3_256)
+/*************************************************
+ * Name:        mlk_sha3_256
+ *
+ * Description: SHA3-256 with non-incremental API
+ *
+ * Arguments:   - uint8_t *output:      pointer to output
+ *              - const uint8_t *input: pointer to input
+ *              - size_t inlen:         length of input in bytes
+ **************************************************/
+void mlk_sha3_256(uint8_t *output, const uint8_t *input, size_t inlen)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(input, inlen))
+  requires(memory_no_alias(output, SHA3_256_HASHBYTES))
+  assigns(memory_slice(output, SHA3_256_HASHBYTES))
+);
+
+/* One-stop SHA3_512 call. Aliasing between input and
+ * output is not permitted */
+#define SHA3_512_HASHBYTES 64
+#define mlk_sha3_512 MLK_NAMESPACE(sha3_512)
+/*************************************************
+ * Name:        mlk_sha3_512
+ *
+ * Description: SHA3-512 with non-incremental API
+ *
+ * Arguments:   - uint8_t *output:      pointer to output
+ *              - const uint8_t *input: pointer to input
+ *              - size_t inlen:         length of input in bytes
+ **************************************************/
+void mlk_sha3_512(uint8_t *output, const uint8_t *input, size_t inlen)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(input, inlen))
+  requires(memory_no_alias(output, SHA3_512_HASHBYTES))
+  assigns(memory_slice(output, SHA3_512_HASHBYTES))
+);
+
+#if !defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) || \
+    !defined(MLK_USE_FIPS202_X4_NATIVE)
+/* If you provide your own FIPS-202 implementation in which the
+ * Keccak-f1600-x4 implementation falls back to 4-fold Keccak-f1600,
+ * set this to gain a small speedup. */
+#define FIPS202_X4_DEFAULT_IMPLEMENTATION
+#endif /* !MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 || !MLK_USE_FIPS202_X4_NATIVE \
+        */
+
+
+#endif /* !MLK_FIPS202_FIPS202_H */
diff --git a/mlkem_native/src/fips202/fips202x4.c b/mlkem_native/src/fips202/fips202x4.c
new file mode 100644
index 0000000..95f8848
--- /dev/null
+++ b/mlkem_native/src/fips202/fips202x4.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#include "../common.h"
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "../verify.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "keccakf1600.h"
+
+typedef mlk_shake128x4ctx mlk_shake256x4_ctx; /* The 4-way SHAKE256 code reuses the 4-way SHAKE128 context type. */
+
+static void mlk_keccak_absorb_once_x4(uint64_t *s, unsigned r, /* Absorbs four inputs of inlen bytes each into the 4-way state s at rate r (bytes), then pads using byte p. s is XORed into, not reset: both callers in this file zero it first. */
+                                      const uint8_t *in0, const uint8_t *in1,
+                                      const uint8_t *in2, const uint8_t *in3,
+                                      size_t inlen, uint8_t p)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+  requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES)
+  requires(memory_no_alias(in0, inlen))
+  requires(memory_no_alias(in1, inlen))
+  requires(memory_no_alias(in2, inlen))
+  requires(memory_no_alias(in3, inlen))
+  assigns(memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)))
+{
+  while (inlen >= r) /* whole blocks: XOR r bytes from each input into the state, then permute */
+  __loop__(
+    assigns(inlen, in0, in1, in2, in3, memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+    invariant(inlen <= loop_entry(inlen))
+    invariant(in0 == loop_entry(in0) + (loop_entry(inlen) - inlen))
+    invariant(in1 == loop_entry(in1) + (loop_entry(inlen) - inlen))
+    invariant(in2 == loop_entry(in2) + (loop_entry(inlen) - inlen))
+    invariant(in3 == loop_entry(in3) + (loop_entry(inlen) - inlen)))
+  {
+    mlk_keccakf1600x4_xor_bytes(s, in0, in1, in2, in3, 0, r);
+    mlk_keccakf1600x4_permute(s);
+
+    in0 += r; /* all four inputs advance in lockstep by one block */
+    in1 += r;
+    in2 += r;
+    in3 += r;
+    inlen -= r;
+  }
+
+  /* At this point, inlen < r, so the truncations to unsigned are safe below. */
+
+  if (inlen > 0) /* leftover partial block: XOR it in without permuting */
+  {
+    mlk_keccakf1600x4_xor_bytes(s, in0, in1, in2, in3, 0, (unsigned int)inlen);
+  }
+
+  if (inlen == r - 1) /* only one byte of the block is left: p and the closing 128 (0x80) must share it */
+  {
+    p |= 128;
+    mlk_keccakf1600x4_xor_bytes(s, &p, &p, &p, &p, (unsigned int)inlen, 1);
+  }
+  else /* otherwise p goes right after the data and 128 (0x80) goes into the block's last byte, offset r - 1 */
+  {
+    mlk_keccakf1600x4_xor_bytes(s, &p, &p, &p, &p, (unsigned int)inlen, 1);
+    p = 128;
+    mlk_keccakf1600x4_xor_bytes(s, &p, &p, &p, &p, r - 1, 1);
+  }
+}
+
+static void mlk_keccak_squeezeblocks_x4(uint8_t *out0, uint8_t *out1, /* Squeezes nblocks blocks of r bytes from the 4-way state s into each of the four outputs. */
+                                        uint8_t *out2, uint8_t *out3,
+                                        size_t nblocks, uint64_t *s, unsigned r)
+__contract__(
+    requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES)
+    requires(r == SHAKE128_RATE || r == SHAKE256_RATE)
+    requires(nblocks <= (MLK_MAX_BUFFER_SIZE / SHAKE256_RATE))
+    requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+    requires(memory_no_alias(out0, nblocks * r))
+    requires(memory_no_alias(out1, nblocks * r))
+    requires(memory_no_alias(out2, nblocks * r))
+    requires(memory_no_alias(out3, nblocks * r))
+    assigns(memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+    assigns(memory_slice(out0, nblocks * r))
+    assigns(memory_slice(out1, nblocks * r))
+    assigns(memory_slice(out2, nblocks * r))
+    assigns(memory_slice(out3, nblocks * r)))
+{
+  while (nblocks > 0) /* one iteration per output block; nblocks == 0 writes nothing and leaves s untouched */
+  __loop__(
+    assigns(out0, out1, out2, out3, nblocks,
+            memory_slice(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY),
+            memory_slice(out0, nblocks * r),
+            memory_slice(out1, nblocks * r),
+            memory_slice(out2, nblocks * r),
+            memory_slice(out3, nblocks * r))
+    invariant(nblocks <= loop_entry(nblocks) &&
+      out0 == loop_entry(out0) + r * (loop_entry(nblocks) - nblocks) &&
+      out1 == loop_entry(out1) + r * (loop_entry(nblocks) - nblocks) &&
+      out2 == loop_entry(out2) + r * (loop_entry(nblocks) - nblocks) &&
+      out3 == loop_entry(out3) + r * (loop_entry(nblocks) - nblocks)))
+  {
+    mlk_keccakf1600x4_permute(s); /* permute first, then read: the state is permuted before every extracted block */
+    mlk_keccakf1600x4_extract_bytes(s, out0, out1, out2, out3, 0, r);
+
+    out0 += r; /* all four outputs advance in lockstep by one block */
+    out1 += r;
+    out2 += r;
+    out3 += r;
+    nblocks--;
+  }
+}
+
+void mlk_shake128x4_absorb_once(mlk_shake128x4ctx *state, const uint8_t *in0, /* 4-way SHAKE128 absorb: resets the context, then absorbs four inputs of inlen bytes each. */
+                                const uint8_t *in1, const uint8_t *in2,
+                                const uint8_t *in3, size_t inlen)
+{
+  mlk_memset(state, 0, sizeof(mlk_shake128x4ctx)); /* the absorb helper only XORs into the state, so it is zeroed here first */
+  mlk_keccak_absorb_once_x4(state->ctx, SHAKE128_RATE, in0, in1, in2, in3,
+                            inlen, 0x1F); /* 0x1F is the helper's padding byte `p` */
+}
+
+void mlk_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, /* 4-way SHAKE128 squeeze: writes nblocks blocks of SHAKE128_RATE bytes to each output. */
+                                  uint8_t *out3, size_t nblocks,
+                                  mlk_shake128x4ctx *state)
+{
+  mlk_keccak_squeezeblocks_x4(out0, out1, out2, out3, nblocks, state->ctx,
+                              SHAKE128_RATE); /* thin wrapper: only fixes the rate */
+}
+
+void mlk_shake128x4_init(mlk_shake128x4ctx *state) { (void)state; } /* Deliberate no-op: mlk_shake128x4_absorb_once() zeroes the context itself. */
+void mlk_shake128x4_release(mlk_shake128x4ctx *state) /* Tears down a 4-way SHAKE128 context by wiping it. */
+{
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(state, sizeof(mlk_shake128x4ctx)); /* the whole 4-way context is handed to mlk_zeroize() */
+}
+
+static void mlk_shake256x4_absorb_once(mlk_shake256x4_ctx *state, /* 4-way SHAKE256 absorb: resets the context, then absorbs four inputs of inlen bytes each. */
+                                       const uint8_t *in0, const uint8_t *in1,
+                                       const uint8_t *in2, const uint8_t *in3,
+                                       size_t inlen)
+{
+  mlk_memset(state, 0, sizeof(mlk_shake128x4ctx)); /* sizeof the SHAKE128 type is correct here: mlk_shake256x4_ctx is a typedef of mlk_shake128x4ctx */
+  mlk_keccak_absorb_once_x4(state->ctx, SHAKE256_RATE, in0, in1, in2, in3,
+                            inlen, 0x1F); /* 0x1F is the helper's padding byte `p` */
+}
+
+static void mlk_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, /* 4-way SHAKE256 squeeze: writes nblocks blocks of SHAKE256_RATE bytes to each output. */
+                                         uint8_t *out2, uint8_t *out3,
+                                         size_t nblocks,
+                                         mlk_shake256x4_ctx *state)
+{
+  mlk_keccak_squeezeblocks_x4(out0, out1, out2, out3, nblocks, state->ctx,
+                              SHAKE256_RATE); /* thin wrapper: only fixes the rate */
+}
+
+void mlk_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, /* One-shot 4-way SHAKE256: four inputs of inlen bytes each produce four outputs of outlen bytes each. */
+                    size_t outlen, uint8_t *in0, uint8_t *in1, uint8_t *in2,
+                    uint8_t *in3, size_t inlen)
+{
+  mlk_shake256x4_ctx statex;
+  size_t nblocks = outlen / SHAKE256_RATE; /* number of whole rate-sized blocks that fit in the requested output */
+  uint8_t tmp0[SHAKE256_RATE]; /* scratch blocks for the final partial block, one per output */
+  uint8_t tmp1[SHAKE256_RATE];
+  uint8_t tmp2[SHAKE256_RATE];
+  uint8_t tmp3[SHAKE256_RATE];
+
+  mlk_shake256x4_absorb_once(&statex, in0, in1, in2, in3, inlen); /* zeroes statex before absorbing, so no separate init is needed */
+  mlk_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &statex); /* whole blocks go straight into the caller's buffers */
+
+  out0 += nblocks * SHAKE256_RATE;
+  out1 += nblocks * SHAKE256_RATE;
+  out2 += nblocks * SHAKE256_RATE;
+  out3 += nblocks * SHAKE256_RATE;
+
+  outlen -= nblocks * SHAKE256_RATE; /* now the remainder, strictly less than SHAKE256_RATE */
+
+  if (outlen) /* partial tail: squeeze one full block into scratch and copy only the bytes still needed */
+  {
+    mlk_shake256x4_squeezeblocks(tmp0, tmp1, tmp2, tmp3, 1, &statex);
+    mlk_memcpy(out0, tmp0, outlen);
+    mlk_memcpy(out1, tmp1, outlen);
+    mlk_memcpy(out2, tmp2, outlen);
+    mlk_memcpy(out3, tmp3, outlen);
+  }
+
+  /* Wipe the state and the scratch blocks, which may hold unreturned output. Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(&statex, sizeof(statex));
+  mlk_zeroize(tmp0, sizeof(tmp0));
+  mlk_zeroize(tmp1, sizeof(tmp1));
+  mlk_zeroize(tmp2, sizeof(tmp2));
+  mlk_zeroize(tmp3, sizeof(tmp3));
+}
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(fips202x4)
+
+#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mlkem_native/src/fips202/fips202x4.h b/mlkem_native/src/fips202/fips202x4.h
new file mode 100644
index 0000000..1f6a8be
--- /dev/null
+++ b/mlkem_native/src/fips202/fips202x4.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_FIPS202_FIPS202X4_H
+#define MLK_FIPS202_FIPS202X4_H
+
+
+#include "../cbmc.h"
+#include "../common.h"
+
+#include "fips202.h"
+#include "keccakf1600.h"
+
+/* Context for non-incremental API (4-way variant) */
+typedef struct
+{
+  uint64_t ctx[MLK_KECCAK_LANES * MLK_KECCAK_WAY]; /* room for MLK_KECCAK_WAY Keccak states of MLK_KECCAK_LANES 64-bit lanes each */
+} MLK_ALIGN mlk_shake128x4ctx;
+
+#define mlk_shake128x4_absorb_once MLK_NAMESPACE(shake128x4_absorb_once)
+void mlk_shake128x4_absorb_once(mlk_shake128x4ctx *state, const uint8_t *in0, /* 4-way one-shot SHAKE128 absorb: all four inputs share the same length inlen. */
+                                const uint8_t *in1, const uint8_t *in2,
+                                const uint8_t *in3, size_t inlen)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(state, sizeof(mlk_shake128x4ctx)))
+  requires(memory_no_alias(in0, inlen))
+  requires(memory_no_alias(in1, inlen))
+  requires(memory_no_alias(in2, inlen))
+  requires(memory_no_alias(in3, inlen))
+  assigns(memory_slice(state, sizeof(mlk_shake128x4ctx)))
+);
+
+#define mlk_shake128x4_squeezeblocks MLK_NAMESPACE(shake128x4_squeezeblocks)
+void mlk_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, /* 4-way SHAKE128 squeeze: writes nblocks * SHAKE128_RATE bytes to each output and updates the state. */
+                                  uint8_t *out3, size_t nblocks,
+                                  mlk_shake128x4ctx *state)
+__contract__(
+  requires(nblocks <= 8 /* somewhat arbitrary bound */)
+  requires(memory_no_alias(state, sizeof(mlk_shake128x4ctx)))
+  requires(memory_no_alias(out0, nblocks * SHAKE128_RATE))
+  requires(memory_no_alias(out1, nblocks * SHAKE128_RATE))
+  requires(memory_no_alias(out2, nblocks * SHAKE128_RATE))
+  requires(memory_no_alias(out3, nblocks * SHAKE128_RATE))
+  assigns(memory_slice(out0, nblocks * SHAKE128_RATE),
+    memory_slice(out1, nblocks * SHAKE128_RATE),
+    memory_slice(out2, nblocks * SHAKE128_RATE),
+    memory_slice(out3, nblocks * SHAKE128_RATE),
+    memory_slice(state, sizeof(mlk_shake128x4ctx)))
+);
+
+#define mlk_shake128x4_init MLK_NAMESPACE(shake128x4_init)
+void mlk_shake128x4_init(mlk_shake128x4ctx *state); /* Initializes a 4-way context; a no-op in the bundled fips202x4.c, where the absorb step zeroes the context itself. */
+
+#define mlk_shake128x4_release MLK_NAMESPACE(shake128x4_release)
+void mlk_shake128x4_release(mlk_shake128x4ctx *state); /* Releases a 4-way context; the bundled fips202x4.c implementation wipes it with mlk_zeroize(). */
+
+#define mlk_shake256x4 MLK_NAMESPACE(shake256x4)
+void mlk_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, /* One-shot 4-way SHAKE256: every input is inlen bytes and every output is outlen bytes. */
+                    size_t outlen, uint8_t *in0, uint8_t *in1, uint8_t *in2,
+                    uint8_t *in3, size_t inlen)
+__contract__(
+  requires(inlen <= MLK_MAX_BUFFER_SIZE)
+  requires(outlen <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(in0, inlen))
+  requires(memory_no_alias(in1, inlen))
+  requires(memory_no_alias(in2, inlen))
+  requires(memory_no_alias(in3, inlen))
+  requires(memory_no_alias(out0, outlen))
+  requires(memory_no_alias(out1, outlen))
+  requires(memory_no_alias(out2, outlen))
+  requires(memory_no_alias(out3, outlen))
+  assigns(memory_slice(out0, outlen))
+  assigns(memory_slice(out1, outlen))
+  assigns(memory_slice(out2, outlen))
+  assigns(memory_slice(out3, outlen))
+);
+
+#endif /* !MLK_FIPS202_FIPS202X4_H */
diff --git a/mlkem_native/src/fips202/keccakf1600.c b/mlkem_native/src/fips202/keccakf1600.c
new file mode 100644
index 0000000..cf423e3
--- /dev/null
+++ b/mlkem_native/src/fips202/keccakf1600.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [mupq]
+ *   Common files for pqm4, pqm3, pqriscv
+ *   Kannwischer, Petri, Rijneveld, Schwabe, Stoffelen
+ *   https://github.com/mupq/mupq
+ *
+ * - [supercop]
+ *   SUPERCOP benchmarking framework
+ *   Daniel J. Bernstein
+ *   http://bench.cr.yp.to/supercop.html
+ *
+ * - [tweetfips]
+ *   'tweetfips202' FIPS202 implementation
+ *   Van Assche, Bernstein, Schwabe
+ *   https://keccak.team/2015/tweetfips202.html
+ */
+
+/* Based on the CC0 implementation from @[mupq] and the public domain
+ * implementation @[supercop, crypto_hash/keccakc512/simple/]
+ * by Ronny Van Keer, and the public domain @[tweetfips] implementation. */
+
+
+#include "keccakf1600.h"
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+#define MLK_KECCAK_NROUNDS 24 /* round count of the permutation; also the length of mlk_KeccakF_RoundConstants */
+#define MLK_KECCAK_ROL(a, offset) ((a << offset) ^ (a >> (64 - offset))) /* rotate a 64-bit value left by offset bits. NOTE(review): arguments are not parenthesized and offset must stay within 1..63; every use in this file passes a plain uint64_t variable and a literal in that range. */
+
+void mlk_keccakf1600_extract_bytes(uint64_t *state, unsigned char *data, /* Copies length bytes of the state, starting at byte offset `offset`, into data; the state is only read. */
+                                   unsigned offset, unsigned length)
+{
+  unsigned i;
+#if defined(MLK_SYS_LITTLE_ENDIAN)
+  uint8_t *state_ptr = (uint8_t *)state + offset; /* little-endian build: the lanes are read as raw bytes in memory order */
+  for (i = 0; i < length; i++)
+  __loop__(invariant(i <= length))
+  {
+    data[i] = state_ptr[i];
+  }
+#else  /* MLK_SYS_LITTLE_ENDIAN */
+  /* Portable version: byte (offset + i) is taken from lane (offset + i) >> 3 at bit position 8 * ((offset + i) & 0x07) */
+  for (i = 0; i < length; i++)
+  __loop__(invariant(i <= length))
+  {
+    data[i] = (state[(offset + i) >> 3] >> (8 * ((offset + i) & 0x07))) & 0xFF;
+  }
+#endif /* !MLK_SYS_LITTLE_ENDIAN */
+}
+
+void mlk_keccakf1600_xor_bytes(uint64_t *state, const unsigned char *data, /* XORs length bytes from data into the state, starting at byte offset `offset`. */
+                               unsigned offset, unsigned length)
+{
+  unsigned i;
+#if defined(MLK_SYS_LITTLE_ENDIAN)
+  uint8_t *state_ptr = (uint8_t *)state + offset; /* little-endian build: the lanes are updated as raw bytes in memory order */
+  for (i = 0; i < length; i++)
+  __loop__(invariant(i <= length))
+  {
+    state_ptr[i] ^= data[i];
+  }
+#else  /* MLK_SYS_LITTLE_ENDIAN */
+  /* Portable version: data[i] is shifted to bit position 8 * ((offset + i) & 0x07) and XORed into lane (offset + i) >> 3 */
+  for (i = 0; i < length; i++)
+  __loop__(invariant(i <= length))
+  {
+    state[(offset + i) >> 3] ^= (uint64_t)data[i]
+                                << (8 * ((offset + i) & 0x07));
+  }
+#endif /* !MLK_SYS_LITTLE_ENDIAN */
+}
+
+static void mlk_keccakf1600x4_extract_bytes_c(uint64_t *state, /* C fallback: runs the single-state extract four times, treating state as four consecutive blocks of MLK_KECCAK_LANES lanes. */
+                                              unsigned char *data0,
+                                              unsigned char *data1,
+                                              unsigned char *data2,
+                                              unsigned char *data3,
+                                              unsigned offset, unsigned length)
+{
+  mlk_keccakf1600_extract_bytes(state + MLK_KECCAK_LANES * 0, data0, offset, /* instance k starts at state + MLK_KECCAK_LANES * k and feeds data<k> */
+                                length);
+  mlk_keccakf1600_extract_bytes(state + MLK_KECCAK_LANES * 1, data1, offset,
+                                length);
+  mlk_keccakf1600_extract_bytes(state + MLK_KECCAK_LANES * 2, data2, offset,
+                                length);
+  mlk_keccakf1600_extract_bytes(state + MLK_KECCAK_LANES * 3, data3, offset,
+                                length);
+}
+
+void mlk_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, /* 4-way extract: uses the native routine when it is compiled in and reports success, otherwise the C fallback. */
+                                     unsigned char *data1, unsigned char *data2,
+                                     unsigned char *data3, unsigned offset,
+                                     unsigned length)
+{
+#if defined(MLK_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE)
+  if (mlk_keccakf1600_extract_bytes_x4_native(state, data0, data1, data2, data3,
+                                              offset, length) ==
+      MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native path handled the request */
+  }
+#endif /* MLK_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE */
+  mlk_keccakf1600x4_extract_bytes_c(state, data0, data1, data2, data3, offset, /* reached when no native routine is built in or it did not return success */
+                                    length);
+}
+
+static void mlk_keccakf1600x4_xor_bytes_c(uint64_t *state, /* C fallback: runs the single-state XOR four times, treating state as four consecutive blocks of MLK_KECCAK_LANES lanes. */
+                                          const unsigned char *data0,
+                                          const unsigned char *data1,
+                                          const unsigned char *data2,
+                                          const unsigned char *data3,
+                                          unsigned offset, unsigned length)
+{
+  mlk_keccakf1600_xor_bytes(state + MLK_KECCAK_LANES * 0, data0, offset, /* instance k starts at state + MLK_KECCAK_LANES * k and consumes data<k> */
+                            length);
+  mlk_keccakf1600_xor_bytes(state + MLK_KECCAK_LANES * 1, data1, offset,
+                            length);
+  mlk_keccakf1600_xor_bytes(state + MLK_KECCAK_LANES * 2, data2, offset,
+                            length);
+  mlk_keccakf1600_xor_bytes(state + MLK_KECCAK_LANES * 3, data3, offset,
+                            length);
+}
+
+void mlk_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0, /* 4-way XOR: uses the native routine when it is compiled in and reports success, otherwise the C fallback. */
+                                 const unsigned char *data1,
+                                 const unsigned char *data2,
+                                 const unsigned char *data3, unsigned offset,
+                                 unsigned length)
+{
+#if defined(MLK_USE_FIPS202_X4_XOR_BYTES_NATIVE)
+  if (mlk_keccakf1600_xor_bytes_x4_native(state, data0, data1, data2, data3,
+                                          offset,
+                                          length) == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native path handled the request */
+  }
+#endif /* MLK_USE_FIPS202_X4_XOR_BYTES_NATIVE */
+  mlk_keccakf1600x4_xor_bytes_c(state, data0, data1, data2, data3, offset, /* reached when no native routine is built in or it did not return success */
+                                length);
+}
+
+void mlk_keccakf1600x4_permute(uint64_t *state) /* Permutes all four states: native x4 routine when available and successful, otherwise four single-state permutations. */
+{
+#if defined(MLK_USE_FIPS202_X4_NATIVE)
+  if (mlk_keccak_f1600_x4_native(state) == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native path handled the request */
+  }
+#endif /* MLK_USE_FIPS202_X4_NATIVE */
+  mlk_keccakf1600_permute(state + MLK_KECCAK_LANES * 0); /* fallback: instance k starts at state + MLK_KECCAK_LANES * k */
+  mlk_keccakf1600_permute(state + MLK_KECCAK_LANES * 1);
+  mlk_keccakf1600_permute(state + MLK_KECCAK_LANES * 2);
+  mlk_keccakf1600_permute(state + MLK_KECCAK_LANES * 3);
+}
+
+static const uint64_t mlk_KeccakF_RoundConstants[MLK_KECCAK_NROUNDS] = { /* one 64-bit constant per round, indexed by round number in mlk_keccakf1600_permute_c */
+    (uint64_t)0x0000000000000001ULL, (uint64_t)0x0000000000008082ULL,
+    (uint64_t)0x800000000000808aULL, (uint64_t)0x8000000080008000ULL,
+    (uint64_t)0x000000000000808bULL, (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008081ULL, (uint64_t)0x8000000000008009ULL,
+    (uint64_t)0x000000000000008aULL, (uint64_t)0x0000000000000088ULL,
+    (uint64_t)0x0000000080008009ULL, (uint64_t)0x000000008000000aULL,
+    (uint64_t)0x000000008000808bULL, (uint64_t)0x800000000000008bULL,
+    (uint64_t)0x8000000000008089ULL, (uint64_t)0x8000000000008003ULL,
+    (uint64_t)0x8000000000008002ULL, (uint64_t)0x8000000000000080ULL,
+    (uint64_t)0x000000000000800aULL, (uint64_t)0x800000008000000aULL,
+    (uint64_t)0x8000000080008081ULL, (uint64_t)0x8000000000008080ULL,
+    (uint64_t)0x0000000080000001ULL, (uint64_t)0x8000000080008008ULL};
+
+MLK_STATIC_TESTABLE
+void mlk_keccakf1600_permute_c(uint64_t *state) /* Portable C permutation of one 25-lane state, in place: MLK_KECCAK_NROUNDS rounds, two per loop iteration. */
+{
+  unsigned round;
+
+  uint64_t Aba, Abe, Abi, Abo, Abu;
+  uint64_t Aga, Age, Agi, Ago, Agu;
+  uint64_t Aka, Ake, Aki, Ako, Aku;
+  uint64_t Ama, Ame, Ami, Amo, Amu;
+  uint64_t Asa, Ase, Asi, Aso, Asu;
+  uint64_t BCa, BCe, BCi, BCo, BCu;
+  uint64_t Da, De, Di, Do, Du;
+  uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
+  uint64_t Ega, Ege, Egi, Ego, Egu;
+  uint64_t Eka, Eke, Eki, Eko, Eku;
+  uint64_t Ema, Eme, Emi, Emo, Emu;
+  uint64_t Esa, Ese, Esi, Eso, Esu;
+
+  /* copyFromState(A, state): load the 25 lanes state[0..24] into the locals Aba..Asu */
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
+
+  for (round = 0; round < MLK_KECCAK_NROUNDS; round += 2) /* each iteration runs round `round` (A -> E) and round `round + 1` (E -> A) */
+  __loop__(invariant(round <= MLK_KECCAK_NROUNDS && round % 2 == 0))
+  {
+    /* prepareTheta: BCx is the XOR of the five A lanes whose name ends in x */
+    BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
+    BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase;
+    BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi;
+    BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso;
+    BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu;
+
+    /* thetaRhoPiChiIotaPrepareTheta(round, A, E): one round reading A and writing E; mlk_KeccakF_RoundConstants[round] is XORed into Eba */
+    Da = BCu ^ MLK_KECCAK_ROL(BCe, 1);
+    De = BCa ^ MLK_KECCAK_ROL(BCi, 1);
+    Di = BCe ^ MLK_KECCAK_ROL(BCo, 1);
+    Do = BCi ^ MLK_KECCAK_ROL(BCu, 1);
+    Du = BCo ^ MLK_KECCAK_ROL(BCa, 1);
+
+    Aba ^= Da;
+    BCa = Aba;
+    Age ^= De;
+    BCe = MLK_KECCAK_ROL(Age, 44);
+    Aki ^= Di;
+    BCi = MLK_KECCAK_ROL(Aki, 43);
+    Amo ^= Do;
+    BCo = MLK_KECCAK_ROL(Amo, 21);
+    Asu ^= Du;
+    BCu = MLK_KECCAK_ROL(Asu, 14);
+    Eba = BCa ^ ((~BCe) & BCi);
+    Eba ^= (uint64_t)mlk_KeccakF_RoundConstants[round];
+    Ebe = BCe ^ ((~BCi) & BCo);
+    Ebi = BCi ^ ((~BCo) & BCu);
+    Ebo = BCo ^ ((~BCu) & BCa);
+    Ebu = BCu ^ ((~BCa) & BCe);
+
+    Abo ^= Do;
+    BCa = MLK_KECCAK_ROL(Abo, 28);
+    Agu ^= Du;
+    BCe = MLK_KECCAK_ROL(Agu, 20);
+    Aka ^= Da;
+    BCi = MLK_KECCAK_ROL(Aka, 3);
+    Ame ^= De;
+    BCo = MLK_KECCAK_ROL(Ame, 45);
+    Asi ^= Di;
+    BCu = MLK_KECCAK_ROL(Asi, 61);
+    Ega = BCa ^ ((~BCe) & BCi);
+    Ege = BCe ^ ((~BCi) & BCo);
+    Egi = BCi ^ ((~BCo) & BCu);
+    Ego = BCo ^ ((~BCu) & BCa);
+    Egu = BCu ^ ((~BCa) & BCe);
+
+    Abe ^= De;
+    BCa = MLK_KECCAK_ROL(Abe, 1);
+    Agi ^= Di;
+    BCe = MLK_KECCAK_ROL(Agi, 6);
+    Ako ^= Do;
+    BCi = MLK_KECCAK_ROL(Ako, 25);
+    Amu ^= Du;
+    BCo = MLK_KECCAK_ROL(Amu, 8);
+    Asa ^= Da;
+    BCu = MLK_KECCAK_ROL(Asa, 18);
+    Eka = BCa ^ ((~BCe) & BCi);
+    Eke = BCe ^ ((~BCi) & BCo);
+    Eki = BCi ^ ((~BCo) & BCu);
+    Eko = BCo ^ ((~BCu) & BCa);
+    Eku = BCu ^ ((~BCa) & BCe);
+
+    Abu ^= Du;
+    BCa = MLK_KECCAK_ROL(Abu, 27);
+    Aga ^= Da;
+    BCe = MLK_KECCAK_ROL(Aga, 36);
+    Ake ^= De;
+    BCi = MLK_KECCAK_ROL(Ake, 10);
+    Ami ^= Di;
+    BCo = MLK_KECCAK_ROL(Ami, 15);
+    Aso ^= Do;
+    BCu = MLK_KECCAK_ROL(Aso, 56);
+    Ema = BCa ^ ((~BCe) & BCi);
+    Eme = BCe ^ ((~BCi) & BCo);
+    Emi = BCi ^ ((~BCo) & BCu);
+    Emo = BCo ^ ((~BCu) & BCa);
+    Emu = BCu ^ ((~BCa) & BCe);
+
+    Abi ^= Di;
+    BCa = MLK_KECCAK_ROL(Abi, 62);
+    Ago ^= Do;
+    BCe = MLK_KECCAK_ROL(Ago, 55);
+    Aku ^= Du;
+    BCi = MLK_KECCAK_ROL(Aku, 39);
+    Ama ^= Da;
+    BCo = MLK_KECCAK_ROL(Ama, 41);
+    Ase ^= De;
+    BCu = MLK_KECCAK_ROL(Ase, 2);
+    Esa = BCa ^ ((~BCe) & BCi);
+    Ese = BCe ^ ((~BCi) & BCo);
+    Esi = BCi ^ ((~BCo) & BCu);
+    Eso = BCo ^ ((~BCu) & BCa);
+    Esu = BCu ^ ((~BCa) & BCe);
+
+    /* prepareTheta: same XOR of five lanes per BCx, now over the E lanes just computed */
+    BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa;
+    BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese;
+    BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi;
+    BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso;
+    BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu;
+
+    /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A): the mirrored round reading E and writing A; mlk_KeccakF_RoundConstants[round + 1] is XORed into Aba */
+    Da = BCu ^ MLK_KECCAK_ROL(BCe, 1);
+    De = BCa ^ MLK_KECCAK_ROL(BCi, 1);
+    Di = BCe ^ MLK_KECCAK_ROL(BCo, 1);
+    Do = BCi ^ MLK_KECCAK_ROL(BCu, 1);
+    Du = BCo ^ MLK_KECCAK_ROL(BCa, 1);
+
+    Eba ^= Da;
+    BCa = Eba;
+    Ege ^= De;
+    BCe = MLK_KECCAK_ROL(Ege, 44);
+    Eki ^= Di;
+    BCi = MLK_KECCAK_ROL(Eki, 43);
+    Emo ^= Do;
+    BCo = MLK_KECCAK_ROL(Emo, 21);
+    Esu ^= Du;
+    BCu = MLK_KECCAK_ROL(Esu, 14);
+    Aba = BCa ^ ((~BCe) & BCi);
+    Aba ^= (uint64_t)mlk_KeccakF_RoundConstants[round + 1];
+    Abe = BCe ^ ((~BCi) & BCo);
+    Abi = BCi ^ ((~BCo) & BCu);
+    Abo = BCo ^ ((~BCu) & BCa);
+    Abu = BCu ^ ((~BCa) & BCe);
+
+    Ebo ^= Do;
+    BCa = MLK_KECCAK_ROL(Ebo, 28);
+    Egu ^= Du;
+    BCe = MLK_KECCAK_ROL(Egu, 20);
+    Eka ^= Da;
+    BCi = MLK_KECCAK_ROL(Eka, 3);
+    Eme ^= De;
+    BCo = MLK_KECCAK_ROL(Eme, 45);
+    Esi ^= Di;
+    BCu = MLK_KECCAK_ROL(Esi, 61);
+    Aga = BCa ^ ((~BCe) & BCi);
+    Age = BCe ^ ((~BCi) & BCo);
+    Agi = BCi ^ ((~BCo) & BCu);
+    Ago = BCo ^ ((~BCu) & BCa);
+    Agu = BCu ^ ((~BCa) & BCe);
+
+    Ebe ^= De;
+    BCa = MLK_KECCAK_ROL(Ebe, 1);
+    Egi ^= Di;
+    BCe = MLK_KECCAK_ROL(Egi, 6);
+    Eko ^= Do;
+    BCi = MLK_KECCAK_ROL(Eko, 25);
+    Emu ^= Du;
+    BCo = MLK_KECCAK_ROL(Emu, 8);
+    Esa ^= Da;
+    BCu = MLK_KECCAK_ROL(Esa, 18);
+    Aka = BCa ^ ((~BCe) & BCi);
+    Ake = BCe ^ ((~BCi) & BCo);
+    Aki = BCi ^ ((~BCo) & BCu);
+    Ako = BCo ^ ((~BCu) & BCa);
+    Aku = BCu ^ ((~BCa) & BCe);
+
+    Ebu ^= Du;
+    BCa = MLK_KECCAK_ROL(Ebu, 27);
+    Ega ^= Da;
+    BCe = MLK_KECCAK_ROL(Ega, 36);
+    Eke ^= De;
+    BCi = MLK_KECCAK_ROL(Eke, 10);
+    Emi ^= Di;
+    BCo = MLK_KECCAK_ROL(Emi, 15);
+    Eso ^= Do;
+    BCu = MLK_KECCAK_ROL(Eso, 56);
+    Ama = BCa ^ ((~BCe) & BCi);
+    Ame = BCe ^ ((~BCi) & BCo);
+    Ami = BCi ^ ((~BCo) & BCu);
+    Amo = BCo ^ ((~BCu) & BCa);
+    Amu = BCu ^ ((~BCa) & BCe);
+
+    Ebi ^= Di;
+    BCa = MLK_KECCAK_ROL(Ebi, 62);
+    Ego ^= Do;
+    BCe = MLK_KECCAK_ROL(Ego, 55);
+    Eku ^= Du;
+    BCi = MLK_KECCAK_ROL(Eku, 39);
+    Ema ^= Da;
+    BCo = MLK_KECCAK_ROL(Ema, 41);
+    Ese ^= De;
+    BCu = MLK_KECCAK_ROL(Ese, 2);
+    Asa = BCa ^ ((~BCe) & BCi);
+    Ase = BCe ^ ((~BCi) & BCo);
+    Asi = BCi ^ ((~BCo) & BCu);
+    Aso = BCo ^ ((~BCu) & BCa);
+    Asu = BCu ^ ((~BCa) & BCe);
+  }
+
+  /* copyToState(state, A): store the locals Aba..Asu back into state[0..24], in the order they were loaded */
+  state[0] = Aba;
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+}
+
+void mlk_keccakf1600_permute(uint64_t *state) /* Permutes one state: native x1 routine when available and successful, otherwise the portable C version. */
+{
+#if defined(MLK_USE_FIPS202_X1_NATIVE)
+  if (mlk_keccak_f1600_x1_native(state) == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return; /* native path handled the request */
+  }
+#endif /* MLK_USE_FIPS202_X1_NATIVE */
+  mlk_keccakf1600_permute_c(state); /* reached when no native routine is built in or it did not return success */
+}
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(keccakf1600)
+
+#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef MLK_KECCAK_NROUNDS
+#undef MLK_KECCAK_ROL
diff --git a/mlkem_native/src/fips202/keccakf1600.h b/mlkem_native/src/fips202/keccakf1600.h
new file mode 100644
index 0000000..c26c36a
--- /dev/null
+++ b/mlkem_native/src/fips202/keccakf1600.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_FIPS202_KECCAKF1600_H
+#define MLK_FIPS202_KECCAKF1600_H
+#include "../cbmc.h"
+#include "../common.h"
+
+#define MLK_KECCAK_LANES 25 /* uint64_t words in one Keccak-f1600 state */
+#define MLK_KECCAK_WAY 4    /* states in the buffer of the x4 functions */
+
+/*
+ * WARNING:
+ * The contents of the Keccak state buffers used below, including the
+ * placement and interleaving of Keccak lanes, are IMPLEMENTATION-DEFINED.
+ * Callers should treat them as opaque and use the functions declared here.
+ */
+
+#define mlk_keccakf1600_extract_bytes MLK_NAMESPACE(keccakf1600_extract_bytes)
+void mlk_keccakf1600_extract_bytes(uint64_t *state, unsigned char *data,
+                                   unsigned offset, unsigned length)
+__contract__(
+    requires(0 <= offset && offset <= MLK_KECCAK_LANES * sizeof(uint64_t) &&
+             0 <= length && length <= MLK_KECCAK_LANES * sizeof(uint64_t) - offset)
+    requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    requires(memory_no_alias(data, length))
+    assigns(memory_slice(data, length))
+);
+
+#define mlk_keccakf1600_xor_bytes MLK_NAMESPACE(keccakf1600_xor_bytes)
+void mlk_keccakf1600_xor_bytes(uint64_t *state, const unsigned char *data,
+                               unsigned offset, unsigned length)
+__contract__(
+    requires(0 <= offset && offset <= MLK_KECCAK_LANES * sizeof(uint64_t) &&
+             0 <= length && length <= MLK_KECCAK_LANES * sizeof(uint64_t) - offset)
+    requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    requires(memory_no_alias(data, length))
+    assigns(memory_slice(state, sizeof(uint64_t) * MLK_KECCAK_LANES))
+);
+
+#define mlk_keccakf1600x4_extract_bytes \
+  MLK_NAMESPACE(keccakf1600x4_extract_bytes)
+void mlk_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0,
+                                     unsigned char *data1, unsigned char *data2,
+                                     unsigned char *data3, unsigned offset,
+                                     unsigned length)
+__contract__(
+    requires(0 <= offset && offset <= MLK_KECCAK_LANES * sizeof(uint64_t) &&
+             0 <= length && length <= MLK_KECCAK_LANES * sizeof(uint64_t) - offset)
+    requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+    requires(memory_no_alias(data0, length))
+    requires(memory_no_alias(data1, length))
+    requires(memory_no_alias(data2, length))
+    requires(memory_no_alias(data3, length))
+    assigns(memory_slice(data0, length))
+    assigns(memory_slice(data1, length))
+    assigns(memory_slice(data2, length))
+    assigns(memory_slice(data3, length))
+);
+
+#define mlk_keccakf1600x4_xor_bytes MLK_NAMESPACE(keccakf1600x4_xor_bytes)
+void mlk_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0,
+                                 const unsigned char *data1,
+                                 const unsigned char *data2,
+                                 const unsigned char *data3, unsigned offset,
+                                 unsigned length)
+__contract__(
+    requires(0 <= offset && offset <= MLK_KECCAK_LANES * sizeof(uint64_t) &&
+             0 <= length && length <= MLK_KECCAK_LANES * sizeof(uint64_t) - offset)
+    requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+    requires(memory_no_alias(data0, length))
+    /* Case 1: all input buffers are the same; Case 2: all input buffers are distinct */
+    requires((data0 == data1 &&
+              data0 == data2 &&
+              data0 == data3) ||
+             (memory_no_alias(data1, length) &&
+              memory_no_alias(data2, length) &&
+              memory_no_alias(data3, length)))
+    assigns(memory_slice(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+);
+
+
+#define mlk_keccakf1600x4_permute MLK_NAMESPACE(keccakf1600x4_permute)
+void mlk_keccakf1600x4_permute(uint64_t *state)
+__contract__(
+    requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+    assigns(memory_slice(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY))
+);
+
+#define mlk_keccakf1600_permute MLK_NAMESPACE(keccakf1600_permute)
+void mlk_keccakf1600_permute(uint64_t *state)
+__contract__(
+    requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES))
+    assigns(memory_slice(state, sizeof(uint64_t) * MLK_KECCAK_LANES))
+);
+
+#endif /* !MLK_FIPS202_KECCAKF1600_H */
diff --git a/mlkem_native/src/indcpa.c b/mlkem_native/src/indcpa.c
new file mode 100644
index 0000000..d3dc364
--- /dev/null
+++ b/mlkem_native/src/indcpa.c
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#include "indcpa.h"
+
+#include "debug.h"
+#include "randombytes.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mlk_pack_pk MLK_ADD_PARAM_SET(mlk_pack_pk)
+#define mlk_unpack_pk MLK_ADD_PARAM_SET(mlk_unpack_pk)
+#define mlk_pack_sk MLK_ADD_PARAM_SET(mlk_pack_sk)
+#define mlk_unpack_sk MLK_ADD_PARAM_SET(mlk_unpack_sk)
+#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
+#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
+#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+  MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+  MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
+#define mlk_keypair_getnoise MLK_ADD_PARAM_SET(mlk_keypair_getnoise)
+/* End of parameter set namespacing */
+
+/*************************************************
+ * Name:        mlk_pack_pk
+ *
+ * Description: Serialize the public key as concatenation of the
+ *              serialized vector of polynomials pk
+ *              and the public seed used to generate the matrix A.
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized public key
+ *              const mlk_polyvec *pk: pointer to the input public-key
+ *                mlk_polyvec. Must have coefficients within [0,..,q-1].
+ *              const uint8_t *seed: pointer to the input public seed
+ *
+ * Specification:
+ * Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
+ *
+ **************************************************/
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+                        const mlk_polyvec *pk,
+                        const uint8_t seed[MLKEM_SYMBYTES])
+{
+  mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+  mlk_polyvec_tobytes(r, pk);
+  mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES); /* append seed */
+}
+
+/*************************************************
+ * Name:        mlk_unpack_pk
+ *
+ * Description: De-serialize public key from a byte array;
+ *              approximate inverse of mlk_pack_pk
+ *
+ * Arguments:   - mlk_polyvec *pk: pointer to output public-key polynomial
+ *                vector. Only bounded by MLKEM_UINT12_LIMIT; see NOTE below.
+ *              - uint8_t *seed: pointer to output seed to generate matrix A
+ *              - const uint8_t *packedpk: pointer to input serialized public
+ *                  key.
+ *
+ * Specification:
+ * Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
+ *
+ **************************************************/
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
+                          const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
+{
+  mlk_polyvec_frombytes(pk, packedpk);
+  mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+
+  /* NOTE: If a modulus check was conducted on the PK, we know at this
+   * point that the coefficients of `pk` are unsigned canonical. The
+   * specifications and proofs, however, do _not_ assume this, and instead
+   * work with the easily provable bound by MLKEM_UINT12_LIMIT. */
+}
+
+/*************************************************
+ * Name:        mlk_pack_sk
+ *
+ * Description: Serialize the secret key
+ *
+ * Arguments:   - uint8_t *r: pointer to output serialized secret key
+ *              - const mlk_polyvec *sk: pointer to input vector of
+ *                polynomials (secret key); coefficients within [0,..,q-1].
+ *
+ * Specification:
+ * Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
+ *
+ **************************************************/
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+                        const mlk_polyvec *sk)
+{
+  mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+  mlk_polyvec_tobytes(r, sk);
+}
+
+/*************************************************
+ * Name:        mlk_unpack_sk
+ *
+ * Description: De-serialize the secret key; inverse of mlk_pack_sk
+ *
+ * Arguments:   - mlk_polyvec *sk: pointer to output vector of polynomials
+ *                (secret key)
+ *              - const uint8_t *packedsk: pointer to input serialized secret
+ *                key
+ *
+ * Specification:
+ * Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
+ *
+ **************************************************/
+static void mlk_unpack_sk(mlk_polyvec *sk,
+                          const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  mlk_polyvec_frombytes(sk, packedsk);
+}
+
+/*************************************************
+ * Name:        mlk_pack_ciphertext
+ *
+ * Description: Serialize the ciphertext as concatenation of the
+ *              compressed and serialized vector of polynomials b
+ *              and the compressed and serialized polynomial v
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized ciphertext
+ *              const mlk_polyvec *b: pointer to the input vector of polynomials b
+ *              mlk_poly *v: pointer to the input polynomial v
+ *
+ * Specification:
+ * Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
+ *
+ **************************************************/
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+                                const mlk_polyvec *b, mlk_poly *v)
+{
+  mlk_polyvec_compress_du(r, b);
+  mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
+}
+
+/*************************************************
+ * Name:        mlk_unpack_ciphertext
+ *
+ * Description: De-serialize and decompress ciphertext from a byte array;
+ *              approximate inverse of mlk_pack_ciphertext
+ *
+ * Arguments:   - mlk_polyvec *b: pointer to the output vector of polynomials b
+ *              - mlk_poly *v: pointer to the output polynomial v
+ *              - const uint8_t *c: pointer to the input serialized ciphertext
+ *
+ * Specification:
+ * Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
+ *
+ **************************************************/
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
+                                  const uint8_t c[MLKEM_INDCPA_BYTES])
+{
+  mlk_polyvec_decompress_du(b, c);
+  mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
+}
+
+/* Helper function to convert the polynomial entries in the output of
+ * gen_matrix from the standard (bitreversed) ordering of coefficients to
+ * the backend's custom ordering. No-op unless such a native backend is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
+__contract__(
+  /* We don't specify that this should be a permutation, but only
+   * that it does not change the bound established at the end of mlk_gen_matrix. */
+  requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+  requires(forall(x, 0, MLKEM_K,
+    array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(memory_slice(v, sizeof(mlk_polyvec)))
+  ensures(forall(x, 0, MLKEM_K,
+    array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+     assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+     invariant(i <= MLKEM_K)
+     invariant(forall(x, 0, MLKEM_K,
+       array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+  {
+    mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+  }
+#else  /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+  /* Nothing to do; the cast only marks the parameter as used */
+  (void)v;
+#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+  /* We don't specify that this should be a permutation, but only
+   * that it does not change the bound established at the end of mlk_gen_matrix. */
+  requires(memory_no_alias(a, sizeof(mlk_polymat)))
+  requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+    array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+  assigns(memory_slice(a, sizeof(mlk_polymat)))
+  ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+    array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+     assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+     invariant(i <= MLKEM_K)
+     invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+       array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+  {
+    mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]); /* i-th polyvec of a */
+  }
+}
+
+/* Reference: `gen_matrix()` in the reference implementation @[REF].
+ *            - We use a special subroutine to generate 4 polynomials
+ *              at a time, to be able to leverage batched Keccak-f1600
+ *              implementations. The reference implementation generates
+ *              one matrix entry at a time.
+ *
+ * Not static for benchmarking */
+MLK_INTERNAL_API
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
+                    int transposed)
+{
+  unsigned i, j; /* i: flat entry index into a; j: slot within a batch of 4 */
+  MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
+
+  for (j = 0; j < 4; j++)
+  {
+    mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+  }
+
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
+  /* Sample 4 matrix entries at a time. */
+  for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
+  {
+    for (j = 0; j < 4; j++)
+    {
+      uint8_t x, y;
+      /* MLKEM_K <= 4, so the values fit in uint8_t. */
+      x = (uint8_t)((i + j) / MLKEM_K);
+      y = (uint8_t)((i + j) % MLKEM_K);
+      if (transposed)
+      {
+        seed_ext[j][MLKEM_SYMBYTES + 0] = x;
+        seed_ext[j][MLKEM_SYMBYTES + 1] = y;
+      }
+      else
+      {
+        seed_ext[j][MLKEM_SYMBYTES + 0] = y;
+        seed_ext[j][MLKEM_SYMBYTES + 1] = x;
+      }
+    }
+
+    mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+                            &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+                            &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+                            &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+                            seed_ext);
+  }
+#else  /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+  /* When using serial FIPS202, sample all entries individually. */
+  i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+  /* For MLKEM_K == 3, sample the last entry individually.
+   * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+   * individually. */
+  for (; i < MLKEM_K * MLKEM_K; i++)
+  {
+    uint8_t x, y;
+    /* MLKEM_K <= 4, so the values fit in uint8_t. */
+    x = (uint8_t)(i / MLKEM_K);
+    y = (uint8_t)(i % MLKEM_K);
+
+    if (transposed)
+    {
+      seed_ext[0][MLKEM_SYMBYTES + 0] = x;
+      seed_ext[0][MLKEM_SYMBYTES + 1] = y;
+    }
+    else
+    {
+      seed_ext[0][MLKEM_SYMBYTES + 0] = y;
+      seed_ext[0][MLKEM_SYMBYTES + 1] = x;
+    }
+
+    mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
+  }
+
+  mlk_assert(i == MLKEM_K * MLKEM_K);
+
+  /*
+   * The public matrix is generated in NTT domain. If the native backend
+   * uses a custom order in NTT domain, permute A accordingly.
+   */
+  mlk_polymat_permute_bitrev_to_custom(a);
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(seed_ext, sizeof(seed_ext));
+}
+
+/*************************************************
+ * Name:        mlk_matvec_mul
+ *
+ * Description: Computes matrix-vector product in NTT domain,
+ *              via Montgomery multiplication.
+ *
+ * Arguments:   - mlk_polyvec *out: Pointer to output polynomial vector
+ *              - const mlk_polymat *a: Input matrix. Must be in NTT domain
+ *                  and have coefficients in [0, MLKEM_UINT12_LIMIT).
+ *              - const mlk_polyvec *v: Input polynomial vector. Must be in
+ *                  NTT domain.
+ *              - const mlk_polyvec_mulcache *vc: Mulcache for v, computed via
+ *                  mlk_polyvec_mulcache_compute().
+ *
+ * Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
+ *
+ **************************************************/
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+                           const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
+__contract__(
+  requires(memory_no_alias(out, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(a, sizeof(mlk_polymat)))
+  requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
+  requires(forall(k0, 0, MLKEM_K,
+    forall(k1, 0, MLKEM_K,
+      array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+  assigns(memory_slice(out, sizeof(mlk_polyvec))))
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+    assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
+    invariant(i <= MLKEM_K))
+  {
+    mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
+  }
+}
+
+/*************************************************
+ * Name:        mlk_keypair_getnoise
+ *
+ * Description: Computes and fills the pv and e polyvec
+ *              structures needed by mlk_indcpa_keypair_derand()
+ *
+ * Arguments:   - pv: Pointer to output polynomial vector
+ *              - e:  Pointer to output polynomial vector
+ *              - seed: seed bytes for sampling
+ *
+ * Specification: Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen)],
+ *                steps 8 - 15
+ **************************************************/
+static void mlk_keypair_getnoise(mlk_polyvec *pv, mlk_polyvec *e,
+                                 const uint8_t seed[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(pv, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(e, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(memory_slice(pv, sizeof(mlk_polyvec)))
+  assigns(memory_slice(e, sizeof(mlk_polyvec)))
+  ensures(forall(k0, 0, MLKEM_K, array_abs_bound(pv->vec[k0].coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)))
+  ensures(forall(k1, 0, MLKEM_K, array_abs_bound(e->vec[k1].coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)))
+)
+{
+#if MLKEM_K == 2
+  mlk_poly_getnoise_eta1_4x(&pv->vec[0], &pv->vec[1], &e->vec[0], &e->vec[1],
+                            seed, 0, 1, 2, 3);
+#elif MLKEM_K == 3
+  /*
+   * Only the first three output buffers are needed.
+   */
+  mlk_poly_getnoise_eta1_4x(&pv->vec[0], &pv->vec[1], &pv->vec[2], NULL, seed,
+                            0, 1, 2, 0xFF /* irrelevant */);
+  /* Same here */
+  mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, seed, 3,
+                            4, 5, 0xFF /* irrelevant */);
+#elif MLKEM_K == 4
+  mlk_poly_getnoise_eta1_4x(&pv->vec[0], &pv->vec[1], &pv->vec[2], &pv->vec[3],
+                            seed, 0, 1, 2, 3);
+  mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+                            seed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
+}
+
+
+/* Reference: `indcpa_keypair_derand()` in the reference implementation @[REF].
+ *            - We use x4-batched versions of `poly_getnoise` to leverage
+ *              x4-batched Keccak-f1600.
+ *            - We use a different implementation of `gen_matrix()` which
+ *              uses x4-batched Keccak-f1600 (see `mlk_gen_matrix()` above).
+ *            - We use a mulcache to speed up matrix-vector multiplication.
+ *            - We include buffer zeroization.
+ */
+MLK_INTERNAL_API
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                              uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                              const uint8_t coins[MLKEM_SYMBYTES],
+                              MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0; /* 0 on success; MLK_ERR_OUT_OF_MEMORY if an MLK_ALLOC failed */
+  const uint8_t *publicseed;
+  const uint8_t *noiseseed;
+  MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+  MLK_ALLOC(a, mlk_polymat, 1, context);
+  MLK_ALLOC(e, mlk_polyvec, 1, context);
+  MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+  MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+  MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+  if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+      e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  publicseed = buf;                 /* first MLKEM_SYMBYTES bytes of buf */
+  noiseseed = buf + MLKEM_SYMBYTES; /* second MLKEM_SYMBYTES bytes of buf */
+
+  /* Concatenate coins with MLKEM_K for domain separation of security levels */
+  mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+  coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
+
+  mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
+
+  /*
+   * Declassify the public seed.
+   * Required to use it in conditional-branches in rejection sampling.
+   * This is needed because all output of randombytes is marked as secret
+   * (=undefined)
+   */
+  MLK_CT_TESTING_DECLASSIFY(publicseed, MLKEM_SYMBYTES);
+
+  mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
+
+  mlk_keypair_getnoise(skpv, e, noiseseed);
+
+  mlk_polyvec_ntt(skpv);
+  mlk_polyvec_ntt(e);
+
+  mlk_polyvec_mulcache_compute(skpv_cache, skpv);
+  mlk_matvec_mul(pkpv, a, skpv, skpv_cache);
+  mlk_polyvec_tomont(pkpv);
+
+  mlk_polyvec_add(pkpv, e);
+  mlk_polyvec_reduce(pkpv);
+  mlk_polyvec_reduce(skpv);
+
+  mlk_pack_sk(sk, skpv);
+  mlk_pack_pk(pk, pkpv, publicseed);
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+  MLK_FREE(skpv, mlk_polyvec, 1, context);
+  MLK_FREE(pkpv, mlk_polyvec, 1, context);
+  MLK_FREE(e, mlk_polyvec, 1, context);
+  MLK_FREE(a, mlk_polymat, 1, context);
+  MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+  MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  return ret;
+}
+
+/* Reference: `indcpa_enc()` in the reference implementation @[REF].
+ *            - We use x4-batched versions of `poly_getnoise` to leverage
+ *              x4-batched Keccak-f1600.
+ *            - We use a different implementation of `gen_matrix()` which
+ *              uses x4-batched Keccak-f1600 (see `mlk_gen_matrix()` above).
+ *            - We use a mulcache to speed up matrix-vector multiplication.
+ *            - We include buffer zeroization.
+ */
+MLK_INTERNAL_API
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                   const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                   const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                   const uint8_t coins[MLKEM_SYMBYTES],
+                   MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0; /* 0 on success; MLK_ERR_OUT_OF_MEMORY if an MLK_ALLOC failed */
+  MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+  MLK_ALLOC(at, mlk_polymat, 1, context);
+  MLK_ALLOC(sp, mlk_polyvec, 1, context);
+  MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+  MLK_ALLOC(ep, mlk_polyvec, 1, context);
+  MLK_ALLOC(b, mlk_polyvec, 1, context);
+  MLK_ALLOC(v, mlk_poly, 1, context);
+  MLK_ALLOC(k, mlk_poly, 1, context);
+  MLK_ALLOC(epp, mlk_poly, 1, context);
+  MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+  if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+      b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  mlk_unpack_pk(pkpv, seed, pk);
+  mlk_poly_frommsg(k, m);
+
+  /*
+   * Declassify the public seed.
+   * Required to use it in conditional-branches in rejection sampling.
+   * This is needed because in re-encryption the publicseed originated from sk
+   * which is marked undefined.
+   */
+  MLK_CT_TESTING_DECLASSIFY(seed, MLKEM_SYMBYTES);
+
+  mlk_gen_matrix(at, seed, 1 /* transpose */);
+
+#if MLKEM_K == 2
+  mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+                               &ep->vec[1], coins, 0, 1, 2, 3);
+  mlk_poly_getnoise_eta2(epp, coins, 4);
+#elif MLKEM_K == 3
+  /*
+   * In this call, only the first three output buffers are needed.
+   * The fourth output is NULL; the matching 0xFF argument is irrelevant.
+   */
+  mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+                            0, 1, 2, 0xFF /* irrelevant */);
+  /* The fourth output buffer in this call _is_ used. */
+  mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+                            3, 4, 5, 6);
+#elif MLKEM_K == 4
+  mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+                            coins, 0, 1, 2, 3);
+  mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+                            coins, 4, 5, 6, 7);
+  mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
+
+  mlk_polyvec_ntt(sp);
+
+  mlk_polyvec_mulcache_compute(sp_cache, sp);
+  mlk_matvec_mul(b, at, sp, sp_cache);
+  mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
+
+  mlk_polyvec_invntt_tomont(b);
+  mlk_poly_invntt_tomont(v);
+
+  mlk_polyvec_add(b, ep);
+  mlk_poly_add(v, epp);
+  mlk_poly_add(v, k);
+
+  mlk_polyvec_reduce(b);
+  mlk_poly_reduce(v);
+
+  mlk_pack_ciphertext(c, b, v);
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+  MLK_FREE(epp, mlk_poly, 1, context);
+  MLK_FREE(k, mlk_poly, 1, context);
+  MLK_FREE(v, mlk_poly, 1, context);
+  MLK_FREE(b, mlk_polyvec, 1, context);
+  MLK_FREE(ep, mlk_polyvec, 1, context);
+  MLK_FREE(pkpv, mlk_polyvec, 1, context);
+  MLK_FREE(sp, mlk_polyvec, 1, context);
+  MLK_FREE(at, mlk_polymat, 1, context);
+  MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+  return ret;
+}
+
+/* Reference: `indcpa_dec()` in the reference implementation @[REF].
+ *            - We use a mulcache for the scalar product.
+ *            - We include buffer zeroization. */
+MLK_INTERNAL_API
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                   const uint8_t c[MLKEM_INDCPA_BYTES],
+                   const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                   MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0; /* 0 on success; MLK_ERR_OUT_OF_MEMORY if an MLK_ALLOC failed */
+  MLK_ALLOC(b, mlk_polyvec, 1, context);
+  MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+  MLK_ALLOC(v, mlk_poly, 1, context);
+  MLK_ALLOC(sb, mlk_poly, 1, context);
+  MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+  if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  mlk_unpack_ciphertext(b, v, c);
+  mlk_unpack_sk(skpv, sk);
+
+  mlk_polyvec_ntt(b);
+  mlk_polyvec_mulcache_compute(b_cache, b);
+  mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+  mlk_poly_invntt_tomont(sb);
+
+  mlk_poly_sub(v, sb);
+  mlk_poly_reduce(v);
+
+  mlk_poly_tomsg(m, v);
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+  MLK_FREE(sb, mlk_poly, 1, context);
+  MLK_FREE(v, mlk_poly, 1, context);
+  MLK_FREE(skpv, mlk_polyvec, 1, context);
+  MLK_FREE(b, mlk_polyvec, 1, context);
+  return ret;
+}
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mlk_pack_pk
+#undef mlk_unpack_pk
+#undef mlk_pack_sk
+#undef mlk_unpack_sk
+#undef mlk_pack_ciphertext
+#undef mlk_unpack_ciphertext
+#undef mlk_matvec_mul
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
+#undef mlk_keypair_getnoise
diff --git a/mlkem_native/src/indcpa.h b/mlkem_native/src/indcpa.h
new file mode 100644
index 0000000..b31756d
--- /dev/null
+++ b/mlkem_native/src/indcpa.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#ifndef MLK_INDCPA_H
+#define MLK_INDCPA_H
+
+#include "cbmc.h"
+#include "common.h"
+#include "poly_k.h"
+
+#define mlk_gen_matrix MLK_NAMESPACE_K(gen_matrix)
+/*************************************************
+ * Name:        mlk_gen_matrix
+ *
+ * Description: Deterministically generate matrix A (or the transpose of A)
+ *              from a seed. Entries of the matrix are polynomials that look
+ *              uniformly random. Performs rejection sampling on the output
+ *              of an XOF.
+ *
+ * Arguments:   - mlk_polymat *a: pointer to output matrix A
+ *              - const uint8_t *seed: input seed of MLKEM_SYMBYTES bytes
+ *              - int transposed: flag (0 or 1) selecting between A and A^T
+ *
+ * Specification: Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L3-7]
+ *                and @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L4-8].
+ *                The `transposed` parameter only affects internal presentation.
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
+                    int transposed)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mlk_polymat)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires(transposed == 0 || transposed == 1)
+  assigns(memory_slice(a, sizeof(mlk_polymat)))
+  ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+  array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+);
+
+#define mlk_indcpa_keypair_derand \
+  MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
+/*************************************************
+ * Name:        mlk_indcpa_keypair_derand
+ *
+ * Description: Key generation for the CPA-secure PKE scheme underlying ML-KEM
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                             (of length MLKEM_INDCPA_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                             (of length MLKEM_INDCPA_SECRETKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                             (of length MLKEM_SYMBYTES bytes)
+ *
+ * Returns:     0 on success, else an MLK_ERR_* code (see contract below)
+ *
+ * Specification: Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen)].
+ **************************************************/
+MLK_INTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                              uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                              const uint8_t coins[MLKEM_SYMBYTES],
+                              MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY ||
+          return_value == MLK_ERR_RNG_FAIL)
+);
+
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
+/*************************************************
+ * Name:        mlk_indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure
+ *              public-key encryption scheme underlying ML-KEM.
+ *
+ * Arguments:   - uint8_t *c: pointer to output ciphertext
+ *                            (of length MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: pointer to input message
+ *                                  (of length MLKEM_INDCPA_MSGBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                                   (of length MLKEM_INDCPA_PUBLICKEYBYTES)
+ *              - const uint8_t *coins: seed (of length MLKEM_SYMBYTES) from
+ *                 which all randomness is deterministically generated
+ *
+ * Returns:     0 on success, else an MLK_ERR_* code (see contract below)
+ *
+ * Specification: Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt)].
+ **************************************************/
+MLK_INTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                   const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                   const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                   const uint8_t coins[MLKEM_SYMBYTES],
+                   MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
+/*************************************************
+ * Name:        mlk_indcpa_dec
+ *
+ * Description: Decryption for the CPA-secure PKE scheme underlying ML-KEM
+ *
+ * Arguments:   - uint8_t *m: pointer to output decrypted message
+ *                            (of length MLKEM_INDCPA_MSGBYTES)
+ *              - const uint8_t *c: pointer to input ciphertext
+ *                                  (of length MLKEM_INDCPA_BYTES)
+ *              - const uint8_t *sk: pointer to input secret key
+ *                                   (of length MLKEM_INDCPA_SECRETKEYBYTES)
+ *
+ * Returns:     0 on success, else an MLK_ERR_* code (see contract below)
+ *
+ * Specification: Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt)].
+ **************************************************/
+MLK_INTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                   const uint8_t c[MLKEM_INDCPA_BYTES],
+                   const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                   MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+#endif /* !MLK_INDCPA_H */
diff --git a/mlkem_native/src/kem.c b/mlkem_native/src/kem.c
new file mode 100644
index 0000000..3c82d6d
--- /dev/null
+++ b/mlkem_native/src/kem.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS140_3_IG]
+ *   Implementation Guidance for FIPS 140-3 and the Cryptographic Module
+ *   Validation Program
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#include "kem.h"
+
+#include "indcpa.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "verify.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
+/* End of parameter set namespacing */
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                     MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0; /* overwritten by the comparison result further down */
+  MLK_ALLOC(p, mlk_polyvec, 1, context);
+  MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+  if (p == NULL || p_reencoded == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  mlk_polyvec_frombytes(p, pk); /* decode the polynomial-vector part of pk */
+  mlk_polyvec_reduce(p);        /* bring the coefficients into reduced form */
+  mlk_polyvec_tobytes(p_reencoded, p); /* re-encode for the comparison */
+
+  /* The check passes iff re-encoding reproduces the first MLKEM_POLYVECBYTES
+   * bytes of pk. Constant-time memcmp: pk need not be declassified here. */
+  ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+  MLK_FREE(p, mlk_polyvec, 1, context);
+  return ret;
+}
+
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                     MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0;
+  MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+  if (test == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  /*
+   * The parts of `sk` being hashed and compared here are public, so
+   * no secret information is leaked through the runtime or the return value
+   * of this function.
+   */
+
+  /* Declassify the public parts of the secret key: embedded pk and H(pk) */
+  MLK_CT_TESTING_DECLASSIFY(sk + MLKEM_INDCPA_SECRETKEYBYTES,
+                            MLKEM_INDCCA_PUBLICKEYBYTES);
+  MLK_CT_TESTING_DECLASSIFY(
+      sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+
+  mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
+             MLKEM_INDCCA_PUBLICKEYBYTES);
+  /* This doesn't have to be a constant-time memcmp, but it's the only place
+   * in the library where a normal memcmp would be used otherwise, so for sake
+   * of minimizing stdlib dependency, we use our constant-time one anyway. */
+  ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+                      test, MLKEM_SYMBYTES)
+            ? MLK_ERR_FAIL
+            : 0;
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+  return ret;
+}
+
+MLK_MUST_CHECK_RETURN_VALUE
+static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                         uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                         MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY ||
+          return_value == MLK_ERR_RNG_FAIL)
+);
+/* Contract-bearing declaration; defined below per MLK_CONFIG_KEYGEN_PCT. */
+#if defined(MLK_CONFIG_KEYGEN_PCT)
+/* Specification:
+ * Partially implements 'Pairwise Consistency Test' @[FIPS140_3_IG, p.87] and
+ * @[FIPS203, Section 7.1, Pairwise Consistency]. */
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
+static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                         uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                         MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0;
+  MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+  MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+  MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+  if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  ret = mlk_kem_enc(ct, ss_enc, pk, context); /* may yield MLK_ERR_RNG_FAIL */
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+  ret = mlk_kem_dec(ss_dec, ct, sk, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+#if defined(MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST)
+  /* Deliberately break PCT for testing purposes */
+  if (mlk_break_pct())
+  {
+    ss_enc[0] = ~ss_enc[0]; /* corrupt one byte so the comparison fails */
+  }
+#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
+
+  ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES); /* nonzero on mismatch */
+  /* The result of the PCT is public. */
+  MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+  if (ret != 0)
+  {
+    ret = MLK_ERR_FAIL;
+  }
+
+cleanup:
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+  MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+  MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+  return ret;
+}
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
+static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                         uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                         MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  /* PCT not configured: ignore all arguments and always report success. */
+  ((void)pk);
+  ((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+  ((void)context);
+#endif
+  return 0;
+}
+#endif /* !MLK_CONFIG_KEYGEN_PCT */
+
+/* Reference: `crypto_kem_keypair_derand()` in the reference implementation
+ *            @[REF].
+ *            - We optionally include PCT which is not present in
+ *              the reference code. */
+MLK_EXTERNAL_API
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                           const uint8_t coins[2 * MLKEM_SYMBYTES],
+                           MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret;
+
+  ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+  /* Complete sk: append pk, then H(pk), then z (second half of coins). */
+  mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+  mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
+             MLKEM_INDCCA_PUBLICKEYBYTES);
+  /* Value z for pseudo-random output on reject */
+  mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+             coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+
+  /* Declassify public key */
+  MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+
+  /* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
+  ret = mlk_check_pct(pk, sk, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+cleanup:
+  if (ret != 0) /* on failure, do not leave key material in the outputs */
+  {
+    mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+    mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+  }
+
+  return ret;
+}
+
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
+/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
+ *            - We zeroize the stack buffer */
+MLK_EXTERNAL_API
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                    uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0;
+  MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context); /* keygen input */
+
+  if (coins == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  /* Acquire necessary randomness, and mark it as secret. */
+  if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+  {
+    ret = MLK_ERR_RNG_FAIL;
+    goto cleanup;
+  }
+
+  MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
+
+  ret = mlk_kem_keypair_derand(pk, sk, coins, context); /* passed through */
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  return ret;
+}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
+
+/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
+ *            - We include public key check
+ *            - We include stack buffer zeroization */
+MLK_EXTERNAL_API
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+                       uint8_t ss[MLKEM_SSBYTES],
+                       const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                       const uint8_t coins[MLKEM_SYMBYTES],
+                       MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0;
+  MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+  if (buf == NULL || kr == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  /* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
+  ret = mlk_kem_check_pk(pk, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+  mlk_memcpy(buf, coins, MLKEM_SYMBYTES); /* first half of buf: coins */
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+  mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES); /* kr = G(coins || H(pk)) */
+
+  /* message is at the start of buf; coins are in kr+MLKEM_SYMBYTES */
+  ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+  mlk_memcpy(ss, kr, MLKEM_SYMBYTES); /* shared secret: first half of kr */
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  return ret;
+}
+
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
+/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
+ *            - We include stack buffer zeroization */
+MLK_EXTERNAL_API
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+                uint8_t ss[MLKEM_SSBYTES],
+                const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0;
+  MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context); /* encaps randomness */
+
+  if (coins == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0) /* nonzero: RNG failure */
+  {
+    ret = MLK_ERR_RNG_FAIL;
+    goto cleanup;
+  }
+
+  MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
+
+  ret = mlk_kem_enc_derand(ct, ss, pk, coins, context); /* passed through */
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+  return ret;
+}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
+
+/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
+ *            - We include secret key check
+ *            - We include stack buffer zeroization */
+MLK_EXTERNAL_API
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+                const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+                const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+{
+  int ret = 0;
+  uint8_t fail; /* nonzero iff the re-encrypted ciphertext differs from ct */
+  const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES; /* pk embedded in sk */
+  MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+            context);
+
+  if (buf == NULL || kr == NULL || tmp == NULL)
+  {
+    ret = MLK_ERR_OUT_OF_MEMORY;
+    goto cleanup;
+  }
+
+  /* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
+  ret = mlk_kem_check_sk(sk, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+  ret = mlk_indcpa_dec(buf, ct, sk, context); /* message -> start of buf */
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  mlk_memcpy(buf + MLKEM_SYMBYTES,
+             sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+             MLKEM_SYMBYTES);
+  mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES); /* kr = G(message || H(pk)) */
+
+  /* Recompute and compare ciphertext */
+  /* coins are in kr+MLKEM_SYMBYTES */
+  ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+  if (ret != 0)
+  {
+    goto cleanup;
+  }
+
+  fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
+
+  /* Compute rejection key: ss = J(z || ct), z being the tail of sk */
+  mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+             MLKEM_SYMBYTES);
+  mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+  mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
+  /* Copy true key to return buffer if fail is 0. A mismatch is thus not
+   * reported via ret; ss then keeps the rejection key. */
+  mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+
+cleanup:
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+           context);
+  MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+  MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+  return ret;
+}
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mlk_check_pct
diff --git a/mlkem_native/src/kem.h b/mlkem_native/src/kem.h
new file mode 100644
index 0000000..0502715
--- /dev/null
+++ b/mlkem_native/src/kem.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#ifndef MLK_KEM_H
+#define MLK_KEM_H
+
+#include "cbmc.h"
+#include "common.h"
+#include "sys.h"
+
+#if defined(MLK_CHECK_APIS)
+/* Include to ensure consistency between internal kem.h
+ * and external mlkem_native.h. */
+#include "mlkem_native.h"
+
+#if MLKEM_INDCCA_SECRETKEYBYTES != \
+    MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
+#error Mismatch for SECRETKEYBYTES between kem.h and mlkem_native.h
+#endif
+
+#if MLKEM_INDCCA_PUBLICKEYBYTES != \
+    MLKEM_PUBLICKEYBYTES(MLK_CONFIG_PARAMETER_SET)
+#error Mismatch for PUBLICKEYBYTES between kem.h and mlkem_native.h
+#endif
+
+#if MLKEM_INDCCA_CIPHERTEXTBYTES != \
+    MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_PARAMETER_SET)
+#error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
+#endif
+
+#endif /* MLK_CHECK_APIS */
+
+#define mlk_kem_keypair_derand \
+  MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name:        mlk_kem_check_pk
+ *
+ * Description: Implements the modulus check mandated by FIPS 203,
+ *              i.e., ensures that the encoded coefficients are in [0,q-1].
+ *
+ * Arguments:   - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ *                 bytes)
+ *
+ * Returns: - 0: If the modulus check passed.
+ *          - MLK_ERR_FAIL: If the modulus check failed.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                     MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name:        mlk_kem_check_sk
+ *
+ * Description: Implements the public key hash check mandated by FIPS 203,
+ *              i.e., ensures that
+ *              sk[768k+32 : 768k+64] = H(pk) = H(sk[384k : 768k+32])
+ *
+ * Arguments:   - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ *                 bytes)
+ *
+ * Returns: - 0: If the public key hash check passed.
+ *          - MLK_ERR_FAIL: If the public key hash check failed.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                     MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+/*************************************************
+ * Name:        mlk_kem_keypair_derand
+ *
+ * Description: Derandomized key generation for the CCA-secure ML-KEM
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ *                 bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ *                 bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array of 2*MLKEM_SYMBYTES random bytes)
+ *
+ * Returns:     - 0: On success
+ *              - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ *                  PCT failed.
+ *              - MLK_ERR_RNG_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and
+ *                  random number generation within the PCT failed.
+ *              - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *                  used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
+ *
+ **************************************************/
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                           const uint8_t coins[2 * MLKEM_SYMBYTES],
+                           MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
+  assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY ||
+          return_value == MLK_ERR_RNG_FAIL)
+);
+
+/*************************************************
+ * Name:        mlk_kem_keypair
+ *
+ * Description: Generates a key pair for the CCA-secure ML-KEM, drawing the
+ *              required 2*MLKEM_SYMBYTES random bytes via mlk_randombytes().
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ *                 bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ *                 bytes)
+ *
+ * Returns:     - 0: On success
+ *              - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *                  used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *              - MLK_ERR_RNG_FAIL: Random number generation failed.
+ *              - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ *                  PCT failed.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
+ *
+ **************************************************/
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                    uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                    MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY ||
+          return_value == MLK_ERR_RNG_FAIL)
+);
+
+/*************************************************
+ * Name:        mlk_kem_enc_derand
+ *
+ * Description: Generates ciphertext and shared secret for a given
+ *              public key from caller-provided randomness
+ *
+ * Arguments:   - uint8_t *ct: pointer to output ciphertext
+ *                (an already allocated array of MLKEM_INDCCA_CIPHERTEXTBYTES
+ *                 bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ *                 bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with MLKEM_SYMBYTES random
+ *                 bytes)
+ *
+ * Returns: - 0: On success
+ *          - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ *              for the public key fails.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
+ *
+ **************************************************/
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+                       uint8_t ss[MLKEM_SSBYTES],
+                       const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                       const uint8_t coins[MLKEM_SYMBYTES],
+                       MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+  assigns(memory_slice(ss, MLKEM_SSBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+/*************************************************
+ * Name:        mlk_kem_enc
+ *
+ * Description: Generates ciphertext and shared secret for a given public
+ *              key, drawing MLKEM_SYMBYTES bytes via mlk_randombytes().
+ *
+ * Arguments:   - uint8_t *ct: pointer to output ciphertext
+ *                (an already allocated array of MLKEM_INDCCA_CIPHERTEXTBYTES
+ *                 bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ *                 bytes)
+ *
+ * Returns: - 0: On success
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *          - MLK_ERR_RNG_FAIL: Random number generation failed.
+ *          - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ *              for the public key fails.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
+ *
+ **************************************************/
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+                uint8_t ss[MLKEM_SSBYTES],
+                const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+                MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+  assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+  assigns(memory_slice(ss, MLKEM_SSBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY ||
+          return_value == MLK_ERR_RNG_FAIL)
+);
+
+/*************************************************
+ * Name:        mlk_kem_dec
+ *
+ * Description: Generates shared secret for given ciphertext and private key.
+ *              A ciphertext mismatch is no error: ss is a rejection key then.
+ *
+ * Arguments:   - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *ct: pointer to input ciphertext
+ *                (an already allocated array of MLKEM_INDCCA_CIPHERTEXTBYTES
+ *                 bytes)
+ *              - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ *                 bytes)
+ *
+ * Returns: - 0: On success
+ *          - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ *              for the secret key fails.
+ *          - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ *              used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
+ *
+ **************************************************/
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+                const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+                const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+                MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+  assigns(memory_slice(ss, MLKEM_SSBYTES))
+  ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+          return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+#endif /* !MLK_KEM_H */
diff --git a/mlkem_native/src/params.h b/mlkem_native/src/params.h
new file mode 100644
index 0000000..0459853
--- /dev/null
+++ b/mlkem_native/src/params.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_PARAMS_H
+#define MLK_PARAMS_H
+
+#if !defined(MLK_CONFIG_PARAMETER_SET)
+#error MLK_CONFIG_PARAMETER_SET is not defined
+#endif
+
+#if MLK_CONFIG_PARAMETER_SET == 512
+#define MLKEM_K 2
+#elif MLK_CONFIG_PARAMETER_SET == 768
+#define MLKEM_K 3
+#elif MLK_CONFIG_PARAMETER_SET == 1024
+#define MLKEM_K 4
+#else
+#error Invalid value for MLK_CONFIG_PARAMETER_SET. Must be 512, 768, or 1024.
+#endif
+
+#define MLKEM_N 256
+#define MLKEM_Q 3329
+#define MLKEM_Q_HALF ((MLKEM_Q + 1) / 2) /* 1665 */
+#define MLKEM_UINT12_LIMIT 4096
+
+#define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
+#define MLKEM_SSBYTES 32  /* size in bytes of shared key */
+
+#define MLKEM_POLYBYTES 384
+#define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
+
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
+#if MLKEM_K == 2
+#define MLKEM_ETA1 3
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 3
+#define MLKEM_ETA1 2
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 4
+#define MLKEM_ETA1 2
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#endif /* MLKEM_K == 4 */
+
+#define MLKEM_ETA2 2
+
+#define MLKEM_INDCPA_MSGBYTES (MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_PUBLICKEYBYTES (MLKEM_POLYVECBYTES + MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_SECRETKEYBYTES (MLKEM_POLYVECBYTES)
+#define MLKEM_INDCPA_BYTES \
+  (MLKEM_POLYVECCOMPRESSEDBYTES_DU + MLKEM_POLYCOMPRESSEDBYTES_DV)
+
+#define MLKEM_INDCCA_PUBLICKEYBYTES (MLKEM_INDCPA_PUBLICKEYBYTES)
+/* Additional 2 * 32 bytes hold H(pk) and the implicit-rejection value z */
+#define MLKEM_INDCCA_SECRETKEYBYTES                            \
+  (MLKEM_INDCPA_SECRETKEYBYTES + MLKEM_INDCPA_PUBLICKEYBYTES + \
+   2 * MLKEM_SYMBYTES)
+#define MLKEM_INDCCA_CIPHERTEXTBYTES (MLKEM_INDCPA_BYTES)
+
+#endif /* !MLK_PARAMS_H */
diff --git a/mlkem_native/src/poly.c b/mlkem_native/src/poly.c
new file mode 100644
index 0000000..564d5d7
--- /dev/null
+++ b/mlkem_native/src/poly.c
@@ -0,0 +1,572 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [NeonNTT]
+ *   Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+ *   Becker, Hwang, Kannwischer, Yang, Yang
+ *   https://eprint.iacr.org/2021/986
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#include "common.h"
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+
+#include "cbmc.h"
+#include "debug.h"
+#include "poly.h"
+#include "sampling.h"
+#include "symmetric.h"
+#include "verify.h"
+
+/*************************************************
+ * Name:        mlk_fqmul
+ *
+ * Description: Montgomery multiplication modulo MLKEM_Q
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(MLKEM_Q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod MLKEM_Q, and
+ * smaller than MLKEM_Q in absolute value.
+ *
+ **************************************************/
+
+/* Reference: `fqmul()` in the reference implementation @[REF]. */
+static MLK_INLINE int16_t mlk_fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -MLKEM_Q_HALF && b < MLKEM_Q_HALF)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
+{
+  int16_t res;
+  mlk_assert_abs_bound(&b, 1, MLKEM_Q_HALF);
+
+  res = mlk_montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
+
+  mlk_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
+}
+
+/*************************************************
+ * Name:        mlk_barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ *
+ **************************************************/
+
+/* Reference: `barrett_reduce()` in the reference implementation @[REF]. */
+static MLK_INLINE int16_t mlk_barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -MLKEM_Q_HALF && return_value < MLKEM_Q_HALF)
+)
+{
+  /* Barrett reduction approximates
+   * ```
+   *     round(a/MLKEM_Q)
+   *   = round(a*(2^N/MLKEM_Q)/2^N)
+   *  ~= round(a*round(2^N/MLKEM_Q)/2^N)
+   * ```
+   * Here, we pick N=26.
+   */
+  const int32_t magic = 20159; /* check-magic: 20159 == round(2^26 / MLKEM_Q) */
+
+  /*
+   * PORTABILITY: Right-shift on a signed integer is
+   * implementation-defined for negative left argument.
+   * Here, we assume it's sign-preserving "arithmetic" shift right.
+   * See (C99 6.5.7 (5))
+   */
+  const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
+
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
+  return res;
+}
+
+/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
+{
+  unsigned i;
+  const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
+  {
+    r->coeffs[i] = mlk_fqmul(r->coeffs[i], f);
+  }
+
+  mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_tomont(mlk_poly *r)
+{
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+  int ret;
+  ret = mlk_poly_tomont_native(r->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+    return;
+  }
+#endif /* MLK_USE_NATIVE_POLY_TOMONT */
+
+  mlk_poly_tomont_c(r);
+}
+
+/************************************************************
+ * Name: mlk_scalar_signed_to_unsigned_q
+ *
+ * Description: Constant-time conversion of signed representatives
+ *              modulo MLKEM_Q within range (-(MLKEM_Q-1) .. (MLKEM_Q-1))
+ *              into unsigned representatives within range (0..(MLKEM_Q-1)).
+ *
+ * Arguments: c: signed coefficient to be converted
+ *
+ ************************************************************/
+
+/* Reference: Not present in the reference implementation @[REF].
+ *            - Used here to implement different semantics of `poly_reduce()`;
+ *              see below. In the reference implementation @[REF], this logic is
+ *              part of all compression functions (see `compress.c`). */
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  mlk_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add MLKEM_Q if c is negative, but in constant time.
+   *
+   * Note that c + MLKEM_Q does not overflow in int16_t,
+   * so the cast to int16_t is safe. */
+  c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
+
+  mlk_assert_bound(&c, 1, 0, MLKEM_Q);
+  return c;
+}
+
+/* Reference: `poly_reduce()` in the reference implementation @[REF]
+ *            - We use _unsigned_ canonical outputs, while the reference
+ *              implementation uses _signed_ canonical outputs.
+ *              Accordingly, we need a conditional addition of MLKEM_Q
+ *              here to go from signed to unsigned representatives.
+ *              This conditional addition is then dropped from all
+ *              polynomial compression functions instead (see `compress.c`). */
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
+{
+  unsigned i;
+
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i <= MLKEM_N)
+    invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
+  {
+    /* Barrett reduction, giving signed canonical representative */
+    int16_t t = mlk_barrett_reduce(r->coeffs[i]);
+    /* Conditional addition to get unsigned canonical representative */
+    r->coeffs[i] = mlk_scalar_signed_to_unsigned_q(t);
+  }
+
+  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_reduce(mlk_poly *r)
+{
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+  int ret;
+  ret = mlk_poly_reduce_native(r->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+    return;
+  }
+#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+
+  mlk_poly_reduce_c(r);
+}
+
+/* Reference: `poly_add()` in the reference implementation @[REF].
+ *            - We use destructive version (output=first input) to avoid
+ *              reasoning about aliasing in the CBMC specification */
+MLK_INTERNAL_API
+void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i <= MLKEM_N)
+    invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
+  {
+    /* The preconditions imply that the addition stays within int16_t. */
+    r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
+  }
+}
+
+/* Reference: `poly_sub()` in the reference implementation @[REF].
+ *            - We use destructive version (output=first input) to avoid
+ *              reasoning about aliasing in the CBMC specification */
+MLK_INTERNAL_API
+void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i <= MLKEM_N)
+    invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
+  {
+    /* The preconditions imply that the subtraction stays within int16_t. */
+    r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
+  }
+}
+
+#include "zetas.inc"
+
+/* Reference: Does not exist in the reference implementation @[REF].
+ *            - The reference implementation does not use a
+ *              multiplication cache ('mulcache'). This idea originates
+ *              from @[NeonNTT] and is used at the C level here. */
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+                                                     const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
+  {
+    x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+    /* The values in zeta table are <= MLKEM_Q in absolute value,
+     * so the negation in int16_t is safe. */
+    x->coeffs[2 * i + 1] =
+        mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
+  }
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+{
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+  int ret;
+  ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    return;
+  }
+#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+  mlk_poly_mulcache_compute_c(x, a);
+}
+
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+
+/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
+static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                    unsigned start, unsigned len,
+                                    unsigned bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-MLKEM_Q_HALF < zeta && zeta < MLKEM_Q_HALF)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = mlk_fqmul(r[j + len], zeta);
+    /* The precondition implies that the arithmetic does not overflow. */
+    r[j + len] = (int16_t)(r[j] - t);
+    r[j] = (int16_t)(r[j] + t);
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - layer: Variable indicating which layer is being applied.
+ */
+
+/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
+static void mlk_ntt_layer(int16_t r[MLKEM_N], unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7)
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k, len;
+  /* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
+  k = 1u << (layer - 1);
+  len = (unsigned)MLKEM_N >> layer;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = mlk_zetas[k++];
+    mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+/* Reference: `ntt()` in the reference implementation @[REF].
+ * - Iterate over `layer` instead of `len` in the outer loop
+ *   to simplify computation of zeta index. */
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+  requires(memory_no_alias(p, sizeof(mlk_poly)))
+  requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(p, sizeof(mlk_poly)))
+  ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
+{
+  unsigned layer;
+  int16_t *r;
+
+  mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
+  r = p->coeffs;
+
+  for (layer = 1; layer <= 7; layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8)
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    mlk_ntt_layer(r, layer);
+  }
+
+  /* Check the stronger bound */
+  mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_ntt(mlk_poly *p)
+{
+#if defined(MLK_USE_NATIVE_NTT)
+  int ret;
+  mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ret = mlk_ntt_native(p->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+    return;
+  }
+#endif /* MLK_USE_NATIVE_NTT */
+
+  mlk_poly_ntt_c(p);
+}
+
+
+/* Compute one layer of inverse NTT */
+
+/* Reference: Embedded into `invntt()` in the reference implementation @[REF] */
+static void mlk_invntt_layer(int16_t *r, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7)
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k, len;
+  len = (unsigned)MLKEM_N >> layer;
+  k = (1u << layer) - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = mlk_zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      /* The preconditions imply that the arithmetic does not overflow. */
+      r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+      r[j + len] = (int16_t)(r[j + len] - t);
+      r[j + len] = mlk_fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+/* Reference: `invntt()` in the reference implementation @[REF]
+ *            - We normalize at the beginning of the inverse NTT,
+ *              while the reference implementation normalizes at
+ *              the end. This allows us to drop a call to `poly_reduce()`
+ *              from the base multiplication. */
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+  requires(memory_no_alias(p, sizeof(mlk_poly)))
+  assigns(memory_slice(p, sizeof(mlk_poly)))
+  ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
+{
+  unsigned j, layer;
+  const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+  int16_t *r = p->coeffs;
+
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = mlk_fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (layer = 7; layer > 0; layer--)
+  __loop__(
+    invariant(0 <= layer && layer < 8)
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    mlk_invntt_layer(r, layer);
+  }
+
+  mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+}
+
+MLK_INTERNAL_API
+void mlk_poly_invntt_tomont(mlk_poly *p)
+{
+#if defined(MLK_USE_NATIVE_INTT)
+  int ret;
+  ret = mlk_intt_native(p->coeffs);
+  if (ret == MLK_NATIVE_FUNC_SUCCESS)
+  {
+    mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+    return;
+  }
+#endif /* MLK_USE_NATIVE_INTT */
+
+  mlk_poly_invntt_tomont_c(p);
+}
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(mlk_poly)
+
+#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mlkem_native/src/poly.h b/mlkem_native/src/poly.h
new file mode 100644
index 0000000..587062c
--- /dev/null
+++ b/mlkem_native/src/poly.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#ifndef MLK_POLY_H
+#define MLK_POLY_H
+
+
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "verify.h"
+
+/* Absolute exclusive upper bound for the output of the inverse NTT */
+#define MLK_INVNTT_BOUND (8 * MLKEM_Q)
+
+/* Absolute exclusive upper bound for the output of the forward NTT */
+#define MLK_NTT_BOUND (8 * MLKEM_Q)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N];
+} MLK_ALIGN mlk_poly;
+
+/*
+ * INTERNAL presentation of precomputed data speeding up
+ * the base multiplication of two polynomials in NTT domain.
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N >> 1];
+} MLK_ALIGN mlk_poly_mulcache;
+
+/*************************************************
+ * Name:        mlk_montgomery_reduce
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced, of absolute value
+ *                strictly smaller than INT32_MAX - 2^15 * MLKEM_Q.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
+__contract__(
+    requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
+             a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+    /* We don't attempt to express an input-dependent output bound
+     * as the post-condition here. There are two call-sites for this
+     * function:
+     * - The base multiplication: Here, we need no output bound.
+     * - mlk_fqmul: Here, we inline this function and prove another spec
+     *          for mlk_fqmul which does have a post-condition bound. */
+)
+{
+  /* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
+  const uint32_t QINV = 62209;
+
+  /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+  const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
+
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = mlk_cast_uint16_to_int16(a_inverted);
+
+  int32_t r;
+
+  mlk_assert(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
+             a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)));
+
+  r = a - ((int32_t)t * MLKEM_Q);
+
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> n| = ceil(|a| / 2^n) for negative a)
+   */
+  return (int16_t)r;
+}
+
+#define mlk_poly_tomont MLK_NAMESPACE(poly_tomont)
+/*************************************************
+ * Name:        mlk_poly_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ * Arguments:   - mlk_poly *r: pointer to input/output polynomial
+ *
+ * Specification: Internal normalization required in `mlk_indcpa_keypair_derand`
+ *                as part of matrix-vector multiplication
+ *                @[FIPS203, Algorithm 13, K-PKE.KeyGen, L18].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_tomont(mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+);
+
+#define mlk_poly_mulcache_compute MLK_NAMESPACE(poly_mulcache_compute)
+/************************************************************
+ * Name: mlk_poly_mulcache_compute
+ *
+ * Description: Computes the mulcache for a polynomial in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial
+ *
+ * Specification:
+ * - Caches `b_1 * \gamma` in @[FIPS203, Algorithm 12, BaseCaseMultiply, L1]
+ *
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLK_INTERNAL_API
+void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+);
+
+#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
+/*************************************************
+ * Name:        mlk_poly_reduce
+ *
+ * Description: Converts polynomial to _unsigned canonical_ representatives.
+ *
+ *              The input coefficients can be arbitrary integers in int16_t.
+ *              The output coefficients are in [0,1,...,MLKEM_Q-1].
+ *
+ * Arguments:   - mlk_poly *r: pointer to input/output polynomial
+ *
+ * Specification: Normalizes on unsigned canonical representatives
+ *                ahead of calling @[FIPS203, Compress_d, Eq (4.7)].
+ *                This is not made explicit in FIPS 203.
+ *
+ **************************************************/
+/*
+ * NOTE: The semantics of mlk_poly_reduce() is different in
+ * the reference implementation, which requires
+ * signed canonical output data. Unsigned canonical
+ * outputs are better suited to the only remaining
+ * use of mlk_poly_reduce() in the context of (de)serialization.
+ */
+MLK_INTERNAL_API
+void mlk_poly_reduce(mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define mlk_poly_add MLK_NAMESPACE(poly_add)
+/************************************************************
+ * Name: mlk_poly_add
+ *
+ * Description: Adds two polynomials in place
+ *
+ * Arguments: - r: Pointer to input-output polynomial to be added to.
+ *            - b: Pointer to input polynomial that should be added
+ *                 to r. Must be disjoint from r.
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * Specification:
+ * - @[FIPS203, 2.4.5, Arithmetic With Polynomials and NTT Representations]
+ * - Used in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L21]
+ *
+ ************************************************************/
+/*
+ * NOTE: The reference implementation uses a 3-argument mlk_poly_add.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+MLK_INTERNAL_API
+void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(b, sizeof(mlk_poly)))
+  requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] + b->coeffs[k0] <= INT16_MAX))
+  requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] + b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] + b->coeffs[k]))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+);
+
+#define mlk_poly_sub MLK_NAMESPACE(poly_sub)
+/*************************************************
+ * Name:        mlk_poly_sub
+ *
+ * Description: Subtract two polynomials; no modular reduction is performed
+ *
+ * Arguments: - mlk_poly *r: Pointer to input-output polynomial to subtract from.
+ *            - const mlk_poly *b: Pointer to second input polynomial
+ *
+ * Specification:
+ * - @[FIPS203, 2.4.5, Arithmetic With Polynomials and NTT Representations]
+ * - Used in @[FIPS203, Algorithm 15, K-PKE.Decrypt, L6]
+ *
+ **************************************************/
+/*
+ * NOTE: The reference implementation uses a 3-argument mlk_poly_sub.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+MLK_INTERNAL_API
+void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(b, sizeof(mlk_poly)))
+  requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
+  requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+);
+
+#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        mlk_poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by MLK_NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - mlk_poly *r: pointer to in/output polynomial
+ *
+ * Specification: Implements @[FIPS203, Algorithm 9, NTT]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_ntt(mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+);
+
+#define mlk_poly_invntt_tomont MLK_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        mlk_poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by MLK_INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - mlk_poly *r: pointer to in/output polynomial
+ *
+ * Specification: Implements composition of @[FIPS203, Algorithm 10, NTT^{-1}]
+ *                and elementwise modular multiplication with a suitable
+ *                Montgomery factor introduced during the base multiplication.
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_invntt_tomont(mlk_poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+);
+
+#endif /* !MLK_POLY_H */
diff --git a/mlkem_native/src/poly_k.c b/mlkem_native/src/poly_k.c
new file mode 100644
index 0000000..32b214e
--- /dev/null
+++ b/mlkem_native/src/poly_k.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [NeonNTT]
+ *   Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+ *   Becker, Hwang, Kannwischer, Yang, Yang
+ *   https://eprint.iacr.org/2021/986
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#include "poly_k.h"
+
+#include "debug.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
+#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+  MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
+/* End of parameter set namespacing */
+
+/* Reference: `polyvec_compress()` in the reference implementation @[REF]
+ *            - In contrast to the reference implementation, we assume
+ *              unsigned canonical coefficients here.
+ *              The reference implementation works with coefficients
+ *              in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_INTERNAL_API
+void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                             const mlk_polyvec *a)
+{
+  unsigned i;
+  mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
+MLK_INTERNAL_API
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
+                               const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
+ *            - In contrast to the reference implementation, we assume
+ *              unsigned canonical coefficients here.
+ *              The reference implementation works with coefficients
+ *              in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
+MLK_INTERNAL_API
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
+{
+  unsigned i;
+  mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+    assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+    invariant(i <= MLKEM_K)
+  )
+  {
+    mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
+  }
+}
+
+/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
+MLK_INTERNAL_API
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+
+  mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+}
+
+/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
+MLK_INTERNAL_API
+void mlk_polyvec_ntt(mlk_polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_ntt(&r->vec[i]);
+  }
+
+  mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+}
+
+/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
+ *            - We normalize at the beginning of the inverse NTT,
+ *              while the reference implementation normalizes at
+ *              the end. This allows us to drop a call to `poly_reduce()`
+ *              from the base multiplication. */
+MLK_INTERNAL_API
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_invntt_tomont(&r->vec[i]);
+  }
+
+  mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+}
+
+/* Reference: `polyvec_basemul_acc_montgomery()` in the
+ *            reference implementation @[REF].
+ *            - We use a multiplication cache ('mulcache') here
+ *              which is not present in the reference implementation @[REF].
+ *              This idea originates from @[NeonNTT] and is used
+ *              at the C level here.
+ *            - We compute the coefficients of the scalar product in 32-bit
+ *              coefficients and perform only a single modular reduction
+ *              at the end. The reference implementation uses 2 * MLKEM_K
+ *              more modular reductions since it reduces after every modular
+ *              multiplication. */
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+    mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+    const mlk_polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+)
+{
+  unsigned i;
+  mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    unsigned k;
+    int32_t t[2] = {0};
+    for (k = 0; k < MLKEM_K; k++)
+    __loop__(
+      invariant(k <= MLKEM_K &&
+         t[0] <=    (int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768  &&
+         t[0] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
+         t[1] <=   ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
+         t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
+    {
+      t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+      t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+      t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+      t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
+    }
+    r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
+    r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
+  }
+}
+
+MLK_INTERNAL_API
+void mlk_polyvec_basemul_acc_montgomery_cached(
+    mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+    const mlk_polyvec_mulcache *b_cache)
+{
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+  {
+    int ret;
+    mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+#if MLKEM_K == 2
+    ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+        r->coeffs, (const int16_t *)a, (const int16_t *)b,
+        (const int16_t *)b_cache);
+#elif MLKEM_K == 3
+    ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+        r->coeffs, (const int16_t *)a, (const int16_t *)b,
+        (const int16_t *)b_cache);
+#elif MLKEM_K == 4
+    ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+        r->coeffs, (const int16_t *)a, (const int16_t *)b,
+        (const int16_t *)b_cache);
+#endif
+    if (ret == MLK_NATIVE_FUNC_SUCCESS)
+    {
+      return;
+    }
+  }
+#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+  /* Portable C fallback: no native backend, or it did not report success. */
+  mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
+/* Reference: Does not exist in the reference implementation @[REF].
+ *            - The reference implementation does not use a
+ *              multiplication cache ('mulcache'). This idea originates
+ *              from @[NeonNTT] and is used at the C level here. */
+MLK_INTERNAL_API
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+/* Reference: `polyvec_reduce()` in the reference implementation @[REF].
+ *            - We use _unsigned_ canonical outputs, while the reference
+ *              implementation uses _signed_ canonical outputs.
+ *              Accordingly, we need a conditional addition of MLKEM_Q
+ *              here to go from signed to unsigned representatives.
+ *              This conditional addition is then dropped from all
+ *              polynomial compression functions instead (see `compress.c`). */
+MLK_INTERNAL_API
+void mlk_polyvec_reduce(mlk_polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_reduce(&r->vec[i]);
+  }
+
+  mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+/* Reference: `polyvec_add()` in the reference implementation @[REF].
+ *            - We use a destructive version (output=first input) to avoid
+ *              reasoning about aliasing in the CBMC specification. */
+MLK_INTERNAL_API
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+    assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+    invariant(i <= MLKEM_K)
+    invariant(forall(j0, i, MLKEM_K,
+                forall(k0, 0, MLKEM_N,
+                       ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+                       ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+    invariant(forall(j2, 0, i,
+                forall(k2, 0, MLKEM_N,
+                       (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+                       (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+  )
+  {
+    mlk_poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
+MLK_INTERNAL_API
+void mlk_polyvec_tomont(mlk_polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    mlk_poly_tomont(&r->vec[i]);
+  }
+
+  mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        mlk_poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ *
+ * Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_eta1], where
+ *                eta1 is specified per parameter set in @[FIPS203, Table 2]
+ *                and represented as MLKEM_ETA1 here.
+ *
+ **************************************************/
+
+/* Reference: `poly_cbd_eta1` in the reference implementation @[REF]. */
+static MLK_INLINE void mlk_poly_cbd_eta1(
+    mlk_poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  mlk_poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  mlk_poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+/* Reference: Does not exist in the reference implementation @[REF].
+ *            - This implements a x4-batched version of `poly_getnoise_eta1()`
+ *              from the reference implementation, to leverage batched
+ *              Keccak-f1600. r3 may be NULL; then only r0-r2 are written. */
+MLK_INTERNAL_API
+void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
+                               mlk_poly *r3, const uint8_t seed[MLKEM_SYMBYTES],
+                               uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                               uint8_t nonce3)
+{
+  MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
+  MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
+  mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+    !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
+  mlk_prf_eta1_x4(buf, extkey);
+#else
+  mlk_prf_eta1(buf[0], extkey[0]);
+  mlk_prf_eta1(buf[1], extkey[1]);
+  mlk_prf_eta1(buf[2], extkey[2]);
+  if (r3 != NULL)
+  {
+    mlk_prf_eta1(buf[3], extkey[3]);
+  }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+          !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
+  mlk_poly_cbd_eta1(r0, buf[0]);
+  mlk_poly_cbd_eta1(r1, buf[1]);
+  mlk_poly_cbd_eta1(r2, buf[2]);
+  if (r3 != NULL)
+  {
+    mlk_poly_cbd_eta1(r3, buf[3]);
+    mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+  }
+
+  mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(buf, sizeof(buf));
+  mlk_zeroize(extkey, sizeof(extkey));
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        mlk_poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ *
+ * Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_eta2], where
+ *                eta2 is specified per parameter set in @[FIPS203, Table 2]
+ *                and represented as MLKEM_ETA2 here.
+ *
+ **************************************************/
+
+/* Reference: `poly_cbd_eta2` in the reference implementation @[REF]. */
+static MLK_INLINE void mlk_poly_cbd_eta2(
+    mlk_poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  mlk_poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
+ *            - We include buffer zeroization. */
+MLK_INTERNAL_API
+void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                            uint8_t nonce)
+{
+  MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  mlk_prf_eta2(buf, extkey);
+
+  mlk_poly_cbd_eta2(r, buf);
+
+  mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(buf, sizeof(buf));
+  mlk_zeroize(extkey, sizeof(extkey));
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+/* Reference: Does not exist in the reference implementation @[REF].
+ *            - This implements a x4-batched version of `poly_getnoise_eta1()`
+ *              and `poly_getnoise_eta2()` from the reference implementation,
+ *              leveraging batched Keccak-f1600.
+ *            - If a x4-batched Keccak-f1600 is available, we squeeze
+ *              more random data than needed for the eta2 calls, to be
+ *              able to use a x4-batched Keccak-f1600. */
+MLK_INTERNAL_API
+void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
+                                  mlk_poly *r3,
+                                  const uint8_t seed[MLKEM_SYMBYTES],
+                                  uint8_t nonce0, uint8_t nonce1,
+                                  uint8_t nonce2, uint8_t nonce3)
+{
+#if MLKEM_ETA2 >= MLKEM_ETA1
+#error mlk_poly_getnoise_eta1122_4x assumes MLKEM_ETA1 > MLKEM_ETA2
+#endif
+  MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
+  MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
+
+  mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  /* On systems with fast batched Keccak, we use 4-fold batched PRF,
+   * even though that means generating more random data in buf[2] and buf[3]
+   * than necessary. */
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+    !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
+  mlk_prf_eta1_x4(buf, extkey);
+#else
+  mlk_prf_eta1(buf[0], extkey[0]);
+  mlk_prf_eta1(buf[1], extkey[1]);
+  mlk_prf_eta2(buf[2], extkey[2]);
+  mlk_prf_eta2(buf[3], extkey[3]);
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+          !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
+  mlk_poly_cbd_eta1(r0, buf[0]);
+  mlk_poly_cbd_eta1(r1, buf[1]);
+  mlk_poly_cbd_eta2(r2, buf[2]);
+  mlk_poly_cbd_eta2(r3, buf[3]);
+
+  mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(buf, sizeof(buf));
+  mlk_zeroize(extkey, sizeof(extkey));
+}
+#endif /* MLKEM_K == 2 */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mlk_poly_cbd_eta1
+#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/mlkem_native/src/poly_k.h b/mlkem_native/src/poly_k.h
new file mode 100644
index 0000000..9089a8e
--- /dev/null
+++ b/mlkem_native/src/poly_k.h
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#ifndef MLK_POLY_K_H
+#define MLK_POLY_K_H
+
+#include "common.h"
+#include "compress.h"
+#include "poly.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mlk_polyvec MLK_ADD_PARAM_SET(mlk_polyvec)
+#define mlk_polymat MLK_ADD_PARAM_SET(mlk_polymat)
+#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
+/* End of parameter set namespacing */
+
+typedef struct
+{
+  mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+  mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+  mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
+
+#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        mlk_poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const mlk_poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_{d_u} (Compress_{d_u} (u))`
+ *                in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
+ *                with level-specific d_u defined in @[FIPS203, Table 2],
+ *                and given by MLKEM_DU here.
+ *
+ **************************************************/
+static MLK_INLINE void mlk_poly_compress_du(
+    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  mlk_poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  mlk_poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define mlk_poly_decompress_du MLK_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        mlk_poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of mlk_poly_compress_du
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ * Specification: Implements `Decompress_{d_u} (ByteDecode_{d_u} (u))`
+ *                in @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3],
+ *                with level-specific d_u defined in @[FIPS203, Table 2],
+ *                and given by MLKEM_DU here.
+ *
+ **************************************************/
+static MLK_INLINE void mlk_poly_decompress_du(
+    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  mlk_poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  mlk_poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define mlk_poly_compress_dv MLK_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        mlk_poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const mlk_poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_{d_v} (Compress_{d_v} (v))`
+ *                in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L23],
+ *                with level-specific d_v defined in @[FIPS203, Table 2],
+ *                and given by MLKEM_DV here.
+ *
+ **************************************************/
+static MLK_INLINE void mlk_poly_compress_dv(
+    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const mlk_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(mlk_poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
+{
+#if MLKEM_DV == 4
+  mlk_poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  mlk_poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define mlk_poly_decompress_dv MLK_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        mlk_poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of mlk_poly_compress_dv
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ * Specification: Implements `Decompress_{d_v} (ByteDecode_{d_v} (v))`
+ *                in @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L4],
+ *                with level-specific d_v defined in @[FIPS203, Table 2],
+ *                and given by MLKEM_DV here.
+ *
+ **************************************************/
+static MLK_INLINE void mlk_poly_decompress_dv(
+    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  mlk_poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  mlk_poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define mlk_polyvec_compress_du MLK_NAMESPACE_K(polyvec_compress_du)
+/*************************************************
+ * Name:        mlk_polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const mlk_polyvec a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ *
+ * Specification: Implements `ByteEncode_{d_u} (Compress_{d_u} (u))`
+ *                in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
+ *                with level-specific d_u defined in @[FIPS203, Table 2],
+ *                and given by MLKEM_DU here.
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                             const mlk_polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+);
+
+#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
+/*************************************************
+ * Name:        mlk_polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of mlk_polyvec_compress_du
+ *
+ * Arguments:   - mlk_polyvec r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *
+ * Specification: Implements `Decompress_{d_u} (ByteDecode_{d_u} (u))`
+ *                in @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3],
+ *                with level-specific d_u defined in @[FIPS203, Table 2],
+ *                and given by MLKEM_DU here.
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
+                               const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+  ensures(forall(k0, 0, MLKEM_K,
+         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
+/*************************************************
+ * Name:        mlk_polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const mlk_polyvec a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ *
+ * Specification: Implements ByteEncode_12 @[FIPS203, Algorithm 5].
+ *                Extended to vectors as per
+ *                @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                and @[FIPS203, 2.4.6, Matrices and Vectors]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(memory_slice(r, MLKEM_POLYVECBYTES))
+);
+
+#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
+/*************************************************
+ * Name:        mlk_polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of mlk_polyvec_tobytes
+ *
+ * Arguments:   - mlk_polyvec *r: pointer to output vector of polynomials.
+ *                 Output will have coefficients normalized in [0..4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ *
+ * Specification: Implements ByteDecode_12 @[FIPS203, Algorithm 6].
+ *                Extended to vectors as per
+ *                @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
+ *                and @[FIPS203, 2.4.6, Matrices and Vectors]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+  ensures(forall(k0, 0, MLKEM_K,
+        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+);
+
+#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
+/*************************************************
+ * Name:        mlk_polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by MLK_NTT_BOUND in absolute value.
+ *
+ * Arguments:   - mlk_polyvec r: pointer to in/output vector of polynomials
+ *
+ * Specification:
+ * - Implements @[FIPS203, Algorithm 9, NTT]
+ * - Extended to vectors as per @[FIPS203, 2.4.6, Matrices and Vectors]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_ntt(mlk_polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  requires(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+);
+
+#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        mlk_polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by MLK_INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - mlk_polyvec r: pointer to in/output vector of polynomials
+ *
+ * Specification:
+ * - Implements @[FIPS203, Algorithm 10, NTT^{-1}]
+ * - Extended to vectors as per @[FIPS203, 2.4.6, Matrices and Vectors]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+);
+
+#define mlk_polyvec_basemul_acc_montgomery_cached \
+  MLK_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        mlk_polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - Every coefficient of a is assumed to be in [0..4095]
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const mlk_polyvec a: pointer to first input polynomial vector
+ *              - const mlk_polyvec b: pointer to second input polynomial
+ *                vector
+ *              - const mlk_polyvec_mulcache b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via mlk_polyvec_mulcache_compute().
+ *
+ * Specification: Implements
+ *                - @[FIPS203, Section 2.4.7, Eq (2.14)]
+ *                - @[FIPS203, Algorithm 11, MultiplyNTTs]
+ *                - @[FIPS203, Algorithm 12, BaseCaseMultiply]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_basemul_acc_montgomery_cached(
+    mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+    const mlk_polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+);
+
+#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
+/************************************************************
+ * Name: mlk_polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-1 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-1 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta -- is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ *
+ * Specification:
+ * - Caches `b_1 * \gamma` in @[FIPS203, Algorithm 12, BaseCaseMultiply, L1]
+ *
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLK_INTERNAL_API
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+  assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
+);
+
+#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
+/*************************************************
+ * Name:        mlk_polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in poly.c
+ *
+ * Arguments:   - mlk_polyvec r: pointer to input/output polynomial
+ *
+ * Specification: Normalizes on unsigned canonical representatives
+ *                ahead of calling @[FIPS203, Compress_d, Eq (4.7)].
+ *                This is not made explicit in FIPS 203.
+ *
+ **************************************************/
+/*
+ * NOTE: The semantics of mlk_polyvec_reduce() is different in
+ *       the reference implementation, which requires
+ *       signed canonical output data. Unsigned canonical
+ *       outputs are better suited to the only remaining
+ *       use of mlk_poly_reduce() in the context of (de)serialization.
+ */
+MLK_INTERNAL_API
+void mlk_polyvec_reduce(mlk_polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
+/*************************************************
+ * Name:        mlk_polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - mlk_polyvec r: pointer to input-output vector of polynomials to
+ *              be added to
+ *            - const mlk_polyvec b: pointer to second input vector of
+ *              polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ *
+ * Specification:
+ * - @[FIPS203, 2.4.5, Arithmetic With Polynomials and NTT Representations]
+ * - Used in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L19]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+  requires(forall(j0, 0, MLKEM_K,
+          forall(k0, 0, MLKEM_N,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(j1, 0, MLKEM_K,
+          forall(k1, 0, MLKEM_N,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+);
+
+#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
+/*************************************************
+ * Name:        mlk_polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ *
+ * Specification: Internal normalization required in `mlk_indcpa_keypair_derand`
+ *                as part of matrix-vector multiplication
+ *                @[FIPS203, Algorithm 13, K-PKE.KeyGen, L18].
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_polyvec_tomont(mlk_polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  assigns(memory_slice(r, sizeof(mlk_polyvec)))
+  ensures(forall(j, 0, MLKEM_K,
+    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+);
+
+#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        mlk_poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ *              and nonces, with output polynomials close to centered binomial
+ *              distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ *                polynomial pointer may be NULL.
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ *
+ * Specification:
+ * Implements 4x `SamplePolyCBD_{eta1} (PRF_{eta1} (sigma, N))`:
+ * - @[FIPS203, Algorithm 8, SamplePolyCBD_eta]
+ * - @[FIPS203, Eq (4.3), PRF_eta]
+ * - `SamplePolyCBD_{eta1} (PRF_{eta1} (sigma, N))` appears in
+ *   @[FIPS203, Algorithm 13, K-PKE.KeyGen, L{9, 13}]
+ *   @[FIPS203, Algorithm 14, K-PKE.Encrypt, L10]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
+                               mlk_poly *r3, const uint8_t seed[MLKEM_SYMBYTES],
+                               uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                               uint8_t nonce3)
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires(memory_no_alias(r0, sizeof(mlk_poly)))
+  requires(memory_no_alias(r1, sizeof(mlk_poly)))
+  requires(memory_no_alias(r2, sizeof(mlk_poly)))
+  requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
+  assigns(memory_slice(r0, sizeof(mlk_poly)))
+  assigns(memory_slice(r1, sizeof(mlk_poly)))
+  assigns(memory_slice(r2, sizeof(mlk_poly)))
+  assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+  ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+  ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+  ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require mlk_poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, mlk_poly_getnoise_eta1122_4x is used instead.
+ */
+#define mlk_poly_getnoise_eta2_4x mlk_poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define mlk_poly_getnoise_eta2 MLK_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        mlk_poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ *
+ * Specification:
+ * Implements `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))`:
+ * - @[FIPS203, Algorithm 8, SamplePolyCBD_eta]
+ * - @[FIPS203, Eq (4.3), PRF_eta]
+ * - `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))` appears in
+ *   @[FIPS203, Algorithm 14, K-PKE.Encrypt, L14]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                            uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define mlk_poly_getnoise_eta1122_4x MLK_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        mlk_poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ *
+ * Specification:
+ * Implements two instances each of
+ * `SamplePolyCBD_{eta1} (PRF_{eta1} (sigma, N))` and
+ * `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))`:
+ * - @[FIPS203, Algorithm 8, SamplePolyCBD_eta]
+ * - @[FIPS203, Eq (4.3), PRF_eta]
+ * - `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))` appears in
+ *   @[FIPS203, Algorithm 14, K-PKE.Encrypt, L14]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
+                                  mlk_poly *r3,
+                                  const uint8_t seed[MLKEM_SYMBYTES],
+                                  uint8_t nonce0, uint8_t nonce1,
+                                  uint8_t nonce2, uint8_t nonce3)
+__contract__(
+  requires(memory_no_alias(r0, sizeof(mlk_poly)))
+  requires(memory_no_alias(r1, sizeof(mlk_poly)))
+  requires(memory_no_alias(r2, sizeof(mlk_poly)))
+  requires(memory_no_alias(r3, sizeof(mlk_poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(memory_slice(r0, sizeof(mlk_poly)))
+  assigns(memory_slice(r1, sizeof(mlk_poly)))
+  assigns(memory_slice(r2, sizeof(mlk_poly)))
+  assigns(memory_slice(r3, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+       && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+       && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+       && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 */
+
+#endif /* !MLK_POLY_K_H */
diff --git a/mlkem_native/src/randombytes.h b/mlkem_native/src/randombytes.h
new file mode 100644
index 0000000..3e841d2
--- /dev/null
+++ b/mlkem_native/src/randombytes.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_RANDOMBYTES_H
+#define MLK_RANDOMBYTES_H
+
+
+#include "cbmc.h"
+#include "common.h"
+
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
+#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
+/*************************************************
+ * Name:        randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ *              mlkem-native does not provide an implementation of this
+ *              function. It must be provided by the consumer.
+ *
+ *              To use a custom random byte source with a different name
+ *              or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ *              mlk_randombytes directly.
+ *
+ * Arguments:   - uint8_t *out: pointer to output buffer
+ *              - size_t outlen: number of random bytes to write
+ *
+ * Returns:     0 on success, non-zero on failure.
+ *              On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name:        mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes(): passes out and outlen
+ *              through unchanged and returns the result as-is.
+ *              Fill a buffer with cryptographically secure random bytes.
+ *
+ *              This function can be replaced by setting
+ *              MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ *              directly.
+ *
+ * Arguments:   - uint8_t *out: pointer to output buffer of outlen bytes
+ *              - size_t outlen: number of random bytes to write
+ *
+ * Returns:     The value returned by randombytes():
+ *              0 on success, non-zero on failure.
+ *              On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
+__contract__(
+  requires(memory_no_alias(out, outlen))
+  assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
+#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
+#endif /* !MLK_RANDOMBYTES_H */
diff --git a/mlkem_native/src/sampling.c b/mlkem_native/src/sampling.c
new file mode 100644
index 0000000..945d12e
--- /dev/null
+++ b/mlkem_native/src/sampling.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ */
+
+#include "common.h"
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "debug.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Reference: `rej_uniform()` in the reference implementation @[REF].
+ *            - Our signature differs from the reference implementation
+ *              in that it adds the offset and always expects the base of the
+ *              target buffer. This avoids shifting the buffer base in the
+ *              caller, which appears tricky to reason about. */
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+                                               unsigned offset,
+                                               const uint8_t *buf,
+                                               unsigned buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
+{
+  unsigned ctr, pos; /* ctr: entries of r filled; pos: bytes of buf read */
+  int16_t val0, val1;
+
+  mlk_assert_bound(r, offset, 0, MLKEM_Q);
+  /* Each 3-byte group of buf yields two 12-bit candidates (val0, val1). */
+  ctr = offset; /* append after the entries that are already present */
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
+  {
+    val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF; /* bits 0..11 */
+    val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF; /* bits 12..23 */
+    pos += 3;
+
+    if (val0 < MLKEM_Q) /* candidates >= MLKEM_Q are rejected */
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q) /* val0 may have filled the last slot */
+    {
+      r[ctr++] = val1;
+    }
+  }
+
+  mlk_assert_bound(r, ctr, 0, MLKEM_Q);
+  return ctr; /* new fill count; offset <= ctr <= target */
+}
+
+/*************************************************
+ * Name:        mlk_rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned target:     requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned offset:     number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned buflen:     length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, nothing is
+ * said about how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/* Reference: `rej_uniform()` in the reference implementation @[REF].
+ *            - Our signature differs from the reference implementation
+ *              in that it adds the offset and always expects the base of the
+ *              target buffer. This avoids shifting the buffer base in the
+ *              caller, which appears tricky to reason about.
+ *            - Optional fallback to native implementation. */
+static unsigned mlk_rej_uniform(int16_t *r, unsigned target, unsigned offset,
+                                const uint8_t *buf, unsigned buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
+  if (offset == 0) /* the native routine is called without an offset */
+  {
+    int ret;
+    ret = mlk_rej_uniform_native(r, target, buf, buflen);
+    if (ret != MLK_NATIVE_FUNC_FALLBACK) /* else: fall through to the C code */
+    {
+      unsigned res = (unsigned)ret; /* native result is used as the new count */
+      mlk_assert_bound(r, res, 0, MLKEM_Q);
+      return res;
+    }
+  }
+#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
+
+  return mlk_rej_uniform_c(r, target, offset, buf, buflen);
+}
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS                                       \
+  ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+   MLK_XOF_RATE)
+#endif
+
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
+/* Reference: Does not exist in the reference implementation @[REF].
+ *            - x4-batched version of `rej_uniform()` from the
+ *              reference implementation, leveraging x4-batched Keccak-f1600. */
+MLK_INTERNAL_API
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+                             mlk_poly *vec3,
+                             uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
+{
+  /* One temporary buffer per polynomial for XOF output before sampling */
+  MLK_ALIGN uint8_t
+      buf[4][MLK_ALIGN_UP(MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE)];
+
+  /* ctr[i]: number of coefficients already sampled into the i-th polynomial */
+  unsigned ctr[4];
+  mlk_xof_x4_ctx statex;
+  unsigned buflen;
+
+  mlk_xof_x4_init(&statex);
+  mlk_xof_x4_absorb(&statex, seed, MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze MLKEM_GEN_MATRIX_NBLOCKS blocks, a heuristic number
+   * that should generate the matrix entries with high probability.
+   */
+  mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
+  ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+  ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+  ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+  ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
+
+  /*
+   * While any polynomial is still incomplete, squeeze one more block at a
+   * time for all four and continue sampling at each current count.
+   */
+  buflen = MLK_XOF_RATE; /* only one block is squeezed per round below */
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex,
+            memory_slice(vec0, sizeof(mlk_poly)),
+            memory_slice(vec1, sizeof(mlk_poly)),
+            memory_slice(vec2, sizeof(mlk_poly)),
+            memory_slice(vec3, sizeof(mlk_poly)),
+            object_whole(buf))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    mlk_xof_x4_squeezeblocks(buf, 1, &statex);
+    ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+    ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+    ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+    ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+  }
+
+  mlk_xof_x4_release(&statex);
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(buf, sizeof(buf));
+}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+MLK_INTERNAL_API
+void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  mlk_xof_ctx state;
+  MLK_ALIGN uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE];
+  unsigned ctr, buflen; /* ctr: coefficients of entry sampled so far */
+
+  mlk_xof_init(&state);
+  mlk_xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze and sample MLKEM_GEN_MATRIX_NBLOCKS blocks, a
+   * heuristic number that should generate the matrix entry with high
+   * probability. */
+  mlk_xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
+  ctr = mlk_rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze and sample one more block at a time until entry is complete */
+  buflen = MLK_XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(mlk_poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    mlk_xof_squeezeblocks(buf, 1, &state);
+    ctr = mlk_rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  mlk_xof_release(&state);
+
+  /* Specification: Partially implements
+   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
+  mlk_zeroize(buf, sizeof(buf));
+}
+
+/*************************************************
+ * Name:        mlk_load32_littleendian
+ *
+ * Description: Load 4 bytes into a 32-bit integer in little-endian order,
+ *              i.e. x[0] becomes the least significant byte.
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array (x[0..3] read)
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ *
+ **************************************************/
+
+/* Reference: `load32_littleendian()` in the reference implementation @[REF]. */
+static uint32_t mlk_load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0]; /* least significant byte */
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24; /* cast first: shift is done in uint32_t */
+  return r;
+}
+
+/* Reference: `cbd2()` in the reference implementation @[REF]. */
+MLK_INTERNAL_API
+void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++) /* 4 input bytes -> 8 coefficients */
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = mlk_load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555; /* even-indexed bits of t */
+    d += (t >> 1) & 0x55555555;  /* + odd bits: d holds 16 two-bit sums */
+
+    for (j = 0; j < 8; j++) /* two adjacent 2-bit sums per coefficient */
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3; /* sum of 2 bits: 0..2 */
+      const int16_t b = (d >> (4 * j + 2)) & 0x3; /* sum of 2 bits: 0..2 */
+      r->coeffs[8 * i + j] = (int16_t)(a - b);    /* value in [-2, 2] */
+    }
+  }
+}
+
+#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        mlk_load24_littleendian
+ *
+ * Description: Load 3 bytes into a 32-bit integer
+ *              in little-endian order (x[0] is the least significant byte).
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array (x[0..2] read)
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ *
+ **************************************************/
+
+/* Reference: `load24_littleendian()` in the reference implementation @[REF]. */
+static uint32_t mlk_load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0]; /* least significant byte */
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16; /* bits 24..31 of r are never set */
+  return r;
+}
+
+/* Reference: `cbd3()` in the reference implementation @[REF]. */
+MLK_INTERNAL_API
+void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++) /* 3 input bytes -> 4 coefficients */
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = mlk_load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249; /* bit 0 of each 3-bit group of t */
+    d += (t >> 1) & 0x00249249;  /* + bit 1 of each group */
+    d += (t >> 2) & 0x00249249;  /* + bit 2: d holds 8 three-bit sums */
+
+    for (j = 0; j < 4; j++) /* two adjacent 3-bit sums per coefficient */
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7; /* sum of 3 bits: 0..3 */
+      const int16_t b = (d >> (6 * j + 3)) & 0x7; /* sum of 3 bits: 0..3 */
+      r->coeffs[4 * i + j] = (int16_t)(a - b);    /* value in [-3, 3] */
+    }
+  }
+}
+#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(sampling)
+
+#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef MLKEM_GEN_MATRIX_NBLOCKS
diff --git a/mlkem_native/src/sampling.h b/mlkem_native/src/sampling.h
new file mode 100644
index 0000000..24c26b3
--- /dev/null
+++ b/mlkem_native/src/sampling.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#ifndef MLK_SAMPLING_H
+#define MLK_SAMPLING_H
+
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define mlk_poly_cbd2 MLK_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        mlk_poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ *
+ * Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_2]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_ETA1 == 3
+#define mlk_poly_cbd3 MLK_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        mlk_poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - mlk_poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ *
+ * Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_3]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
+#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
+/*************************************************
+ * Name:        mlk_poly_rej_uniform_x4
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ *                Pointers to 4 polynomials to be sampled.
+ *              - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
+ *                Pointer to consecutive array of seed buffers of size
+ *                MLKEM_SYMBYTES + 2 each, plus padding for alignment.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 7, SampleNTT]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+                             mlk_poly *vec3,
+                             uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
+__contract__(
+  requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+  requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+  requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+  requires(memory_no_alias(vec3, sizeof(mlk_poly)))
+  requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
+  assigns(memory_slice(vec0, sizeof(mlk_poly)))
+  assigns(memory_slice(vec1, sizeof(mlk_poly)))
+  assigns(memory_slice(vec2, sizeof(mlk_poly)))
+  assigns(memory_slice(vec3, sizeof(mlk_poly)))
+  ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        mlk_poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - mlk_poly *entry:     Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ * Specification: Implements @[FIPS203, Algorithm 7, SampleNTT]
+ *
+ **************************************************/
+MLK_INTERNAL_API
+void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(mlk_poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(mlk_poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* !MLK_SAMPLING_H */
diff --git a/mlkem_native/src/symmetric.h b/mlkem_native/src/symmetric.h
new file mode 100644
index 0000000..68d7e1a
--- /dev/null
+++ b/mlkem_native/src/symmetric.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ */
+
+#ifndef MLK_SYMMETRIC_H
+#define MLK_SYMMETRIC_H
+
+
+#include "cbmc.h"
+#include "common.h"
+#include MLK_FIPS202_HEADER_FILE
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
+#include MLK_FIPS202X4_HEADER_FILE
+#endif
+
+/* Macros denoting FIPS 203 specific Hash functions */
+
+/* Hash function H, @[FIPS203, Section 4.1, Eq (4.4)] */
+#define mlk_hash_h(OUT, IN, INBYTES) mlk_sha3_256(OUT, IN, INBYTES)
+
+/* Hash function G, @[FIPS203, Section 4.1, Eq (4.5)] */
+#define mlk_hash_g(OUT, IN, INBYTES) mlk_sha3_512(OUT, IN, INBYTES)
+
+/* Hash function J, @[FIPS203, Section 4.1, Eq (4.4)] */
+#define mlk_hash_j(OUT, IN, INBYTES) \
+  mlk_shake256(OUT, MLKEM_SYMBYTES, IN, INBYTES)
+
+/* PRF function, @[FIPS203, Section 4.1, Eq (4.3)]
+ * Referring to (eq 4.3), `IN` is assumed to contain `s || b`. */
+#define mlk_prf_eta(ETA, OUT, IN) \
+  mlk_shake256(OUT, (ETA) * MLKEM_N / 4, IN, MLKEM_SYMBYTES + 1)
+#define mlk_prf_eta1(OUT, IN) mlk_prf_eta(MLKEM_ETA1, OUT, IN)
+#define mlk_prf_eta2(OUT, IN) mlk_prf_eta(MLKEM_ETA2, OUT, IN)
+#define mlk_prf_eta1_x4(OUT, IN)                                        \
+  mlk_shake256x4((OUT)[0], (OUT)[1], (OUT)[2], (OUT)[3],                \
+                 (MLKEM_ETA1 * MLKEM_N / 4), (IN)[0], (IN)[1], (IN)[2], \
+                 (IN)[3], MLKEM_SYMBYTES + 1)
+
+/* XOF function, FIPS 203 4.1 */
+#define mlk_xof_ctx mlk_shake128ctx
+#define mlk_xof_x4_ctx mlk_shake128x4ctx
+#define mlk_xof_init(CTX) mlk_shake128_init((CTX))
+#define mlk_xof_absorb(CTX, IN, INBYTES) \
+  mlk_shake128_absorb_once((CTX), (IN), (INBYTES))
+#define mlk_xof_squeezeblocks(BUF, NBLOCKS, CTX) \
+  mlk_shake128_squeezeblocks((BUF), (NBLOCKS), (CTX))
+#define mlk_xof_release(CTX) mlk_shake128_release((CTX))
+
+#define mlk_xof_x4_init(CTX) mlk_shake128x4_init((CTX))
+#define mlk_xof_x4_absorb(CTX, IN, INBYTES)                             \
+  mlk_shake128x4_absorb_once((CTX), (IN)[0], (IN)[1], (IN)[2], (IN)[3], \
+                             (INBYTES))
+#define mlk_xof_x4_squeezeblocks(BUF, NBLOCKS, CTX)                    \
+  mlk_shake128x4_squeezeblocks((BUF)[0], (BUF)[1], (BUF)[2], (BUF)[3], \
+                               (NBLOCKS), (CTX))
+#define mlk_xof_x4_release(CTX) mlk_shake128x4_release((CTX))
+
+#define MLK_XOF_RATE SHAKE128_RATE
+
+#endif /* !MLK_SYMMETRIC_H */
diff --git a/mlkem_native/src/sys.h b/mlkem_native/src/sys.h
new file mode 100644
index 0000000..0ab8947
--- /dev/null
+++ b/mlkem_native/src/sys.h
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLK_SYS_H
+#define MLK_SYS_H
+
+#if !defined(MLK_CONFIG_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+#define MLK_HAVE_INLINE_ASM
+#endif
+
+/* Try to find endianness, if not forced through CFLAGS already */
+#if !defined(MLK_SYS_LITTLE_ENDIAN) && !defined(MLK_SYS_BIG_ENDIAN)
+#if defined(__BYTE_ORDER__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define MLK_SYS_LITTLE_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MLK_SYS_BIG_ENDIAN
+#else
+#error "__BYTE_ORDER__ defined, but don't recognize value."
+#endif
+#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+                          defined(_M_IX86) || defined(_M_ARM64))
+#define MLK_SYS_LITTLE_ENDIAN
+#endif
+
+#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
+
+/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
+ * MSVC. */
+#if defined(__AARCH64EL__) || defined(_M_ARM64)
+#define MLK_SYS_AARCH64
+#endif
+
+/* Check if we're running on an AArch64 big endian system. */
+#if defined(__AARCH64EB__)
+#define MLK_SYS_AARCH64_EB
+#endif
+
+/* Check if we're running on an Armv8.1-M system with MVE */
+#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE)
+#define MLK_SYS_ARMV81M_MVE
+#endif
+
+#if defined(__x86_64__)
+#define MLK_SYS_X86_64
+#if defined(__AVX2__)
+#define MLK_SYS_X86_64_AVX2
+#endif
+#endif /* __x86_64__ */
+
+#if defined(MLK_SYS_LITTLE_ENDIAN) && defined(__powerpc64__)
+#define MLK_SYS_PPC64LE
+#endif
+
+#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 64
+#define MLK_SYS_RISCV64
+#endif
+
+#if defined(MLK_SYS_RISCV64) && defined(__riscv_vector) && \
+    defined(__riscv_v_intrinsic)
+#define MLK_SYS_RISCV64_RVV
+#endif
+
+#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
+#define MLK_SYS_RISCV32
+#endif
+
+#if defined(_WIN32)
+#define MLK_SYS_WINDOWS
+#endif
+
+#if defined(__linux__)
+#define MLK_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLK_SYS_APPLE
+#endif
+
+#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
+#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
+#endif
+
+#if defined(MLK_FORCE_AARCH64_EB) && !defined(MLK_SYS_AARCH64_EB)
+#error \
+    "MLK_FORCE_AARCH64_EB is set, but we don't seem to be on an AArch64 system."
+#endif
+
+#if defined(MLK_FORCE_X86_64) && !defined(MLK_SYS_X86_64)
+#error "MLK_FORCE_X86_64 is set, but we don't seem to be on an X86_64 system."
+#endif
+
+#if defined(MLK_FORCE_PPC64LE) && !defined(MLK_SYS_PPC64LE)
+#error "MLK_FORCE_PPC64LE is set, but we don't seem to be on a PPC64LE system."
+#endif
+
+#if defined(MLK_FORCE_RISCV64) && !defined(MLK_SYS_RISCV64)
+#error "MLK_FORCE_RISCV64 is set, but we don't seem to be on a RISCV64 system."
+#endif
+
+#if defined(MLK_FORCE_RISCV32) && !defined(MLK_SYS_RISCV32)
+#error "MLK_FORCE_RISCV32 is set, but we don't seem to be on a RISCV32 system."
+#endif
+
+/*
+ * MLK_INLINE: Hint for inlining.
+ * - MSVC: __inline
+ * - C99+: inline
+ * - GCC/Clang C90: __attribute__((unused)) to silence warnings
+ * - Other C90: empty
+ */
+#if !defined(MLK_INLINE)
+#if defined(_MSC_VER)
+#define MLK_INLINE __inline
+#elif defined(inline) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#define MLK_INLINE inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define MLK_INLINE __attribute__((unused))
+#else
+#define MLK_INLINE
+#endif
+#endif /* !MLK_INLINE */
+
+/*
+ * MLK_ALWAYS_INLINE: Force inlining.
+ * - MSVC: __forceinline
+ * - GCC/Clang C99+: MLK_INLINE __attribute__((always_inline))
+ * - Other: MLK_INLINE (no forced inlining)
+ */
+#if !defined(MLK_ALWAYS_INLINE)
+#if defined(_MSC_VER)
+#define MLK_ALWAYS_INLINE __forceinline
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+    (defined(inline) ||                            \
+     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L))
+#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
+#else
+#define MLK_ALWAYS_INLINE MLK_INLINE
+#endif
+#endif /* !MLK_ALWAYS_INLINE */
+
+#ifndef MLK_STATIC_TESTABLE
+#define MLK_STATIC_TESTABLE static
+#endif
+
+/*
+ * C90 does not have the restrict compiler directive yet.
+ * We don't use it in C90 builds.
+ */
+#if !defined(restrict)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define MLK_RESTRICT restrict
+#else
+#define MLK_RESTRICT
+#endif
+
+#else /* !restrict */
+
+#define MLK_RESTRICT restrict
+#endif /* restrict */
+
+#define MLK_DEFAULT_ALIGN 32
+#define MLK_ALIGN_UP(N) \
+  ((((N) + (MLK_DEFAULT_ALIGN - 1)) / MLK_DEFAULT_ALIGN) * MLK_DEFAULT_ALIGN)
+#if defined(__GNUC__)
+#define MLK_ALIGN __attribute__((aligned(MLK_DEFAULT_ALIGN)))
+#elif defined(_MSC_VER)
+#define MLK_ALIGN __declspec(align(MLK_DEFAULT_ALIGN))
+#else
+#define MLK_ALIGN /* No known support for alignment constraints */
+#endif
+
+
+/* New X86_64 CPUs support Control-flow protection using the CET instructions.
+ * When enabled (through -fcf-protection=), all compilation units (including
+ * empty ones) need to support CET for this to work.
+ * For assembly, this means that source files need to signal support for
+ * CET by setting the appropriate note.gnu.property section.
+ * This can be achieved by including the <cet.h> header in all assembly files.
+ * This file also provides the _CET_ENDBR macro which needs to be placed at
+ * every potential target of an indirect branch.
+ * If CET is enabled _CET_ENDBR maps to the endbr64 instruction, otherwise
+ * it is empty.
+ * In case the compiler does not support CET (e.g., <gcc8, <clang11),
+ * the __CET__ macro is not set and we default to nothing.
+ * Note that we only issue _CET_ENDBR instructions through the MLK_ASM_FN_SYMBOL
+ * macro as the global symbols are the only possible targets of indirect
+ * branches in our code.
+ */
+#if defined(MLK_SYS_X86_64)
+#if defined(__CET__)
+#include <cet.h>
+#define MLK_CET_ENDBR _CET_ENDBR
+#else
+#define MLK_CET_ENDBR
+#endif
+#endif /* MLK_SYS_X86_64 */
+
+#if defined(MLK_CONFIG_CT_TESTING_ENABLED) && !defined(__ASSEMBLER__)
+#include <valgrind/memcheck.h>
+#define MLK_CT_TESTING_SECRET(ptr, len) \
+  VALGRIND_MAKE_MEM_UNDEFINED((ptr), (len))
+#define MLK_CT_TESTING_DECLASSIFY(ptr, len) \
+  VALGRIND_MAKE_MEM_DEFINED((ptr), (len))
+#else /* MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__ */
+#define MLK_CT_TESTING_SECRET(ptr, len) \
+  do                                    \
+  {                                     \
+  } while (0)
+#define MLK_CT_TESTING_DECLASSIFY(ptr, len) \
+  do                                        \
+  {                                         \
+  } while (0)
+#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
+
+#if defined(__GNUC__) || defined(__clang__)
+#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
+#else
+#define MLK_MUST_CHECK_RETURN_VALUE
+#endif
+
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+  /* x86_64 */
+  MLK_SYS_CAP_AVX2,
+  /* AArch64 */
+  MLK_SYS_CAP_SHA3
+} mlk_sys_cap;
+
+#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+__contract__(
+  ensures(return_value == 0 || return_value == 1)
+)
+{
+  /* Default capability check: always reports the capability as present (1).
+   * We rely on compile-time feature detection/specification: if a feature
+   * is enabled at compile-time, we assume it is supported by the host that
+   * the resulting library/binary will run on. If this assumption is not
+   * true, you MUST overwrite this function; see the documentation of
+   * MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in mlkem_native_config.h. */
+  (void)cap;
+  return 1;
+}
+#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
+#endif /* !MLK_SYS_H */
diff --git a/mlkem_native/src/verify.c b/mlkem_native/src/verify.c
new file mode 100644
index 0000000..db760f7
--- /dev/null
+++ b/mlkem_native/src/verify.c
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include "verify.h"
+
+#if !defined(MLK_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+/*
+ * Zero-valued mask that the value barriers in verify.h XOR into their
+ * input. Being volatile, it blocks the compiler's range analysis and
+ * reduces the risk of compiler-introduced branches. Must remain 0.
+ */
+volatile uint64_t mlk_ct_opt_blocker_u64 = 0;
+
+#else /* !MLK_USE_ASM_VALUE_BARRIER && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLK_EMPTY_CU(verify)
+
+#endif /* !(!MLK_USE_ASM_VALUE_BARRIER && !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mlkem_native/src/verify.h b/mlkem_native/src/verify.h
new file mode 100644
index 0000000..a9bdeaa
--- /dev/null
+++ b/mlkem_native/src/verify.h
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS203]
+ *   FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ *   CRYSTALS-Kyber C reference implementation
+ *   Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/kyber/tree/main/ref
+ *
+ * - [libmceliece]
+ *   libmceliece implementation of Classic McEliece
+ *   Bernstein, Chou
+ *   https://lib.mceliece.org/
+ *
+ * - [optblocker]
+ *   PQC forum post on opt-blockers using volatile globals
+ *   Daniel J. Bernstein
+ *   https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/hqbtIGFKIpU/m/H14H0wOlBgAJ
+ */
+
+#ifndef MLK_VERIFY_H
+#define MLK_VERIFY_H
+
+
+#include "cbmc.h"
+#include "common.h"
+
+/* Constant-time comparisons and conditional operations
+
+   We reduce the risk for compilation into variable-time code
+   through the use of 'value barriers'.
+
+   Functionally, a value barrier is a no-op. To the compiler, however,
+   it constitutes an arbitrary modification of its input, and therefore
+   hinders value propagation and range analysis.
+
+   We consider two approaches to implement a value barrier:
+   - An empty inline asm block which marks the target value as clobbered.
+   - XOR'ing with the value of a volatile global that's set to 0;
+     see @[optblocker] for a discussion of this idea, and
+     @[libmceliece, inttypes/crypto_intN.h] for an implementation.
+
+   The first approach is cheap because it only prevents the compiler
+   from reasoning about the value of the variable past the barrier,
+   but does not directly generate additional instructions.
+
+   The second approach generates redundant loads and XOR operations
+   and therefore comes at a higher runtime cost. However, it appears
+   more robust towards optimization, as compilers should never drop
+   a volatile load.
+
+   We use the empty-ASM value barrier for GCC and clang, and fall
+   back to the global volatile barrier otherwise.
+
+   The global value barrier can be forced by setting
+   MLK_CONFIG_NO_ASM_VALUE_BARRIER.
+
+*/
+
+#if defined(MLK_HAVE_INLINE_ASM) && !defined(MLK_CONFIG_NO_ASM_VALUE_BARRIER)
+#define MLK_USE_ASM_VALUE_BARRIER
+#endif
+
+#if !defined(MLK_USE_ASM_VALUE_BARRIER)
+
+/*
+ * Declaration of global volatile that the global value barrier
+ * is loading from and masking with.
+ */
+#define mlk_ct_opt_blocker_u64 MLK_NAMESPACE(ct_opt_blocker_u64)
+extern volatile uint64_t mlk_ct_opt_blocker_u64;
+
+/* Helper functions for obtaining global masks of various sizes */
+
+/* This contract is not proved but treated as an axiom.
+ *
+ * Its validity relies on the assumption that the global opt-blocker
+ * constant mlk_ct_opt_blocker_u64 is not modified.
+ */
+static MLK_INLINE uint64_t mlk_ct_get_optblocker_u64(void)
+__contract__(ensures(return_value == 0)) { return mlk_ct_opt_blocker_u64; }
+
+static MLK_INLINE uint8_t mlk_ct_get_optblocker_u8(void)
+__contract__(ensures(return_value == 0)) { return (uint8_t)mlk_ct_get_optblocker_u64(); }
+
+static MLK_INLINE uint32_t mlk_ct_get_optblocker_u32(void)
+__contract__(ensures(return_value == 0)) { return (uint32_t)mlk_ct_get_optblocker_u64(); }
+
+static MLK_INLINE int32_t mlk_ct_get_optblocker_i32(void)
+__contract__(ensures(return_value == 0)) { return (int32_t)mlk_ct_get_optblocker_u64(); }
+
+/* Opt-blocker based implementation of value barriers */
+static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u32()); }
+
+static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_i32()); }
+
+static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8()); }
+
+#else /* !MLK_USE_ASM_VALUE_BARRIER */
+
+static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__ volatile("" : "+r"(b));
+  return b;
+}
+
+static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__ volatile("" : "+r"(b));
+  return b;
+}
+
+static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__ volatile("" : "+r"(b));
+  return b;
+}
+
+#endif /* MLK_USE_ASM_VALUE_BARRIER */
+
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+/*************************************************
+ * Name:        mlk_cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:     For uint16_t x, the unique y in int16_t
+ *              so that x == y mod 2^16.
+ *
+ *              Concretely:
+ *              - x <  32768: returns x
+ *              - x >= 32768: returns x - 65536
+ *
+ **************************************************/
+static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
+{
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        mlk_cast_int32_to_uint16
+ *
+ * Description: Cast int32 value to uint16 as per C standard.
+ *
+ * Returns:     For int32_t x, the unique y in uint16_t
+ *              so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int32_to_uint16(int32_t x)
+{
+  return (uint16_t)(x & (int32_t)UINT16_MAX);
+}
+
+/*************************************************
+ * Name:        mlk_cast_int16_to_uint16
+ *
+ * Description: Cast int16 value to uint16 as per C standard.
+ *              NOTE(review): x is declared int32_t, not int16_t -- confirm.
+ * Returns:     For int16_t x, the unique y in uint16_t
+ *              so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int16_to_uint16(int32_t x)
+{
+  return mlk_cast_int32_to_uint16((int32_t)x);
+}
+
+/*************************************************
+ * Name:        mlk_ct_cmask_neg_i16
+ *
+ * Description: Return 0 if input is non-negative, and 0xFFFF otherwise.
+ *
+ * Arguments:   int16_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in polynomial compression function in the
+ *            reference implementation @[REF].
+ *            - Used as part of signed->unsigned conversion for modular
+ *              representatives to detect whether the input is negative.
+ *              This happens in `mlk_poly_reduce()` here, and as part of
+ *              polynomial compression functions in the reference
+ *              implementation. See `mlk_poly_reduce()`.
+ *            - We use value barriers to reduce the risk of
+ *              compiler-introduced branches. */
+static MLK_INLINE uint16_t mlk_ct_cmask_neg_i16(int16_t x)
+__contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
+{
+  int32_t tmp = mlk_value_barrier_i32((int32_t)x);
+  tmp >>= 16; /* sign bits only: 0 or -1, assuming arithmetic right shift */
+  return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name:        mlk_ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and 0xFFFF otherwise.
+ *
+ * Arguments:   uint16_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
+ *            - Use value barrier and shift instead of `b = -b` to
+ *              convert condition into mask. */
+static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+  int32_t tmp = mlk_value_barrier_i32(-((int32_t)x));
+  tmp >>= 16;
+  return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name:        mlk_ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and 0xFF otherwise.
+ *
+ * Arguments:   uint8_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `verify()` and `cmov()` in the
+ *            reference implementation @[REF].
+ *            - We include a value barrier not present in the
+ *              reference implementation, to prevent the compiler
+ *              from realizing that this function returns a mask. */
+static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+  uint16_t mask = mlk_ct_cmask_nonzero_u16((uint16_t)x);
+  return (uint8_t)(mask & 0xFF);
+}
+
+/*************************************************
+ * Name:        mlk_ct_sel_int16
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   int16_t a:       First alternative
+ *              int16_t b:       Second alternative
+ *              uint16_t cond:   Condition variable.
+ *
+ * Specification:
+ * - With `a = MLKEM_Q_HALF` and `b=0`, this essentially
+ *   implements `Decompress_1` @[FIPS203, Eq (4.8)] in `mlk_poly_frommsg()`.
+ * - With `a = x + MLKEM_Q`, `b = x`, and `cond` indicating whether `x`
+ *   is negative, implements signed->unsigned conversion of modular
+ *   representatives. Questions of representation are not considered
+ *   in the specification @[FIPS203, Section 2.4.1, "The pseudocode is
+ *   agnostic regarding how an integer modulo 𝑚 is represented in
+ *   actual implementations"].
+ *
+ **************************************************/
+
+/* Reference: Embedded in polynomial compression function in the
+ *            reference implementation @[REF].
+ *            - Used as part of signed->unsigned conversion for modular
+ *              representatives. This happens in `mlk_poly_reduce()` here,
+ *              and as part of polynomial compression functions in @[REF].
+ *              See `mlk_poly_reduce()`.
+ *            - Barrier to reduce the risk of compiler-introduced branches.
+ *            For `a = MLKEM_Q_HALF` and `b=0`, also embedded in
+ *            `poly_frommsg()` from the reference implementation, which uses
+ *            `cmov_int16()` instead. */
+static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  uint16_t au = mlk_cast_int16_to_uint16(a);
+  uint16_t bu = mlk_cast_int16_to_uint16(b);
+  uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
+  return mlk_cast_uint16_to_int16(res);
+}
+
+/*************************************************
+ * Name:        mlk_ct_sel_uint8
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   uint8_t a:       First alternative
+ *              uint8_t b:       Second alternative
+ *              uint8_t cond:    Condition variable; non-zero selects a.
+ *
+ **************************************************/
+
+/* Reference: Embedded into `cmov()` in the reference implementation @[REF].
+ *            - Use value barrier to get mask from condition value. */
+static MLK_INLINE uint8_t mlk_ct_sel_uint8(uint8_t a, uint8_t b, uint8_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  return b ^ (mlk_ct_cmask_nonzero_u8(cond) & (a ^ b));
+}
+
+/*************************************************
+ * Name:        mlk_ct_memcmp
+ *
+ * Description: Compare two arrays for equality in constant time.
+ *
+ * Arguments:   const uint8_t *a: pointer to first byte array
+ *              const uint8_t *b: pointer to second byte array
+ *              size_t len:       length of the byte arrays, upper-bounded
+ *                                to UINT16_MAX to control proof complexity
+ *                                only.
+ *
+ * Returns 0 if the byte arrays are equal, 0xFF otherwise.
+ *
+ * Specification:
+ * - Used to securely compute conditional move in
+ *   @[FIPS203, Algorithm 18 (ML-KEM.Decaps_Internal), L9-11]
+ *
+ **************************************************/
+
+/* Reference: `verify()` in the reference implementation @[REF]
+ *            - We return `uint8_t`, not `int`.
+ *            - We use an additional XOR-accumulator in the comparison loop
+ *              which prevents early abort if the OR-accumulator is 0xFF.
+ *            - We use a value barrier to convert the OR-accumulator into
+ *              a mask. The reference implementation uses a shift which the
+ *              compiler can argue to result in either 0 or 0xFF..FF. */
+static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
+                                        const size_t len)
+__contract__(
+  requires(len <= UINT16_MAX)
+  requires(memory_no_alias(a, len))
+  requires(memory_no_alias(b, len))
+  ensures((return_value == 0) || (return_value == 0xFF))
+  ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
+{
+  uint8_t r = 0, s = 0;
+  unsigned i;
+
+  for (i = 0; i < len; i++)
+  __loop__(
+    invariant(i <= len)
+    invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
+  {
+    r |= a[i] ^ b[i]; /* r stays 0 only while all bytes so far are equal */
+    /* s is useless, but prevents the loop from being aborted once r=0xff. */
+    s ^= a[i] ^ b[i];
+  }
+
+  /*
+   * - Convert r into a mask (0 or 0xFF); this may not be necessary, but
+   *   is an additional safeguard against leaking information about
+   *   a and b.
+   * - XOR twice with s, separated by a value barrier, to prevent the
+   *   compiler from dropping the s computation in the loop.
+   */
+  return (mlk_value_barrier_u8(mlk_ct_cmask_nonzero_u8(r) ^ s) ^ s);
+}
+
+/*************************************************
+ * Name:        mlk_ct_cmov_zero
+ *
+ * Description: Copy len bytes from x to r if b is zero;
+ *              don't modify r if b is non-zero.
+ *              assumes two's complement representation of negative integers.
+ *              Runs in constant time.
+ *
+ * Arguments:   uint8_t *r:       pointer to output byte array
+ *              const uint8_t *x: pointer to input byte array
+ *              size_t len:       Amount of bytes to be copied
+ *              uint8_t b:        Condition value.
+ *
+ * Specification:
+ * - Used to securely compute conditional move in
+ *   @[FIPS203, Algorithm 18 (ML-KEM.Decaps_Internal), L9-11]
+ *
+ **************************************************/
+
+/* Reference: `cmov()` in the reference implementation @[REF].
+ *            - We move if condition value is `0`, not `1`.
+ *            - We use `mlk_ct_sel_uint8` for constant-time selection. */
+static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
+                                        size_t len, uint8_t b)
+__contract__(
+  requires(len <= MLK_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(r, len))
+  requires(memory_no_alias(x, len))
+  assigns(memory_slice(r, len))
+  ensures(forall(i, 0, len, (r[i] == (b == 0 ? x[i] : old(r)[i])))))
+{
+  size_t i;
+  for (i = 0; i < len; i++)
+  __loop__(
+    invariant(i <= len)
+    invariant(forall(k, 0, i, r[k] == (b == 0 ? x[k] : loop_entry(r)[k]))))
+  {
+    r[i] = mlk_ct_sel_uint8(r[i], x[i], b); /* b != 0: keep r[i]; else x[i] */
+  }
+}
+
+/*************************************************
+ * Name:        mlk_zeroize
+ *
+ * Description: Force-zeroize a buffer.
+ *
+ * Arguments:   uint8_t *r:       pointer to byte array to be zeroed
+ *              size_t len:       Amount of bytes to be zeroed
+ *
+ * Specification: Used to implement
+ * @[FIPS203, Section 3.3, Destruction of intermediate values]
+ *
+ **************************************************/
+
+/* Reference: Not present in the reference implementation @[REF]. */
+#if !defined(MLK_CONFIG_CUSTOM_ZEROIZE)
+#if defined(MLK_SYS_WINDOWS)
+#include <windows.h>
+static MLK_INLINE void mlk_zeroize(void *ptr, size_t len)
+__contract__(
+  requires(memory_no_alias(ptr, len))
+  assigns(memory_slice(ptr, len))) { SecureZeroMemory(ptr, len); }
+#elif defined(MLK_HAVE_INLINE_ASM)
+#include <string.h>
+static MLK_INLINE void mlk_zeroize(void *ptr, size_t len)
+__contract__(
+  requires(memory_no_alias(ptr, len))
+  assigns(memory_slice(ptr, len)))
+{
+  mlk_memset(ptr, 0, len);
+  /* This follows OpenSSL and seems sufficient to prevent the compiler
+   * from optimizing away the memset.
+   *
+   * If there was a reliable way to detect availability of memset_s(),
+   * that would be preferred. */
+  __asm__ volatile("" : : "r"(ptr) : "memory");
+}
+#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
+#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
+#endif /* !MLK_SYS_WINDOWS && !MLK_HAVE_INLINE_ASM */
+#endif /* !MLK_CONFIG_CUSTOM_ZEROIZE */
+
+#endif /* !MLK_VERIFY_H */
diff --git a/mlkem_native/src/zetas.inc b/mlkem_native/src/zetas.inc
new file mode 100644
index 0000000..00316da
--- /dev/null
+++ b/mlkem_native/src/zetas.inc
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mlkem-native repository.
+ *          Do not modify it directly.
+ */
+
+
+/*
+ * Table of the 128 zeta values used in the reference NTT and inverse NTT.
+ * See autogen for details.
+ */
+static MLK_ALIGN const int16_t mlk_zetas[128] = {
+    -1044, -758,  -359,  -1517, 1493,  1422,  287,   202,  -171,  622,   1577,
+    182,   962,   -1202, -1474, 1468,  573,   -1325, 264,  383,   -829,  1458,
+    -1602, -130,  -681,  1017,  732,   608,   -1542, 411,  -205,  -1571, 1223,
+    652,   -552,  1015,  -1293, 1491,  -282,  -1544, 516,  -8,    -320,  -666,
+    -1618, -1162, 126,   1469,  -853,  -90,   -271,  830,  107,   -1421, -247,
+    -951,  -398,  961,   -1508, -725,  448,   -1065, 677,  -1275, -1103, 430,
+    555,   843,   -1251, 871,   1550,  105,   422,   587,  177,   -235,  -291,
+    -460,  1574,  1653,  -246,  778,   1159,  -147,  -777, 1483,  -602,  1119,
+    -1590, 644,   -872,  349,   418,   329,   -156,  -75,  817,   1097,  603,
+    610,   1322,  -1285, -1465, 384,   -1215, -136,  1218, -1335, -874,  220,
+    -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
+    -308,  996,   991,   958,   -1460, 1522,  1628,
+};
diff --git a/mlkem_native/test/Makefile b/mlkem_native/test/Makefile
new file mode 100644
index 0000000..e24da68
--- /dev/null
+++ b/mlkem_native/test/Makefile
@@ -0,0 +1,17 @@
+# Host-side build of the ML-KEM-768 / X-Wing test suite (test_mlkem768.c).
+# Run `make test` on the host (x86/ARM) to validate the integration before flashing.
+CC ?= gcc
+CFLAGS = -std=c99 -O2 -Wall -Wextra -Wno-unused-result -I../.. -I..
+
+all: test_mlkem768
+
+test_mlkem768: test_mlkem768.c ../../mlkem_native/mlkem_native.c
+	$(CC) $(CFLAGS) -DMLK_CONFIG_PARAMETER_SET=768 $< ../../mlkem_native/mlkem_native.c -o $@
+
+test: test_mlkem768
+	./test_mlkem768
+
+clean:
+	rm -f test_mlkem768
+
+.PHONY: all test clean
diff --git a/mlkem_native/test/test_mlkem768.c b/mlkem_native/test/test_mlkem768.c
new file mode 100644
index 0000000..a6b5a41
--- /dev/null
+++ b/mlkem_native/test/test_mlkem768.c
@@ -0,0 +1,437 @@
+/*
+ * ML-KEM-768 & X-Wing KEM Test Suite
+ * Tests compliance with FIPS 203 and draft-connolly-cfrg-xwing-kem-09
+ * Compatible with age v1.3.0 mlkem768x25519 recipient type
+ *
+ * Compile (from mlkem_native/test/):
+ *   make test
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "mlkem_native/mlkem_native.h"
+
+/* Access SHA3-256 and SHAKE256 from mlkem-native */
+extern void PQCP_MLKEM_NATIVE_MLKEM768_sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
+extern void PQCP_MLKEM_NATIVE_MLKEM768_shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen);
+#define xwing_sha3_256  PQCP_MLKEM_NATIVE_MLKEM768_sha3_256
+#define xwing_shake256  PQCP_MLKEM_NATIVE_MLKEM768_shake256
+
+/* X-Wing label: "\./", "/^\" = hex 5c 2e 2f 2f 5e 5c */
+static const uint8_t XWingLabel[6] = {0x5c, 0x2e, 0x2f, 0x2f, 0x5e, 0x5c};
+
+int onlykey_mlkem_randombytes(uint8_t *out, size_t outlen) {
+    /* Host stand-in for the device RNG: 0 on success, -1 on open/short read. */
+    FILE *urandom = fopen("/dev/urandom", "rb");
+    if (urandom == NULL) return -1;
+    const int rc = (fread(out, 1, outlen, urandom) == outlen) ? 0 : -1;
+    fclose(urandom); return rc;
+}
+
+/* X-Wing Combiner (Section 5.3):
+ * SHA3-256(ss_M || ss_X || ct_X || pk_X || XWingLabel) */
+static void xwing_combiner(uint8_t ss[32],
+    const uint8_t ss_M[32], const uint8_t ss_X[32],
+    const uint8_t ct_X[32], const uint8_t pk_X[32])
+{
+    /* Hash input = the four 32-byte fields in argument order, then the
+     * 6-byte label: 4 * 32 + 6 = 134 bytes in total. */
+    const uint8_t *const field[4] = {ss_M, ss_X, ct_X, pk_X};
+    uint8_t msg[4 * 32 + sizeof(XWingLabel)];
+    for (size_t k = 0; k < 4; k++) memcpy(msg + 32 * k, field[k], 32);
+    memcpy(msg + 4 * 32, XWingLabel, sizeof(XWingLabel));
+    xwing_sha3_256(ss, msg, sizeof(msg));
+}
+
+/* Scope of this suite: ML-KEM-768 and the X-Wing combiner are exercised
+ * directly. No X25519 scalar multiplication is linked into this standalone
+ * test (the firmware uses its Curve25519 library for that). */
+/* Base point for X25519; not referenced by any test in this file yet. */
+static const uint8_t X25519_BASE[32] = {9};
+
+/* Consequences for the X-Wing tests below:
+ *  - wherever an X25519 value (ss_X, ct_X, pk_X) is needed, random bytes
+ *    stand in for it;
+ *  - SHAKE256 seed expansion, deterministic ML-KEM keygen and the combiner
+ *    are covered;
+ *  - a full X-Wing round-trip with real X25519 is not covered here and has
+ *    to be validated on the firmware. */
+
+static int tests_run = 0;     /* bumped by every TEST() */
+static int tests_passed = 0;  /* bumped by every PASS() */
+/* TEST prints the numbered case name; PASS and FAIL print its outcome. */
+#define TEST(name) do { tests_run++; printf("  [%02d] %-55s ", tests_run, name); fflush(stdout); } while(0)
+#define PASS() do { tests_passed++; printf("PASS\n"); } while(0)
+#define FAIL(msg) do { printf("FAIL: %s\n", msg); } while(0)
+
+/* === ML-KEM-768 Tests === */
+
+static int test_sizes(void) {
+    TEST("ML-KEM: sizes match FIPS 203");  /* header constants vs. expected byte counts */
+    if (!(MLKEM768_PUBLICKEYBYTES == 1184 && MLKEM768_SECRETKEYBYTES == 2400 &&
+          MLKEM768_CIPHERTEXTBYTES == 1088 && MLKEM_BYTES == 32)) { FAIL("mismatch"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_roundtrip(void) {
+    TEST("ML-KEM: keygen + encaps + decaps round-trip");
+    uint8_t pk[1184], sk[2400], ct[1088], ss_enc[32], ss_dec[32]; /* sender / receiver secrets */
+    if (crypto_kem_keypair(pk, sk)) { FAIL("keygen"); return 1; }
+    if (crypto_kem_enc(ct, ss_enc, pk)) { FAIL("encaps"); return 1; }
+    if (crypto_kem_dec(ss_dec, ct, sk)) { FAIL("decaps"); return 1; }
+    if (memcmp(ss_enc, ss_dec, 32)) { FAIL("ss mismatch"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_different_secrets(void) {
+    TEST("ML-KEM: multiple encaps produce different secrets");
+    uint8_t pk[1184], sk[2400], ct1[1088], ct2[1088], ss1[32], ss2[32];
+    /* Unchecked failures would leave ss1/ss2 uninitialized and pass by accident. */
+    if (crypto_kem_keypair(pk, sk) != 0) { FAIL("keygen"); return 1; }
+    if (crypto_kem_enc(ct1, ss1, pk) != 0 || crypto_kem_enc(ct2, ss2, pk) != 0) { FAIL("encaps"); return 1; }
+    if (memcmp(ss1, ss2, 32) == 0) { FAIL("identical"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_wrong_sk(void) {
+    TEST("ML-KEM: wrong SK implicit rejection");  /* every call checked: no vacuous pass on garbage */
+    uint8_t pk1[1184], sk1[2400], pk2[1184], sk2[2400], ct[1088], ss1[32], ss2[32];
+    if (crypto_kem_keypair(pk1, sk1) != 0 || crypto_kem_keypair(pk2, sk2) != 0) { FAIL("keygen"); return 1; }
+    if (crypto_kem_enc(ct, ss1, pk1) != 0 || crypto_kem_dec(ss2, ct, sk2) != 0) { FAIL("encaps/decaps"); return 1; }
+    if (memcmp(ss1, ss2, 32) == 0) { FAIL("matched"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_corrupted_ct(void) {
+    TEST("ML-KEM: corrupted CT implicit rejection");  /* every call checked: no vacuous pass on garbage */
+    uint8_t pk[1184], sk[2400], ct[1088], ss1[32], ss2[32];
+    if (crypto_kem_keypair(pk, sk) != 0 || crypto_kem_enc(ct, ss1, pk) != 0) { FAIL("keygen/encaps"); return 1; }
+    ct[0] ^= 0x01; if (crypto_kem_dec(ss2, ct, sk) != 0) { FAIL("decaps"); return 1; }
+    if (memcmp(ss1, ss2, 32) == 0) { FAIL("matched"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_pk_in_sk(void) {
+    TEST("ML-KEM: PK at SK offset 1152");  /* sk must carry a copy of pk from byte 1152 on */
+    uint8_t pk[1184], sk[2400];
+    if (crypto_kem_keypair(pk, sk) != 0) { FAIL("keygen"); return 1; }  /* else memcmp reads garbage */
+    if (memcmp(pk, sk + 1152, 1184) != 0) { FAIL("wrong offset"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_check_pk_sk(void) {
+    TEST("ML-KEM: check_pk/check_sk validate keys");
+    uint8_t pk[1184], sk[2400];
+    if (crypto_kem_keypair(pk, sk) != 0) { FAIL("keygen"); return 1; }  /* not a "rejected key" */
+    if (crypto_kem_check_pk(pk) != 0) { FAIL("pk rejected"); return 1; }
+    if (crypto_kem_check_sk(sk) != 0) { FAIL("sk rejected"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_derand_keygen(void) {
+    TEST("ML-KEM: deterministic keygen produces same keys");
+    uint8_t coins[64], pk1[1184], sk1[2400], pk2[1184], sk2[2400];
+    /* Failures are reported as such instead of comparing uninitialized buffers. */
+    if (onlykey_mlkem_randombytes(coins, 64) != 0) { FAIL("rng"); return 1; }
+    if (crypto_kem_keypair_derand(pk1, sk1, coins) != 0) { FAIL("keygen 1"); return 1; }
+    if (crypto_kem_keypair_derand(pk2, sk2, coins) != 0) { FAIL("keygen 2"); return 1; }
+    if (memcmp(pk1, pk2, 1184) != 0) { FAIL("pk differs"); return 1; }
+    if (memcmp(sk1, sk2, 2400) != 0) { FAIL("sk differs"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_onlykey_flow(void) {
+    TEST("ML-KEM: simulated OnlyKey ctap_buffer flow");
+    /* One scratch buffer plays ctap_buffer: sk at 0, pk at 2400, ct at 5465. */
+    uint8_t scratch[7609] = {0}, host_pk[1184], stored_sk[2400];
+    uint8_t ct[1088], ss_host[32], ss_dev[32];
+    if (crypto_kem_keypair(scratch + 2400, scratch) != 0) { FAIL("keygen"); return 1; }
+    memcpy(host_pk, scratch + 2400, sizeof(host_pk));  /* copy of pk kept by the host side */
+    memcpy(stored_sk, scratch, sizeof(stored_sk));     /* copy of sk standing in for flash */
+    memset(scratch, 0, sizeof(scratch));               /* scratch is wiped between operations */
+    /* Host encapsulates to its copy of the public key. */
+    if (crypto_kem_enc(ct, ss_host, host_pk) != 0) { FAIL("encaps"); return 1; }
+    memcpy(scratch, stored_sk, sizeof(stored_sk));
+    memcpy(scratch + 5465, ct, sizeof(ct));
+    if (crypto_kem_dec(ss_dev, scratch + 5465, scratch) != 0) { FAIL("decaps"); return 1; }
+    if (memcmp(ss_host, ss_dev, 32) != 0) { FAIL("ss mismatch"); return 1; }
+    PASS(); return 0;
+}
+
+/* === X-Wing Spec Tests === */
+
+static int test_xwing_sizes(void) {
+    TEST("X-Wing: PK=1216, CT=1120, SS=32, SK_seed=32");  /* derived from header constants, not literals */
+    if ((MLKEM768_PUBLICKEYBYTES + 32) != 1216) { FAIL("pk"); return 1; }
+    if ((MLKEM768_CIPHERTEXTBYTES + 32) != 1120) { FAIL("ct"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_label(void) {
+    TEST("X-Wing: label is \\./  /^\\ = hex 5c2e2f2f5e5c");
+    static const uint8_t want[6] = {0x5c, 0x2e, 0x2f, 0x2f, 0x5e, 0x5c};  /* expected label bytes */
+    if (memcmp(want, XWingLabel, sizeof(want)) != 0) { FAIL("wrong label"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_shake256_expansion(void) {
+    TEST("X-Wing: SHAKE256(seed,96) is deterministic");
+    uint8_t seed[32], first[96], again[96];
+    onlykey_mlkem_randombytes(seed, 32);
+    /* Same seed twice: the 96-byte expansions must be byte-identical. */
+    xwing_shake256(first, 96, seed, 32);
+    xwing_shake256(again, 96, seed, 32);
+    if (memcmp(first, again, 96) != 0) { FAIL("not deterministic"); return 1; }
+    /* One flipped seed bit: the expansion must change. */
+    seed[0] ^= 0x01;
+    xwing_shake256(again, 96, seed, 32);
+    if (memcmp(first, again, 96) == 0) { FAIL("different seed same output"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_derand_keygen(void) {
+    TEST("X-Wing: keygen from seed via SHAKE256 is deterministic");
+    uint8_t seed[32], expanded[96];
+    uint8_t pk1[1184], sk1[2400], pk2[1184], sk2[2400];
+    if (onlykey_mlkem_randombytes(seed, 32) != 0) { FAIL("rng"); return 1; }
+    xwing_shake256(expanded, 96, seed, 32);
+    /* ML-KEM keygen from expanded[0:64]. A failed keygen is reported as
+     * such instead of comparing two uninitialized key buffers. */
+    if (crypto_kem_keypair_derand(pk1, sk1, expanded) != 0) { FAIL("keygen 1"); return 1; }
+    if (crypto_kem_keypair_derand(pk2, sk2, expanded) != 0) { FAIL("keygen 2"); return 1; }
+    if (memcmp(pk1, pk2, 1184) != 0) { FAIL("pk differs"); return 1; }
+    if (memcmp(sk1, sk2, 2400) != 0) { FAIL("sk differs"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_combiner_deterministic(void) {
+    TEST("X-Wing: combiner is deterministic");
+    /* in[0..3] play ss_M, ss_X, ct_X and pk_X; each holds 32 random bytes. */
+    uint8_t in[4][32], first[32], second[32];
+    for (int k = 0; k < 4; k++) {
+        onlykey_mlkem_randombytes(in[k], 32);
+    }
+    /* Hashing the same inputs twice must give the same 32-byte output. */
+    xwing_combiner(first, in[0], in[1], in[2], in[3]);
+    xwing_combiner(second, in[0], in[1], in[2], in[3]);
+    if (memcmp(first, second, 32) != 0) { FAIL("not deterministic"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_combiner_uses_all_inputs(void) {
+    /*
+     * Sensitivity check for the combiner: hash a baseline set of inputs, then
+     * flip the lowest bit of the first byte of one input at a time and require
+     * that the 32-byte output differs from the baseline each time.
+     *
+     * in[0..3] stand for ss_M, ss_X, ct_X and pk_X, in the order in which
+     * xwing_combiner() takes them; ignored[k] is the failure message reported
+     * when flipping in[k] leaves the output unchanged.
+     */
+    static const char *const ignored[4] = {
+        "ss_M ignored",
+        "ss_X ignored",
+        "ct_X ignored",
+        "pk_X ignored",
+    };
+    uint8_t in[4][32], base[32], test[32];
+
+    TEST("X-Wing: combiner output changes with each input");
+    for (int k = 0; k < 4; k++) {
+        onlykey_mlkem_randombytes(in[k], 32);
+    }
+    xwing_combiner(base, in[0], in[1], in[2], in[3]);
+
+    for (int k = 0; k < 4; k++) {
+        in[k][0] ^= 1;  /* perturb input k */
+        xwing_combiner(test, in[0], in[1], in[2], in[3]);
+        if (memcmp(base, test, 32) == 0) { FAIL(ignored[k]); return 1; }
+        in[k][0] ^= 1;  /* restore before perturbing the next input */
+    }
+
+    PASS();
+    return 0;
+}
+
+static int test_xwing_combiner_layout(void) {
+    TEST("X-Wing: combiner = SHA3-256(ssM||ssX||ctX||pkX||label)");
+    uint8_t ss_M[32], ss_X[32], ct_X[32], pk_X[32];
+    uint8_t via_combiner[32], via_spec[32];
+    uint8_t msg[134], *cursor = msg;
+
+    onlykey_mlkem_randombytes(ss_M, 32);
+    onlykey_mlkem_randombytes(ss_X, 32);
+    onlykey_mlkem_randombytes(ct_X, 32);
+    onlykey_mlkem_randombytes(pk_X, 32);
+
+    /* Path 1: the helper under test. */
+    xwing_combiner(via_combiner, ss_M, ss_X, ct_X, pk_X);
+
+    /* Path 2: build the 134-byte hash input field by field, in spec order. */
+    memcpy(cursor, ss_M, 32);       cursor += 32;
+    memcpy(cursor, ss_X, 32);       cursor += 32;
+    memcpy(cursor, ct_X, 32);       cursor += 32;
+    memcpy(cursor, pk_X, 32);       cursor += 32;
+    memcpy(cursor, XWingLabel, 6);
+    xwing_sha3_256(via_spec, msg, sizeof(msg));
+
+    if (memcmp(via_combiner, via_spec, 32) != 0) { FAIL("layout mismatch"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_mlkem_component_roundtrip(void) {
+    TEST("X-Wing: ML-KEM component works in hybrid context");
+    /* X-Wing keygen, ML-KEM half: seed -> SHAKE256 -> 96 bytes, the first 64
+     * of which are the deterministic ML-KEM keygen coins. */
+    uint8_t seed[32], expanded[96];
+    uint8_t pk_M[1184], sk_M[2400];
+    if (onlykey_mlkem_randombytes(seed, 32) != 0) { FAIL("rng"); return 1; }
+    xwing_shake256(expanded, 96, seed, 32);
+    /* A failed keygen is reported here rather than surfacing as a bogus
+     * "encaps" failure on an uninitialized public key. */
+    if (crypto_kem_keypair_derand(pk_M, sk_M, expanded) != 0) { FAIL("keygen"); return 1; }
+    /* Encaps/decaps round-trip */
+    uint8_t ct_M[1088], ss_enc[32], ss_dec[32];
+    if (crypto_kem_enc(ct_M, ss_enc, pk_M) != 0) { FAIL("encaps"); return 1; }
+    if (crypto_kem_dec(ss_dec, ct_M, sk_M) != 0) { FAIL("decaps"); return 1; }
+    if (memcmp(ss_enc, ss_dec, 32) != 0) { FAIL("ss mismatch"); return 1; }
+
+    /* Feed both secrets through the combiner with the same stand-in X25519
+     * values (random bytes); equal ML-KEM secrets must give equal outputs. */
+    uint8_t fake_ssX[32], fake_ctX[32], fake_pkX[32];
+    onlykey_mlkem_randombytes(fake_ssX, 32);
+    onlykey_mlkem_randombytes(fake_ctX, 32);
+    onlykey_mlkem_randombytes(fake_pkX, 32);
+    uint8_t combined1[32], combined2[32];
+    xwing_combiner(combined1, ss_enc, fake_ssX, fake_ctX, fake_pkX);
+    xwing_combiner(combined2, ss_dec, fake_ssX, fake_ctX, fake_pkX);
+    if (memcmp(combined1, combined2, 32) != 0) { FAIL("combined diverged"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_wrong_mlkem_breaks_ss(void) {
+    TEST("X-Wing: wrong ML-KEM component breaks combined SS");
+    uint8_t pk1[1184], sk1[2400], pk2[1184], sk2[2400], ct[1088];
+    uint8_t ss_good[32], ss_bad[32], x_ss[32], ct_x[32], pk_x[32], c1[32], c2[32];
+    /* KEM calls are checked: on failure ss_good/ss_bad would be garbage and pass by accident. */
+    if (crypto_kem_keypair(pk1, sk1) != 0 || crypto_kem_keypair(pk2, sk2) != 0) { FAIL("keygen"); return 1; }
+    if (crypto_kem_enc(ct, ss_good, pk1) != 0 || crypto_kem_dec(ss_bad, ct, sk2) != 0) { FAIL("encaps/decaps"); return 1; }
+    onlykey_mlkem_randombytes(x_ss, 32);
+    onlykey_mlkem_randombytes(ct_x, 32);
+    onlykey_mlkem_randombytes(pk_x, 32);
+    xwing_combiner(c1, ss_good, x_ss, ct_x, pk_x);
+    xwing_combiner(c2, ss_bad, x_ss, ct_x, pk_x);
+    if (memcmp(c1, c2, 32) == 0) { FAIL("bad mlkem matched"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_wrong_x25519_breaks_ss(void) {
+    TEST("X-Wing: wrong X25519 component breaks combined SS");
+    uint8_t mlkem_ss[32], ct_x[32], pk_x[32], x_good[32], x_bad[32], c1[32], c2[32];
+    /* The RNG is checked: if it failed, x_good and x_bad would be uninitialized
+     * and could differ by accident, passing this negative test vacuously. */
+    if (onlykey_mlkem_randombytes(mlkem_ss, 32) != 0 ||
+        onlykey_mlkem_randombytes(ct_x, 32) != 0 ||
+        onlykey_mlkem_randombytes(pk_x, 32) != 0 ||
+        onlykey_mlkem_randombytes(x_good, 32) != 0 ||
+        onlykey_mlkem_randombytes(x_bad, 32) != 0) { FAIL("rng"); return 1; }
+    xwing_combiner(c1, mlkem_ss, x_good, ct_x, pk_x);
+    xwing_combiner(c2, mlkem_ss, x_bad, ct_x, pk_x);
+    if (memcmp(c1, c2, 32) == 0) { FAIL("bad x25519 matched"); return 1; }
+    PASS(); return 0;
+}
+
+static int test_xwing_expanded_sk_layout(void) {
+    TEST("X-Wing: expanded SK layout sk_M(2400)||sk_X(32)||pk_X(32)");
+    uint8_t seed[32], expanded[96];
+    uint8_t pk_M[1184], sk_M[2400];
+    if (onlykey_mlkem_randombytes(seed, 32) != 0) { FAIL("rng"); return 1; }
+    xwing_shake256(expanded, 96, seed, 32);
+
+    /* Report a keygen failure directly instead of comparing uninitialized keys. */
+    if (crypto_kem_keypair_derand(pk_M, sk_M, expanded) != 0) { FAIL("keygen"); return 1; }
+
+    /* Build expanded SK as firmware would */
+    uint8_t xwing_sk[2464];
+    memcpy(xwing_sk, sk_M, 2400);               /* sk_M */
+    memcpy(xwing_sk + 2400, expanded + 64, 32); /* sk_X */
+    /* pk_X would be X25519(sk_X, BASE); it is not computed here, so the last
+     * 32 bytes stay unset and only the overall size is checked. */
+    if (sizeof(xwing_sk) != 2464) { FAIL("size"); return 1; }
+
+    /* Verify pk_M is extractable from sk_M at offset 1152 */
+    if (memcmp(pk_M, sk_M + 1152, 1184) != 0) { FAIL("pk_M offset"); return 1; }
+
+    PASS(); return 0;
+}
+
+/* === Performance & Stress === */
+
+static int test_performance(void) {
+    TEST("Performance: ML-KEM-768 (10 iterations)");
+    uint8_t pk[1184], sk[2400], ct[1088], ss1[32], ss2[32];
+    int N = 10, rc = 0; clock_t start, end;  /* rc ORs all return codes together */
+    start = clock(); for (int i=0;i<N;i++) rc |= crypto_kem_keypair(pk,sk); end = clock();
+    double kg = ((double)(end-start)/CLOCKS_PER_SEC*1000.0)/N;
+    start = clock(); for (int i=0;i<N;i++) rc |= crypto_kem_enc(ct,ss1,pk); end = clock();
+    double ec = ((double)(end-start)/CLOCKS_PER_SEC*1000.0)/N;
+    start = clock(); for (int i=0;i<N;i++) rc |= crypto_kem_dec(ss2,ct,sk); end = clock();
+    double dc = ((double)(end-start)/CLOCKS_PER_SEC*1000.0)/N;
+    if (rc != 0) { FAIL("crypto"); return 1; }  /* timings of failed calls are meaningless */
+    PASS(); printf("         Keygen: %.2f ms  Encaps: %.2f ms  Decaps: %.2f ms\n", kg, ec, dc);
+    return 0;
+}
+
+static int test_stress(void) {
+    TEST("Stress: 100 ML-KEM + X-Wing combiner round-trips");
+    for (int iter = 0; iter < 100; iter++) {
+        uint8_t pk[1184], sk[2400], ct[1088], ss_enc[32], ss_dec[32];  /* fresh keypair per iteration */
+        uint8_t x[32], ctX[32], pkX[32], c_enc[32], c_dec[32];
+        if (crypto_kem_keypair(pk, sk) != 0) { FAIL("crypto"); return 1; }
+        if (crypto_kem_enc(ct, ss_enc, pk) != 0) { FAIL("crypto"); return 1; }
+        if (crypto_kem_dec(ss_dec, ct, sk) != 0) { FAIL("crypto"); return 1; }
+        if (memcmp(ss_enc, ss_dec, 32) != 0) { printf("FAIL iter %d\n", iter); return 1; }
+        onlykey_mlkem_randombytes(x, 32);
+        onlykey_mlkem_randombytes(ctX, 32);
+        onlykey_mlkem_randombytes(pkX, 32);
+        xwing_combiner(c_enc, ss_enc, x, ctX, pkX);  /* equal ML-KEM secrets must combine equally */
+        xwing_combiner(c_dec, ss_dec, x, ctX, pkX);
+        if (memcmp(c_enc, c_dec, 32) != 0) { printf("FAIL combiner iter %d\n", iter); return 1; }
+    }
+    PASS(); return 0;
+}
+
+int main(void) {
+    /* Cases run in table order; the exit status comes from the TEST/PASS counters. */
+    static int (*const mlkem[])(void) = {
+        test_sizes, test_roundtrip, test_different_secrets, test_wrong_sk, test_corrupted_ct,
+        test_pk_in_sk, test_check_pk_sk, test_derand_keygen, test_onlykey_flow};
+    static int (*const xwing[])(void) = {
+        test_xwing_sizes, test_xwing_label, test_xwing_shake256_expansion, test_xwing_derand_keygen,
+        test_xwing_combiner_deterministic, test_xwing_combiner_uses_all_inputs,
+        test_xwing_combiner_layout, test_xwing_mlkem_component_roundtrip,
+        test_xwing_wrong_mlkem_breaks_ss, test_xwing_wrong_x25519_breaks_ss, test_xwing_expanded_sk_layout};
+    static int (*const perf[])(void) = {test_performance, test_stress};
+
+    printf("============================================================\n");
+    printf("  ML-KEM-768 & X-Wing KEM Test Suite\n");
+    printf("  FIPS 203 | draft-connolly-cfrg-xwing-kem-09 | OnlyKey\n");
+    printf("  Compatible with: age v1.3.0 mlkem768x25519\n");
+    printf("============================================================\n\n");
+    printf("Sizes: ML-KEM PK=%d SK=%d CT=%d SS=%d\n", 1184, 2400, 1088, 32);
+    printf("       X-Wing PK=%d CT=%d SS=%d SK_expanded=%d\n\n", 1216, 1120, 32, 2464);
+    printf("--- ML-KEM-768 (FIPS 203) ---\n");
+    for (size_t i = 0; i < sizeof(mlkem) / sizeof(mlkem[0]); i++) mlkem[i]();
+    printf("\n--- X-Wing KEM (draft-09) ---\n");
+    for (size_t i = 0; i < sizeof(xwing) / sizeof(xwing[0]); i++) xwing[i]();
+    printf("\n--- Performance & Stress ---\n");
+    for (size_t i = 0; i < sizeof(perf) / sizeof(perf[0]); i++) perf[i]();
+    printf("\n============================================================\n");
+    printf("  Results: %d/%d tests passed\n", tests_passed, tests_run);
+    printf("============================================================\n");
+    return (tests_passed == tests_run) ? 0 : 1;
+}
diff --git a/onlykey/okcore.h b/onlykey/okcore.h
index 11ed80f..4d9c274 100644
--- a/onlykey/okcore.h
+++ b/onlykey/okcore.h
@@ -121,8 +121,8 @@ extern "C"
 //Global Buffer Sizes
 /*************************************/
 #define LARGE_RESP_BUFFER_SIZE         1024
-#define LARGE_BUFFER_SIZE         1024
-#define PACKET_BUFFER_SIZE         768
+#define LARGE_BUFFER_SIZE         1120
+#define PACKET_BUFFER_SIZE         1120
 #define ATTESTATION_DER_BUFFER_SIZE 768
 #define KEYBOARD_BUFFER_SIZE         80
 /*************************************/
@@ -209,6 +209,7 @@ extern "C"
 #define MAX_RSA_KEY_SIZE 512
 #define MAX_ECC_KEY_SIZE 32
 #define RESERVED_KEY_DERIVATION 132
+// KEYTYPE_MLKEM768 and KEYTYPE_XWING keys are kept as 32-byte seeds in ECC slots (101-132, of which 128-132 are reserved)
 #define RESERVED_KEY_DEFAULT_BACKUP 131
 #define RESERVED_KEY_HMACSHA1_1 130
 #define RESERVED_KEY_HMACSHA1_2 129
@@ -222,6 +223,23 @@ extern "C"
 #define KEYTYPE_ECDH_P256R   102
 #define KEYTYPE_ECDH_P256K   103
 #define KEYTYPE_ECDH_CURVE25519  104
+#define KEYTYPE_MLKEM768         5
+#define KEYTYPE_XWING            6
+/*************************************/
+//ML-KEM-768 sizes (FIPS 203)
+/*************************************/
+#define MLKEM_SK_SIZE            2400
+#define MLKEM_PK_SIZE            1184
+#define MLKEM_CT_SIZE            1088
+#define MLKEM_SS_SIZE            32
+/*************************************/
+//X-Wing sizes (draft-connolly-cfrg-xwing-kem-09)
+//X25519 + ML-KEM-768 hybrid KEM
+/*************************************/
+#define XWING_PK_SIZE            1216   /* pk_M(1184) || pk_X(32) */
+#define XWING_CT_SIZE            1120   /* ct_M(1088) || ct_X(32) */
+#define XWING_SS_SIZE            32     /* SHA3-256 output */
+#define XWING_SEED_SIZE          32     /* seed stored in ECC slot */
 
 /*************************************/
 /*************************************/
diff --git a/onlykey/okcrypto.cpp b/onlykey/okcrypto.cpp
index bb7577d..241b4ca 100644
--- a/onlykey/okcrypto.cpp
+++ b/onlykey/okcrypto.cpp
@@ -92,6 +92,46 @@
 #include "device.h"
 #include "okcrypto.h"
 
+/*************************************/
+//ML-KEM-768 (FIPS 203) support
+/*************************************/
+extern "C" {
+#include "mlkem_native/mlkem_native.h"
+}
+
+// Access SHA3-256 and SHAKE256 from mlkem-native (already compiled in)
+extern "C" {
+    void PQCP_MLKEM_NATIVE_MLKEM768_sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
+    void PQCP_MLKEM_NATIVE_MLKEM768_shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen);
+}
+#define xwing_sha3_256  PQCP_MLKEM_NATIVE_MLKEM768_sha3_256
+#define xwing_shake256  PQCP_MLKEM_NATIVE_MLKEM768_shake256
+
+// X-Wing label: "\./", "/^\" = hex 5c 2e 2f 2f 5e 5c
+static const uint8_t XWingLabel[6] = {0x5c, 0x2e, 0x2f, 0x2f, 0x5e, 0x5c};
+
+// X-Wing Combiner (draft-connolly-cfrg-xwing-kem-09 Section 5.3)
+// SHA3-256(ss_M || ss_X || ct_X || pk_X || XWingLabel)
+static void xwing_combiner(uint8_t ss[32],
+    const uint8_t ss_M[32], const uint8_t ss_X[32],
+    const uint8_t ct_X[32], const uint8_t pk_X[32])
+{
+    uint8_t buf[134]; // ss_M(32) || ss_X(32) || ct_X(32) || pk_X(32) || label(6)
+    memcpy(buf,       ss_M, 32); memcpy(buf + 32,  ss_X, 32);
+    memcpy(buf + 64,  ct_X, 32); memcpy(buf + 96,  pk_X, 32);
+    memcpy(buf + 128, XWingLabel, 6);
+    xwing_sha3_256(ss, buf, 134);
+    // buf holds both shared secrets; a plain memset of a dying local is a dead
+    // store the compiler may drop, so wipe it through a volatile pointer.
+    for (volatile uint8_t *p = buf; p != buf + sizeof(buf); ++p) *p = 0;
+}
+
+// Adapter: exposes the OnlyKey RNG under the C signature mlkem-native calls.
+extern "C" int onlykey_mlkem_randombytes(uint8_t *out, size_t outlen) {
+    RNG.rand(out, static_cast<unsigned>(outlen));  // length narrowed to unsigned for RNG.rand
+    return 0;  // unconditionally reports success
+}
+
 #if !defined(MBEDTLS_CONFIG_FILE)
 #include "config.h"
 #else
@@ -198,7 +238,11 @@ void okcrypto_getpubkey (uint8_t *buffer) {
 	if (buffer[5] < 5 && !buffer[6]) { //Slot 101-132 are for ECC, 1-4 are for RSA
 		if (okcore_flashget_RSA ((int)buffer[5])) okcrypto_getrsapubkey(buffer);
 	} else if (buffer[5] < 117) { //128-132 are reserved
-		if (okcore_flashget_ECC ((int)buffer[5])) okcrypto_geteccpubkey(buffer);
+		if (okcore_flashget_ECC ((int)buffer[5])) {
+			if (type == KEYTYPE_MLKEM768) okcrypto_mlkem_getpubkey(buffer);
+			else if (type == KEYTYPE_XWING) okcrypto_xwing_getpubkey(buffer);
+			else okcrypto_geteccpubkey(buffer);
+		}
 	} else if (buffer[5] == RESERVED_KEY_DERIVATION && buffer[6] <= KEYTYPE_CURVE25519) { // Generate key using provided data, return public
 	okcrypto_derive_key(buffer[6], buffer+7, NULL);
 	send_transport_response(ecc_public_key, 64, false, false);
@@ -240,7 +284,11 @@ void okcrypto_decrypt (uint8_t *buffer){
 			fadeoff(0);
 			return;
 		}
-		if (is_bit_set(features, 5)) {
+		if (type == KEYTYPE_MLKEM768) {
+			okcrypto_mlkem_decaps(buffer);
+		} else if (type == KEYTYPE_XWING) {
+			okcrypto_xwing_decaps(buffer);
+		} else if (is_bit_set(features, 5)) {
 			okcrypto_ecdh(buffer);
 		} else {
 			#ifdef DEBUG
@@ -255,14 +303,18 @@ void okcrypto_decrypt (uint8_t *buffer){
 
 void okcrypto_generate_random_key (uint8_t *buffer) {
 	uECC_set_rng(&RNG2);
-	//uint8_t backupslot;
-	//uint8_t temp[64];
 	#ifdef DEBUG
 	Serial.println();
 	Serial.println("GENERATE KEY MESSAGE RECEIVED");
 	#endif
 	if (buffer[5] > 100) { //Slot 101-132 are for ECC, 1-4 are for RSA
-		if ((buffer[6] & 0x0F) == 1) {
+		if ((buffer[6] & 0x0F) == KEYTYPE_MLKEM768) {
+			okcrypto_mlkem_keygen(buffer);
+			return;
+		} else if ((buffer[6] & 0x0F) == KEYTYPE_XWING) {
+			okcrypto_xwing_keygen(buffer);
+			return;
+		} else if ((buffer[6] & 0x0F) == 1) {
 			crypto_box_keypair(ecc_public_key, buffer+7); //Curve25519
 		} else if ((buffer[6] & 0x0F) == 2) {
 			const struct uECC_Curve_t * curve = uECC_secp256r1(); //P-256
@@ -813,6 +865,18 @@ int okcrypto_shared_secret (uint8_t *pub, uint8_t *secret) {
 		return 0;
 		}
 
+	case KEYTYPE_MLKEM768:
+		// ML-KEM uses KEM (encaps/decaps), not DH shared secret
+		// Use okcrypto_mlkem_decaps() instead
+		hidprint("Error use ML-KEM decaps for this key type");
+		return 1;
+
+	case KEYTYPE_XWING:
+		// Hybrid uses combined KEM, not DH shared secret
+		// Use okcrypto_xwing_decaps() instead
+		hidprint("Error use X-Wing decaps for this key type");
+		return 1;
+
 	default:
 		hidprint("Error ECC type incorrect");
 		return 1;
@@ -1669,9 +1733,332 @@ void okcrypto_split_sundae(uint8_t *state, uint8_t *iv, int len, uint8_t functio
 	}
 }
 
+/*************************************/
+//ML-KEM-768 operations
+//Seed stored as 32-byte ECC key with KEYTYPE_MLKEM768
+/*************************************/
+
+void okcrypto_mlkem_keygen (uint8_t *buffer) {
+	extern uint8_t ctap_buffer[CTAPHID_BUFFER_SIZE];
+	#ifdef DEBUG
+	Serial.println();
+	Serial.println("MLKEM KEYGEN MESSAGE RECEIVED");
+	#endif
+	if (!CRYPTO_AUTH) {
+		pending_operation=CTAP2_ERR_USER_ACTION_PENDING;
+		return;
+	}
+
+	// Generate the 32-byte seed in place.
+	// buffer[5] = slot (set by caller), buffer[6] = type, buffer[7..38] = key data
+	RNG2(buffer + 7, 32);
+	buffer[6] = (KEYTYPE_MLKEM768 & 0x0F) | 0x20; // type with decrypt feature (bit 5)
+	// Expand seed to 64-byte coins: SHAKE256(seed, 64). This happens BEFORE the
+	// seed is handed to ecc_priv_flash, so the coins never depend on what that
+	// call does to buffer. NOTE(review): confirm whether it rewrites buffer+7.
+	uint8_t coins[64];
+	xwing_shake256(coins, 64, buffer + 7, 32);
+	ecc_priv_flash(buffer, false); // store seed via existing ECC slot infrastructure
+
+	// Deterministic keygen; sk and pk are laid out back to back in ctap_buffer
+	uint8_t *sk = ctap_buffer;
+	uint8_t *pk = ctap_buffer + MLKEM_SK_SIZE;
+	int rc = crypto_kem_keypair_derand(pk, sk, coins);
+	// wipe coins through a volatile pointer: a memset of a dying local may be elided
+	for (volatile uint8_t *p = coins; p != coins + sizeof(coins); ++p) *p = 0;
+	if (rc != 0) {
+		hidprint("Error ML-KEM keygen failed");
+		fadeoff(0);
+		memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+		return;
+	}
+	#ifdef DEBUG
+	Serial.println("ML-KEM keypair generated");
+	Serial.print("PK first 16 bytes: ");
+	byteprint(pk, 16);
+	#endif
+	pending_operation=CTAP2_ERR_DATA_READY;
+	send_transport_response(pk, MLKEM_PK_SIZE, true, true);
+	memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+	fadeoff(85);
+}
+
+void okcrypto_mlkem_decaps (uint8_t *buffer) {
+	extern uint8_t ctap_buffer[CTAPHID_BUFFER_SIZE]; // scratch space for the re-derived keypair
+	uint8_t ss[MLKEM_SS_SIZE]; // decapsulated shared secret
+	#ifdef DEBUG
+	Serial.println();
+	Serial.println("MLKEM DECAPS MESSAGE RECEIVED");
+	#endif
+	if (!CRYPTO_AUTH) { // not authorized yet: forward the packet and flag a pending user action
+		process_packets(buffer, 0, 0);
+		pending_operation=OKDECRYPT_ERR_USER_ACTION_PENDING;
+	}
+	else if (CRYPTO_AUTH == 4) { // NOTE(review): 4 presumably means the challenge was completed - confirm
+		okcore_aes_gcm_decrypt(large_buffer, packet_buffer_details[0], packet_buffer_details[1], profilekey, large_buffer_offset);
+		if (large_buffer_offset != MLKEM_CT_SIZE) { // input must be exactly one ML-KEM ciphertext
+			hidprint("Error ML-KEM CT wrong size");
+			fadeoff(0);
+			memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+			return;
+		}
+
+		// Seed already loaded into ecc_private_key by okcore_flashget_ECC in dispatch
+		uint8_t coins[64]; // SHAKE256(seed, 64): input to the deterministic keygen
+		xwing_shake256(coins, 64, ecc_private_key, 32);
+
+		uint8_t *sk = ctap_buffer; // secret key occupies the first MLKEM_SK_SIZE bytes
+		uint8_t *pk = ctap_buffer + MLKEM_SK_SIZE; // public key follows directly behind it
+		if (crypto_kem_keypair_derand(pk, sk, coins) != 0) {
+			hidprint("Error ML-KEM key expansion failed");
+			memset(coins, 0, 64);
+			memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+			memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+			fadeoff(0);
+			return;
+		}
+		memset(coins, 0, 64); // NOTE(review): coins is a local; confirm the compiler keeps this wipe
+
+		if (crypto_kem_dec(ss, large_buffer, sk) != 0) { // ciphertext is read from large_buffer
+			hidprint("Error ML-KEM decaps failed");
+			memset(ss, 0, sizeof(ss));
+			memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+			memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+			fadeoff(0);
+			return;
+		}
+		#ifdef DEBUG
+		Serial.print("Shared secret: ");
+		byteprint(ss, MLKEM_SS_SIZE);
+		#endif
+
+		pending_operation=CTAP2_ERR_DATA_READY;
+		outputmode=packet_buffer_details[2];
+		send_transport_response(ss, MLKEM_SS_SIZE, true, true); // return the 32-byte shared secret
+		if (outputmode != WEBAUTHN) {
+			wipetasks();
+		}
+
+		// Scrub the shared secret, the expanded keypair and the ciphertext buffer
+		memset(ss, 0, sizeof(ss));
+		memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+		memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+		fadeoff(85);
+	} else { // any other CRYPTO_AUTH value: nothing is done here (only a debug message is printed)
+		#ifdef DEBUG
+		Serial.println("Waiting for challenge buttons to be pressed");
+		#endif
+	}
+}
+
+void okcrypto_mlkem_getpubkey (uint8_t *buffer) {
+	extern uint8_t ctap_buffer[CTAPHID_BUFFER_SIZE];
+	#ifdef DEBUG
+	Serial.println();
+	Serial.println("MLKEM GETPUBKEY MESSAGE RECEIVED");
+	#endif
+	// Re-derives the keypair from the stored seed and sends only the public key.
+	// Seed already loaded into ecc_private_key by okcore_flashget_ECC in dispatch
+	uint8_t coins[64];
+	xwing_shake256(coins, 64, ecc_private_key, 32);
+	uint8_t *sk = ctap_buffer;
+	uint8_t *pk = ctap_buffer + MLKEM_SK_SIZE;
+	int rc = crypto_kem_keypair_derand(pk, sk, coins);
+	memset(coins, 0, 64);
+	// A failed expansion leaves no valid key in ctap_buffer: report, do not send
+	if (rc != 0) hidprint("Error ML-KEM key expansion failed");
+	else send_transport_response(pk, MLKEM_PK_SIZE, true, true);
+	memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+}
+
+/*************************************/
+//X-Wing KEM operations (draft-connolly-cfrg-xwing-kem-09)
+//Compatible with age v1.3.0 mlkem768x25519 recipient type
+//Seed stored as 32-byte ECC key with KEYTYPE_XWING
+//PK: pk_M(1184) || pk_X(32) = 1216 bytes
+//CT: ct_M(1088) || ct_X(32) = 1120 bytes
+//SS: SHA3-256(ss_M || ss_X || ct_X || pk_X || XWingLabel) = 32 bytes
+/*************************************/
+
+void okcrypto_xwing_keygen (uint8_t *buffer) {
+	extern uint8_t ctap_buffer[CTAPHID_BUFFER_SIZE];
+	#ifdef DEBUG
+	Serial.println();
+	Serial.println("XWING KEYGEN MESSAGE RECEIVED");
+	#endif
+	if (!CRYPTO_AUTH) {
+		pending_operation=CTAP2_ERR_USER_ACTION_PENDING;
+		return;
+	}
+
+	// Generate the 32-byte seed in place at buffer[7..38]
+	RNG2(buffer + 7, XWING_SEED_SIZE);
+	buffer[6] = (KEYTYPE_XWING & 0x0F) | 0x20; // type with decrypt feature (bit 5)
+
+	// Expand seed: SHAKE256(seed, 96). This happens BEFORE the seed is handed
+	// to ecc_priv_flash, so the expansion never depends on what that call does
+	// to buffer. NOTE(review): confirm whether it rewrites buffer+7.
+	uint8_t expanded[96];
+	xwing_shake256(expanded, 96, buffer + 7, XWING_SEED_SIZE);
+	ecc_priv_flash(buffer, false); // store seed via existing ECC slot infrastructure
+
+	// ML-KEM-768 deterministic keygen from expanded[0:64]
+	uint8_t *sk_M = ctap_buffer;
+	uint8_t *pk_M = ctap_buffer + MLKEM_SK_SIZE;
+	int rc = crypto_kem_keypair_derand(pk_M, sk_M, expanded);
+	// X25519: pk_X = X25519(expanded[64:96], BASE); skipped if ML-KEM failed
+	uint8_t pk_X[32] = {0};
+	if (rc == 0) crypto_scalarmult_base(pk_X, expanded + 64);
+	// expanded holds secret key material and is a dying local: wipe it through
+	// a volatile pointer, since a plain memset here may be elided
+	for (volatile uint8_t *p = expanded; p != expanded + sizeof(expanded); ++p) *p = 0;
+	if (rc != 0) {
+		hidprint("Error X-Wing ML-KEM keygen failed");
+		fadeoff(0);
+		memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+		return;
+	}
+
+	// Build PK: pk_M(1184) || pk_X(32)
+	memcpy(pk_M + MLKEM_PK_SIZE, pk_X, 32);
+	#ifdef DEBUG
+	Serial.println("X-Wing keypair generated");
+	Serial.print("PK_M first 16: ");
+	byteprint(pk_M, 16);
+	Serial.print("PK_X: ");
+	byteprint(pk_X, 32);
+	#endif
+	pending_operation=CTAP2_ERR_DATA_READY;
+	send_transport_response(pk_M, XWING_PK_SIZE, true, true);
+	memset(pk_X, 0, 32);
+	memset(ctap_buffer, 0, MLKEM_SK_SIZE + XWING_PK_SIZE);
+	fadeoff(85);
+}
+
+// X-Wing decapsulation: turn a received ciphertext into the 32-byte shared
+// secret and send it back over the transport.
+//
+// buffer: incoming packet; only used (via process_packets) while CRYPTO_AUTH
+//         is still zero.
+//
+// Flow by CRYPTO_AUTH value:
+//   0     - packet is queued with process_packets() and pending_operation is
+//           set to OKDECRYPT_ERR_USER_ACTION_PENDING.
+//   4     - the ciphertext in large_buffer is processed (see below).
+//   other - nothing happens apart from a DEBUG message.
+//
+// The seed is read from ecc_private_key; per the original note it is loaded
+// there by okcore_flashget_ECC in the dispatch code before this call.
+// Every error path and the success path zero the derived secrets,
+// ctap_buffer and large_buffer before returning.
+void okcrypto_xwing_decaps (uint8_t *buffer) {
+	extern uint8_t ctap_buffer[CTAPHID_BUFFER_SIZE];
+	#ifdef DEBUG
+	Serial.println();
+	Serial.println("XWING DECAPS MESSAGE RECEIVED");
+	#endif
+
+	if (!CRYPTO_AUTH) {
+		process_packets(buffer, 0, 0);
+		pending_operation=OKDECRYPT_ERR_USER_ACTION_PENDING;
+		return;
+	}
+	if (CRYPTO_AUTH != 4) {
+		#ifdef DEBUG
+		Serial.println("Waiting for challenge buttons to be pressed");
+		#endif
+		return;
+	}
+
+	okcore_aes_gcm_decrypt(large_buffer, packet_buffer_details[0], packet_buffer_details[1], profilekey, large_buffer_offset);
+	if (large_buffer_offset != XWING_CT_SIZE) {
+		hidprint("Error X-Wing CT wrong size");
+		fadeoff(0);
+		memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+		return;
+	}
+
+	// Ciphertext layout: ct_M (MLKEM_CT_SIZE bytes) || ct_X
+	uint8_t * const ct_mlkem = large_buffer;
+	uint8_t * const ct_x25519 = large_buffer + MLKEM_CT_SIZE;
+
+	// SHAKE256(seed) -> 96 bytes: [0:64] ML-KEM coins, [64:96] X25519 scalar
+	uint8_t seed_material[96];
+	xwing_shake256(seed_material, sizeof(seed_material), ecc_private_key, XWING_SEED_SIZE);
+
+	// Rebuild the ML-KEM key pair in ctap_buffer: [ sk_M | pk_M ]
+	uint8_t * const sk_mlkem = ctap_buffer;
+	uint8_t * const pk_mlkem = ctap_buffer + MLKEM_SK_SIZE;
+	if (crypto_kem_keypair_derand(pk_mlkem, sk_mlkem, seed_material) != 0) {
+		hidprint("Error X-Wing key expansion failed");
+		memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+		memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+		memset(seed_material, 0, sizeof(seed_material));
+		fadeoff(0);
+		return;
+	}
+
+	// X25519 key pair from the tail of the expanded seed
+	uint8_t * const sk_x25519 = seed_material + 64;
+	uint8_t pk_x25519[32];
+	crypto_scalarmult_base(pk_x25519, sk_x25519);
+
+	// ML-KEM-768 half of the shared secret
+	uint8_t ss_mlkem[32];
+	if (crypto_kem_dec(ss_mlkem, ct_mlkem, sk_mlkem) != 0) {
+		hidprint("Error X-Wing ML-KEM decaps failed");
+		memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+		memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+		memset(seed_material, 0, sizeof(seed_material));
+		memset(ss_mlkem, 0, sizeof(ss_mlkem));
+		fadeoff(0);
+		return;
+	}
+
+	// X25519 half: ss_X = X25519(sk_X, ct_X)
+	uint8_t ss_x25519[32];
+	crypto_scalarmult(ss_x25519, sk_x25519, ct_x25519);
+
+	// Combine both halves into the final X-Wing secret
+	uint8_t ss_xwing[XWING_SS_SIZE];
+	xwing_combiner(ss_xwing, ss_mlkem, ss_x25519, ct_x25519, pk_x25519);
+
+	#ifdef DEBUG
+	Serial.print("ss_M: "); byteprint(ss_mlkem, 32);
+	Serial.print("ss_X: "); byteprint(ss_x25519, 32);
+	Serial.print("X-Wing SS: "); byteprint(ss_xwing, XWING_SS_SIZE);
+	#endif
+
+	pending_operation=CTAP2_ERR_DATA_READY;
+	outputmode=packet_buffer_details[2];
+	send_transport_response(ss_xwing, XWING_SS_SIZE, true, true);
+	if (outputmode != WEBAUTHN) {
+		wipetasks();
+	}
+
+	// Scrub every secret and intermediate value.
+	memset(large_buffer, 0, LARGE_BUFFER_SIZE);
+	memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+	memset(pk_x25519, 0, sizeof(pk_x25519));
+	memset(seed_material, 0, sizeof(seed_material));
+	memset(ss_xwing, 0, XWING_SS_SIZE);
+	memset(ss_x25519, 0, sizeof(ss_x25519));
+	memset(ss_mlkem, 0, sizeof(ss_mlkem));
+	fadeoff(85);
+}
+
+// Re-derive the X-Wing public key from the stored seed and send it.
+//
+// buffer: unused here; kept so the signature matches the other handlers.
+//
+// The seed is read from ecc_private_key; per the original note it is loaded
+// there by okcore_flashget_ECC in the dispatch code before this call.
+// The reply is pk_M || pk_X (XWING_PK_SIZE bytes), assembled in ctap_buffer
+// behind the ML-KEM secret key. If the ML-KEM key derivation reports an
+// error, nothing is sent except an error message via hidprint(), so a
+// half-built buffer is never handed out as a public key.
+void okcrypto_xwing_getpubkey (uint8_t *buffer) {
+	extern uint8_t ctap_buffer[CTAPHID_BUFFER_SIZE];
+	#ifdef DEBUG
+	Serial.println();
+	Serial.println("XWING GETPUBKEY MESSAGE RECEIVED");
+	#endif
+
+	// SHAKE256(seed) -> 96 bytes: [0:64] ML-KEM coins, [64:96] X25519 scalar
+	uint8_t expanded[96];
+	xwing_shake256(expanded, 96, ecc_private_key, XWING_SEED_SIZE);
+
+	uint8_t *sk_M = ctap_buffer;
+	uint8_t *pk_M = ctap_buffer + MLKEM_SK_SIZE;
+	// Same check as in okcrypto_xwing_keygen / okcrypto_xwing_decaps:
+	// do not continue with an invalid key pair.
+	if (crypto_kem_keypair_derand(pk_M, sk_M, expanded) != 0) {
+		hidprint("Error X-Wing key expansion failed");
+		memset(expanded, 0, 96);
+		memset(ctap_buffer, 0, MLKEM_SK_SIZE + MLKEM_PK_SIZE);
+		return;
+	}
+
+	// X25519 public key from expanded[64:96], appended after pk_M
+	uint8_t pk_X[32];
+	crypto_scalarmult_base(pk_X, expanded + 64);
+
+	memcpy(pk_M + MLKEM_PK_SIZE, pk_X, 32);
+
+	send_transport_response(pk_M, XWING_PK_SIZE, true, true);
+
+	// Scrub the expanded seed and the secret key left in ctap_buffer.
+	memset(expanded, 0, 96);
+	memset(pk_X, 0, 32);
+	memset(ctap_buffer, 0, MLKEM_SK_SIZE + XWING_PK_SIZE);
+}
+
 void okcrypto_compute_pubkey() {
 	memset(ecc_public_key, 0, sizeof(ecc_public_key));
 
+	// PQ key types store seeds, not traditional ECC keys — pubkey is
+	// derived on demand via SHAKE256 expansion, not from ecc_private_key
+	if (type == KEYTYPE_MLKEM768 || type == KEYTYPE_XWING) return;
+
 	if (type == KEYTYPE_ED25519) {
 		Ed25519::derivePublicKey(ecc_public_key, ecc_private_key);
 	}
diff --git a/onlykey/okcrypto.h b/onlykey/okcrypto.h
index 44dc14f..a044470 100644
--- a/onlykey/okcrypto.h
+++ b/onlykey/okcrypto.h
@@ -134,6 +134,13 @@ extern void okcrypto_aes_gcm_decrypt2 (uint8_t * state, uint8_t * iv1, const uin
 extern void okcrypto_aes_cbc_encrypt (uint8_t * state, uint8_t * iv, const uint8_t * key, int len);
 extern void okcrypto_aes_cbc_decrypt (uint8_t * state, uint8_t * iv, const uint8_t * key, int len);
 
+extern void okcrypto_mlkem_keygen (uint8_t *buffer);
+extern void okcrypto_mlkem_decaps (uint8_t *buffer);
+extern void okcrypto_mlkem_getpubkey (uint8_t *buffer);
+extern void okcrypto_xwing_keygen (uint8_t *buffer);
+extern void okcrypto_xwing_decaps (uint8_t *buffer);
+extern void okcrypto_xwing_getpubkey (uint8_t *buffer);
+
 
 #ifdef __cplusplus
 }
diff --git a/readme.md b/readme.md
index efee056..7b81569 100644
--- a/readme.md
+++ b/readme.md
@@ -18,6 +18,7 @@ The following cryptographic software is included in this distribution:
                       "MICRO-ECC PROJECT" - https://github.com/kmackay/micro-ecc
                       "ARDUINOLIBS PROJECT" - https://rweather.github.io/arduinolibs/crypto.html
                       "YUBICO-C PROJECT" - https://github.com/Yubico/yubico-c
+                      "MLKEM-NATIVE PROJECT" - https://github.com/pq-code-package/mlkem-native
 
 For more information on export restrictions see: http://www.apache.org/licenses/exports/