From 28d293299216011d42caf1867160804d8bae9b43 Mon Sep 17 00:00:00 2001
From: Basil Hess <bhe@zurich.ibm.com>
Date: Mon, 9 Dec 2024 17:46:47 +0100
Subject: [PATCH] Replace pq-crystals ML-KEM with pqcp mlkem-native C (ref) implementation

Signed-off-by: Basil Hess <bhe@zurich.ibm.com>
---
 .CMake/alg_support.cmake                      |  18 -
 docs/algorithms/kem/ml_kem.md                 |   5 +-
 docs/algorithms/kem/ml_kem.yml                |  51 +-
 docs/cbom.json                                |  95 +--
 .../copy_from_upstream/copy_from_upstream.yml |   9 +-
 src/kem/ml_kem/CMakeLists.txt                 |  51 +-
 src/kem/ml_kem/kem_ml_kem_1024.c              |  54 +-
 src/kem/ml_kem/kem_ml_kem_512.c               |  54 +-
 src/kem/ml_kem/kem_ml_kem_768.c               |  54 +-
 .../LICENSE                                   |   0
 .../ml_kem/mlkem-native_ml-kem-1024_ref/api.h |  62 ++
 .../arith_native.h                            | 250 ++++++
 .../ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c | 143 ++++
 .../ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h |  50 ++
 .../mlkem-native_ml-kem-1024_ref/cbmc.h       | 143 ++++
 .../mlkem-native_ml-kem-1024_ref/common.h     |  65 ++
 .../mlkem-native_ml-kem-1024_ref/config.h     |  22 +
 .../mlkem-native_ml-kem-1024_ref/cpucap.h     |  43 +
 .../debug/debug.c                             |  58 ++
 .../debug/debug.h                             | 219 +++++
 .../mlkem-native_ml-kem-1024_ref/indcpa.c     | 577 +++++++++++++
 .../mlkem-native_ml-kem-1024_ref/indcpa.h     |  75 ++
 .../ml_kem/mlkem-native_ml-kem-1024_ref/kem.c | 182 +++++
 .../ml_kem/mlkem-native_ml-kem-1024_ref/kem.h | 141 ++++
 .../ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c | 278 +++++++
 .../ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h | 102 +++
 .../mlkem-native_ml-kem-1024_ref/params.h     |  73 ++
 .../mlkem-native_ml-kem-1024_ref/poly.c       | 568 +++++++++++++
 .../mlkem-native_ml-kem-1024_ref/poly.h       | 773 ++++++++++++++++++
 .../mlkem-native_ml-kem-1024_ref/polyvec.c    | 209 +++++
 .../mlkem-native_ml-kem-1024_ref/polyvec.h    | 304 +++++++
 .../mlkem-native_ml-kem-1024_ref/reduce.c     | 154 ++++
 .../mlkem-native_ml-kem-1024_ref/reduce.h     |  64 ++
 .../rej_uniform.c                             |  90 ++
 .../rej_uniform.h                             |  61 ++
 .../symmetric-shake.c                         |  35 +
 .../mlkem-native_ml-kem-1024_ref/symmetric.h  |  82 ++
 .../mlkem-native_ml-kem-1024_ref/verify.c     |  19 +
 .../mlkem-native_ml-kem-1024_ref/verify.h     | 305 +++++++
 .../mlkem-native_ml-kem-1024_ref/zetas.c      |  30 +
 .../LICENSE                                   |   0
 .../ml_kem/mlkem-native_ml-kem-512_ref/api.h  |  62 ++
 .../arith_native.h                            | 250 ++++++
 .../ml_kem/mlkem-native_ml-kem-512_ref/cbd.c  | 143 ++++
 .../ml_kem/mlkem-native_ml-kem-512_ref/cbd.h  |  50 ++
 .../ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h | 143 ++++
 .../mlkem-native_ml-kem-512_ref/common.h      |  65 ++
 .../mlkem-native_ml-kem-512_ref/config.h      |  22 +
 .../mlkem-native_ml-kem-512_ref/cpucap.h      |  43 +
 .../mlkem-native_ml-kem-512_ref/debug/debug.c |  58 ++
 .../mlkem-native_ml-kem-512_ref/debug/debug.h | 219 +++++
 .../mlkem-native_ml-kem-512_ref/indcpa.c      | 577 +++++++++++++
 .../mlkem-native_ml-kem-512_ref/indcpa.h      |  75 ++
 .../ml_kem/mlkem-native_ml-kem-512_ref/kem.c  | 182 +++++
 .../ml_kem/mlkem-native_ml-kem-512_ref/kem.h  | 141 ++++
 .../ml_kem/mlkem-native_ml-kem-512_ref/ntt.c  | 278 +++++++
 .../ml_kem/mlkem-native_ml-kem-512_ref/ntt.h  | 102 +++
 .../mlkem-native_ml-kem-512_ref/params.h      |  72 ++
 .../ml_kem/mlkem-native_ml-kem-512_ref/poly.c | 568 +++++++++++++
 .../ml_kem/mlkem-native_ml-kem-512_ref/poly.h | 773 ++++++++++++++++++
 .../mlkem-native_ml-kem-512_ref/polyvec.c     | 209 +++++
 .../mlkem-native_ml-kem-512_ref/polyvec.h     | 304 +++++++
 .../mlkem-native_ml-kem-512_ref/reduce.c      | 154 ++++
 .../mlkem-native_ml-kem-512_ref/reduce.h      |  64 ++
 .../mlkem-native_ml-kem-512_ref/rej_uniform.c |  90 ++
 .../mlkem-native_ml-kem-512_ref/rej_uniform.h |  61 ++
 .../symmetric-shake.c                         |  35 +
 .../mlkem-native_ml-kem-512_ref/symmetric.h   |  82 ++
 .../mlkem-native_ml-kem-512_ref/verify.c      |  19 +
 .../mlkem-native_ml-kem-512_ref/verify.h      | 305 +++++++
 .../mlkem-native_ml-kem-512_ref/zetas.c       |  30 +
 .../LICENSE                                   |   0
 .../ml_kem/mlkem-native_ml-kem-768_ref/api.h  |  62 ++
 .../arith_native.h                            | 250 ++++++
 .../ml_kem/mlkem-native_ml-kem-768_ref/cbd.c  | 143 ++++
 .../ml_kem/mlkem-native_ml-kem-768_ref/cbd.h  |  50 ++
 .../ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h | 143 ++++
 .../mlkem-native_ml-kem-768_ref/common.h      |  65 ++
 .../mlkem-native_ml-kem-768_ref/config.h      |  22 +
 .../mlkem-native_ml-kem-768_ref/cpucap.h      |  43 +
 .../mlkem-native_ml-kem-768_ref/debug/debug.c |  58 ++
 .../mlkem-native_ml-kem-768_ref/debug/debug.h | 219 +++++
 .../mlkem-native_ml-kem-768_ref/indcpa.c      | 578 +++++++++++++
 .../mlkem-native_ml-kem-768_ref/indcpa.h      |  75 ++
 .../ml_kem/mlkem-native_ml-kem-768_ref/kem.c  | 182 +++++
 .../ml_kem/mlkem-native_ml-kem-768_ref/kem.h  | 141 ++++
 .../ml_kem/mlkem-native_ml-kem-768_ref/ntt.c  | 278 +++++++
 .../ml_kem/mlkem-native_ml-kem-768_ref/ntt.h  | 102 +++
 .../mlkem-native_ml-kem-768_ref/params.h      |  72 ++
 .../ml_kem/mlkem-native_ml-kem-768_ref/poly.c | 568 +++++++++++++
 .../ml_kem/mlkem-native_ml-kem-768_ref/poly.h | 773 ++++++++++++++++++
 .../mlkem-native_ml-kem-768_ref/polyvec.c     | 209 +++++
 .../mlkem-native_ml-kem-768_ref/polyvec.h     | 304 +++++++
 .../mlkem-native_ml-kem-768_ref/reduce.c      | 154 ++++
 .../mlkem-native_ml-kem-768_ref/reduce.h      |  64 ++
 .../mlkem-native_ml-kem-768_ref/rej_uniform.c |  90 ++
 .../mlkem-native_ml-kem-768_ref/rej_uniform.h |  61 ++
 .../symmetric-shake.c                         |  35 +
 .../mlkem-native_ml-kem-768_ref/symmetric.h   |  82 ++
 .../mlkem-native_ml-kem-768_ref/verify.c      |  19 +
 .../mlkem-native_ml-kem-768_ref/verify.h      | 305 +++++++
 .../mlkem-native_ml-kem-768_ref/zetas.c       |  30 +
 .../align.h                                   |  19 -
 .../api.h                                     |  66 --
 .../basemul.S                                 | 105 ---
 .../cbd.c                                     | 144 ----
 .../cbd.h                                     |  15 -
 .../consts.c                                  | 121 ---
 .../consts.h                                  |  43 -
 .../fq.S                                      |  88 --
 .../fq.inc                                    |  30 -
 .../indcpa.c                                  | 568 -------------
 .../indcpa.h                                  |  27 -
 .../invntt.S                                  | 193 -----
 .../kem.c                                     | 169 ----
 .../kem.h                                     |  35 -
 .../ntt.S                                     | 189 -----
 .../ntt.h                                     |  28 -
 .../params.h                                  |  68 --
 .../poly.c                                    | 519 ------------
 .../poly.h                                    |  77 --
 .../polyvec.c                                 | 307 -------
 .../polyvec.h                                 |  36 -
 .../reduce.h                                  |  12 -
 .../rejsample.c                               | 398 ---------
 .../rejsample.h                               |  14 -
 .../shuffle.S                                 | 255 ------
 .../shuffle.inc                               |  25 -
 .../symmetric-shake.c                         |  74 --
 .../symmetric.h                               |  34 -
 .../verify.c                                  |  83 --
 .../verify.h                                  |  17 -
 .../api.h                                     |  66 --
 .../cbd.c                                     | 128 ---
 .../cbd.h                                     |  14 -
 .../indcpa.c                                  | 334 --------
 .../indcpa.h                                  |  27 -
 .../kem.c                                     | 169 ----
 .../kem.h                                     |  35 -
 .../ntt.c                                     | 146 ----
 .../ntt.h                                     |  19 -
 .../params.h                                  |  55 --
 .../poly.c                                    | 360 --------
 .../poly.h                                    |  53 --
 .../polyvec.c                                 | 246 ------
 .../polyvec.h                                 |  36 -
 .../reduce.c                                  |  42 -
 .../reduce.h                                  |  16 -
 .../symmetric-shake.c                         |  74 --
 .../symmetric.h                               |  35 -
 .../verify.c                                  |  75 --
 .../verify.h                                  |  17 -
 .../align.h                                   |  19 -
 .../api.h                                     |  66 --
 .../basemul.S                                 | 105 ---
 .../cbd.c                                     | 144 ----
 .../cbd.h                                     |  15 -
 .../consts.c                                  | 121 ---
 .../consts.h                                  |  43 -
 .../fq.S                                      |  88 --
 .../fq.inc                                    |  30 -
 .../indcpa.c                                  | 568 -------------
 .../indcpa.h                                  |  27 -
 .../invntt.S                                  | 193 -----
 .../kem.c                                     | 169 ----
 .../kem.h                                     |  35 -
 .../ntt.S                                     | 189 -----
 .../ntt.h                                     |  28 -
 .../params.h                                  |  68 --
 .../poly.c                                    | 519 ------------
 .../poly.h                                    |  77 --
 .../polyvec.c                                 | 307 -------
 .../polyvec.h                                 |  36 -
 .../reduce.h                                  |  12 -
 .../rejsample.c                               | 398 ---------
 .../rejsample.h                               |  14 -
 .../shuffle.S                                 | 255 ------
 .../shuffle.inc                               |  25 -
 .../symmetric-shake.c                         |  74 --
 .../symmetric.h                               |  34 -
 .../verify.c                                  |  83 --
 .../verify.h                                  |  17 -
 .../LICENSE                                   |   6 -
 .../api.h                                     |  66 --
 .../cbd.c                                     | 128 ---
 .../cbd.h                                     |  14 -
 .../indcpa.c                                  | 334 --------
 .../indcpa.h                                  |  27 -
 .../kem.c                                     | 169 ----
 .../kem.h                                     |  35 -
 .../ntt.c                                     | 146 ----
 .../ntt.h                                     |  19 -
 .../params.h                                  |  55 --
 .../poly.c                                    | 360 --------
 .../poly.h                                    |  53 --
 .../polyvec.c                                 | 246 ------
 .../polyvec.h                                 |  36 -
 .../reduce.c                                  |  42 -
 .../reduce.h                                  |  16 -
 .../symmetric-shake.c                         |  74 --
 .../symmetric.h                               |  35 -
 .../verify.c                                  |  75 --
 .../verify.h                                  |  17 -
 .../LICENSE                                   |   6 -
 .../align.h                                   |  19 -
 .../api.h                                     |  66 --
 .../basemul.S                                 | 105 ---
 .../cbd.c                                     | 144 ----
 .../cbd.h                                     |  15 -
 .../consts.c                                  | 121 ---
 .../consts.h                                  |  43 -
 .../fq.S                                      |  88 --
 .../fq.inc                                    |  30 -
 .../indcpa.c                                  | 568 -------------
 .../indcpa.h                                  |  27 -
 .../invntt.S                                  | 193 -----
 .../kem.c                                     | 169 ----
 .../kem.h                                     |  35 -
 .../ntt.S                                     | 189 -----
 .../ntt.h                                     |  28 -
 .../params.h                                  |  68 --
 .../poly.c                                    | 519 ------------
 .../poly.h                                    |  77 --
 .../polyvec.c                                 | 307 -------
 .../polyvec.h                                 |  36 -
 .../reduce.h                                  |  12 -
 .../rejsample.c                               | 398 ---------
 .../rejsample.h                               |  14 -
 .../shuffle.S                                 | 255 ------
 .../shuffle.inc                               |  25 -
 .../symmetric-shake.c                         |  74 --
 .../symmetric.h                               |  34 -
 .../verify.c                                  |  83 --
 .../verify.h                                  |  17 -
 .../LICENSE                                   |   6 -
 .../api.h                                     |  66 --
 .../cbd.c                                     | 128 ---
 .../cbd.h                                     |  14 -
 .../indcpa.c                                  | 334 --------
 .../indcpa.h                                  |  27 -
 .../kem.c                                     | 169 ----
 .../kem.h                                     |  35 -
 .../ntt.c                                     | 146 ----
 .../ntt.h                                     |  19 -
 .../params.h                                  |  55 --
 .../poly.c                                    | 360 --------
 .../poly.h                                    |  53 --
 .../polyvec.c                                 | 246 ------
 .../polyvec.h                                 |  36 -
 .../reduce.c                                  |  42 -
 .../reduce.h                                  |  16 -
 .../symmetric-shake.c                         |  74 --
 .../symmetric.h                               |  35 -
 .../verify.c                                  |  75 --
 .../verify.h                                  |  17 -
 src/oqsconfig.h.cmake                         |   3 -
 tests/test_binary.py                          |   2 +-
 257 files changed, 15578 insertions(+), 17484 deletions(-)
 rename src/kem/ml_kem/{pqcrystals-kyber-standard_ml-kem-1024_avx2 => mlkem-native_ml-kem-1024_ref}/LICENSE (100%)
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_native.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cpucap.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric-shake.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
 rename src/kem/ml_kem/{pqcrystals-kyber-standard_ml-kem-1024_ref => mlkem-native_ml-kem-512_ref}/LICENSE (100%)
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_native.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cpucap.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric-shake.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
 rename src/kem/ml_kem/{pqcrystals-kyber-standard_ml-kem-512_avx2 => mlkem-native_ml-kem-768_ref}/LICENSE (100%)
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_native.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cpucap.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric-shake.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/align.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/api.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/basemul.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.inc
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/invntt.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/params.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/reduce.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.inc
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric-shake.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/api.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/params.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/align.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/api.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/basemul.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.inc
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/invntt.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/params.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/reduce.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.inc
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric-shake.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/LICENSE
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/api.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/params.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric-shake.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/LICENSE
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/align.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/api.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/basemul.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.inc
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/invntt.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/params.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/reduce.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.S
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.inc
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric-shake.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/LICENSE
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/api.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/params.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric.h
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.c
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.h

diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 9afa6e4b15..114ad69a5d 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -332,24 +332,6 @@ endif()
 endif()
 
 
-if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
-if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
-    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF)
-endif()
-endif()
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
-if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
-    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF)
-endif()
-endif()
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
-if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
-    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF)
-endif()
-endif()
-
 
 if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
 if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index d1806517ba..4fa0971fa9 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -7,7 +7,7 @@
 - **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
 - **Specification version**: ML-KEM.
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
+  - **Source**: https://github.com/bhess/mlkem-native/commit/2b650d6676bf6a3a82ab7e9ecd96acd397ca71cd
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
 
 
@@ -24,7 +24,6 @@
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?‡   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -35,7 +34,6 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -44,7 +42,6 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index 81ef2b6c4a..34561915cb 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -17,8 +17,7 @@ website: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
 nist-round: FIPS203
 spec-version: ML-KEM
 primary-upstream:
-  source: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd
-    with copy_from_upstream patches
+  source: https://github.com/bhess/mlkem-native/commit/2b650d6676bf6a3a82ab7e9ecd96acd397ca71cd
   spdx-license-identifier: CC0-1.0 or Apache-2.0
 parameter-sets:
 - name: ML-KEM-512
@@ -38,22 +37,6 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
-  - upstream: primary-upstream
-    upstream-id: avx2
-    supported-platforms:
-    - architecture: x86_64
-      operating_systems:
-      - Linux
-      - Darwin
-      required_flags:
-      - avx2
-      - bmi2
-      - popcnt
-    common-crypto:
-    - SHA3: liboqs
-    no-secret-dependent-branching-claimed: true
-    no-secret-dependent-branching-checked-by-valgrind: true
-    large-stack-usage: false
 - name: ML-KEM-768
   claimed-nist-level: 3
   claimed-security: IND-CCA2
@@ -71,22 +54,6 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
-  - upstream: primary-upstream
-    upstream-id: avx2
-    supported-platforms:
-    - architecture: x86_64
-      operating_systems:
-      - Linux
-      - Darwin
-      required_flags:
-      - avx2
-      - bmi2
-      - popcnt
-    common-crypto:
-    - SHA3: liboqs
-    no-secret-dependent-branching-claimed: true
-    no-secret-dependent-branching-checked-by-valgrind: true
-    large-stack-usage: false
 - name: ML-KEM-1024
   claimed-nist-level: 5
   claimed-security: IND-CCA2
@@ -104,19 +71,3 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
-  - upstream: primary-upstream
-    upstream-id: avx2
-    supported-platforms:
-    - architecture: x86_64
-      operating_systems:
-      - Linux
-      - Darwin
-      required_flags:
-      - avx2
-      - bmi2
-      - popcnt
-    common-crypto:
-    - SHA3: liboqs
-    no-secret-dependent-branching-claimed: true
-    no-secret-dependent-branching-checked-by-valgrind: true
-    large-stack-usage: false
diff --git a/docs/cbom.json b/docs/cbom.json
index 52cf0a0a59..a9361e3756 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -2,23 +2,23 @@
   "$schema": "https://raw.githubusercontent.com/CycloneDX/specification/1.6/schema/bom-1.6.schema.json",
   "bomFormat": "CycloneDX",
   "specVersion": "1.6",
-  "serialNumber": "urn:uuid:de1355bb-9681-4a7e-8aa9-0ccc414ebe3b",
+  "serialNumber": "urn:uuid:d66add05-17dd-4986-8894-ed47d1e910b6",
   "version": 1,
   "metadata": {
-    "timestamp": "2024-11-05T12:25:53.012740+00:00",
+    "timestamp": "2024-12-09T14:24:28.343759+00:00",
     "component": {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@69a80f8a66988521d51e94d716cff8c936c07b8d",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb",
       "name": "liboqs",
-      "version": "69a80f8a66988521d51e94d716cff8c936c07b8d"
+      "version": "d0d0413dc9fff538296ab86bac492cb4bf54dedb"
     }
   },
   "components": [
     {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@69a80f8a66988521d51e94d716cff8c936c07b8d",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb",
       "name": "liboqs",
-      "version": "69a80f8a66988521d51e94d716cff8c936c07b8d"
+      "version": "d0d0413dc9fff538296ab86bac492cb4bf54dedb"
     },
     {
       "type": "cryptographic-asset",
@@ -1060,26 +1060,6 @@
         }
       }
     },
-    {
-      "type": "cryptographic-asset",
-      "bom-ref": "alg:ML-KEM-512:x86_64",
-      "name": "ML-KEM",
-      "cryptoProperties": {
-        "assetType": "algorithm",
-        "algorithmProperties": {
-          "parameterSetIdentifier": "ML-KEM-512",
-          "primitive": "kem",
-          "executionEnvironment": "software-plain-ram",
-          "cryptoFunctions": [
-            "keygen",
-            "encapsulate",
-            "decapsulate"
-          ],
-          "nistQuantumSecurityLevel": 1,
-          "implementationPlatform": "x86_64"
-        }
-      }
-    },
     {
       "type": "cryptographic-asset",
       "bom-ref": "alg:ML-KEM-768:generic",
@@ -1100,26 +1080,6 @@
         }
       }
     },
-    {
-      "type": "cryptographic-asset",
-      "bom-ref": "alg:ML-KEM-768:x86_64",
-      "name": "ML-KEM",
-      "cryptoProperties": {
-        "assetType": "algorithm",
-        "algorithmProperties": {
-          "parameterSetIdentifier": "ML-KEM-768",
-          "primitive": "kem",
-          "executionEnvironment": "software-plain-ram",
-          "cryptoFunctions": [
-            "keygen",
-            "encapsulate",
-            "decapsulate"
-          ],
-          "nistQuantumSecurityLevel": 3,
-          "implementationPlatform": "x86_64"
-        }
-      }
-    },
     {
       "type": "cryptographic-asset",
       "bom-ref": "alg:ML-KEM-1024:generic",
@@ -1140,26 +1100,6 @@
         }
       }
     },
-    {
-      "type": "cryptographic-asset",
-      "bom-ref": "alg:ML-KEM-1024:x86_64",
-      "name": "ML-KEM",
-      "cryptoProperties": {
-        "assetType": "algorithm",
-        "algorithmProperties": {
-          "parameterSetIdentifier": "ML-KEM-1024",
-          "primitive": "kem",
-          "executionEnvironment": "software-plain-ram",
-          "cryptoFunctions": [
-            "keygen",
-            "encapsulate",
-            "decapsulate"
-          ],
-          "nistQuantumSecurityLevel": 5,
-          "implementationPlatform": "x86_64"
-        }
-      }
-    },
     {
       "type": "cryptographic-asset",
       "bom-ref": "alg:sntrup761:generic",
@@ -3127,7 +3067,7 @@
   ],
   "dependencies": [
     {
-      "ref": "pkg:github/open-quantum-safe/liboqs@69a80f8a66988521d51e94d716cff8c936c07b8d",
+      "ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb",
       "provides": [
         "alg:BIKE-L1:x86_64",
         "alg:BIKE-L3:x86_64",
@@ -3181,11 +3121,8 @@
         "alg:Kyber1024:x86_64",
         "alg:Kyber1024:armv8-a",
         "alg:ML-KEM-512:generic",
-        "alg:ML-KEM-512:x86_64",
         "alg:ML-KEM-768:generic",
-        "alg:ML-KEM-768:x86_64",
         "alg:ML-KEM-1024:generic",
-        "alg:ML-KEM-1024:x86_64",
         "alg:sntrup761:generic",
         "alg:sntrup761:x86_64",
         "alg:cross-rsdp-128-balanced:generic",
@@ -3605,36 +3542,18 @@
         "alg:sha3"
       ]
     },
-    {
-      "ref": "alg:ML-KEM-512:x86_64",
-      "dependsOn": [
-        "alg:sha3"
-      ]
-    },
     {
       "ref": "alg:ML-KEM-768:generic",
       "dependsOn": [
         "alg:sha3"
       ]
     },
-    {
-      "ref": "alg:ML-KEM-768:x86_64",
-      "dependsOn": [
-        "alg:sha3"
-      ]
-    },
     {
       "ref": "alg:ML-KEM-1024:generic",
       "dependsOn": [
         "alg:sha3"
       ]
     },
-    {
-      "ref": "alg:ML-KEM-1024:x86_64",
-      "dependsOn": [
-        "alg:sha3"
-      ]
-    },
     {
       "ref": "alg:sntrup761:generic",
       "dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index ff23a2287d..ea14e4022f 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -38,6 +38,13 @@ upstreams:
     kem_meta_path: '{pretty_name_full}_META.yml'
     kem_scheme_path: '.'
     patches: [pqcrystals-ml_kem.patch]
+  -
+    name: mlkem-native
+    git_url: https://github.com/bhess/mlkem-native.git
+    git_branch: updates
+    git_commit: 14141720b0149cad6c2f91b037e3e6a15882840c
+    kem_meta_path: '{pretty_name_full}_META.yml'
+    kem_scheme_path: '.'
   -
     name: pqcrystals-dilithium
     git_url: https://github.com/pq-crystals/dilithium.git
@@ -166,7 +173,7 @@ kems:
   -
     name: ml_kem
     default_implementation: ref
-    upstream_location: pqcrystals-kyber-standard
+    upstream_location: mlkem-native
     schemes:
       -
         scheme: "512"
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index 14cc9b850d..f9cf7706aa 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -6,57 +6,30 @@
 set(_ML_KEM_OBJS "")
 
 if(OQS_ENABLE_KEM_ml_kem_512)
-    add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c pqcrystals-kyber-standard_ml-kem-512_ref/cbd.c pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-512_ref/kem.c pqcrystals-kyber-standard_ml-kem-512_ref/ntt.c pqcrystals-kyber-standard_ml-kem-512_ref/poly.c pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-512_ref/reduce.c pqcrystals-kyber-standard_ml-kem-512_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-512_ref/verify.c)
-    target_compile_options(ml_kem_512_ref PUBLIC -DKYBER_K=2)
-    target_include_directories(ml_kem_512_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqcrystals-kyber-standard_ml-kem-512_ref)
+    add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/cbd.c mlkem-native_ml-kem-512_ref/debug/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/ntt.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/polyvec.c mlkem-native_ml-kem-512_ref/reduce.c mlkem-native_ml-kem-512_ref/rej_uniform.c mlkem-native_ml-kem-512_ref/symmetric-shake.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c)
+    target_compile_options(ml_kem_512_ref PUBLIC -DMLKEM_K=2)
+    target_include_directories(ml_kem_512_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_ref)
     target_include_directories(ml_kem_512_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_512_ref PUBLIC -DKYBER_K=2)
+    target_compile_options(ml_kem_512_ref PUBLIC -DMLKEM_K=2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_ref>)
 endif()
 
-if(OQS_ENABLE_KEM_ml_kem_512_avx2)
-    add_library(ml_kem_512_avx2 OBJECT pqcrystals-kyber-standard_ml-kem-512_avx2/basemul.S pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.c pqcrystals-kyber-standard_ml-kem-512_avx2/consts.c pqcrystals-kyber-standard_ml-kem-512_avx2/fq.S pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.c pqcrystals-kyber-standard_ml-kem-512_avx2/invntt.S pqcrystals-kyber-standard_ml-kem-512_avx2/kem.c pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.S pqcrystals-kyber-standard_ml-kem-512_avx2/poly.c pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.c pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.c pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.S pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-512_avx2/verify.c)
-    target_include_directories(ml_kem_512_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqcrystals-kyber-standard_ml-kem-512_avx2)
-    target_include_directories(ml_kem_512_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_512_avx2 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
-    target_compile_options(ml_kem_512_avx2 PUBLIC -DKYBER_K=2)
-    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_avx2>)
-endif()
-
 if(OQS_ENABLE_KEM_ml_kem_768)
-    add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-768_ref/kem.c pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c pqcrystals-kyber-standard_ml-kem-768_ref/poly.c pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-768_ref/verify.c)
-    target_compile_options(ml_kem_768_ref PUBLIC -DKYBER_K=3)
-    target_include_directories(ml_kem_768_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqcrystals-kyber-standard_ml-kem-768_ref)
+    add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/cbd.c mlkem-native_ml-kem-768_ref/debug/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/ntt.c mlkem-native_ml-kem-768_ref/poly.c mlkem-native_ml-kem-768_ref/polyvec.c mlkem-native_ml-kem-768_ref/reduce.c mlkem-native_ml-kem-768_ref/rej_uniform.c mlkem-native_ml-kem-768_ref/symmetric-shake.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c)
+    target_compile_options(ml_kem_768_ref PUBLIC -DMLKEM_K=3)
+    target_include_directories(ml_kem_768_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_ref)
     target_include_directories(ml_kem_768_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_768_ref PUBLIC -DKYBER_K=3)
+    target_compile_options(ml_kem_768_ref PUBLIC -DMLKEM_K=3)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_ref>)
 endif()
 
-if(OQS_ENABLE_KEM_ml_kem_768_avx2)
-    add_library(ml_kem_768_avx2 OBJECT pqcrystals-kyber-standard_ml-kem-768_avx2/basemul.S pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.c pqcrystals-kyber-standard_ml-kem-768_avx2/consts.c pqcrystals-kyber-standard_ml-kem-768_avx2/fq.S pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.c pqcrystals-kyber-standard_ml-kem-768_avx2/invntt.S pqcrystals-kyber-standard_ml-kem-768_avx2/kem.c pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.S pqcrystals-kyber-standard_ml-kem-768_avx2/poly.c pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.c pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.c pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.S pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-768_avx2/verify.c)
-    target_include_directories(ml_kem_768_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqcrystals-kyber-standard_ml-kem-768_avx2)
-    target_include_directories(ml_kem_768_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_768_avx2 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
-    target_compile_options(ml_kem_768_avx2 PUBLIC -DKYBER_K=3)
-    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_avx2>)
-endif()
-
 if(OQS_ENABLE_KEM_ml_kem_1024)
-    add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c)
-    target_compile_options(ml_kem_1024_ref PUBLIC -DKYBER_K=4)
-    target_include_directories(ml_kem_1024_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqcrystals-kyber-standard_ml-kem-1024_ref)
+    add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/cbd.c mlkem-native_ml-kem-1024_ref/debug/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/ntt.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/polyvec.c mlkem-native_ml-kem-1024_ref/reduce.c mlkem-native_ml-kem-1024_ref/rej_uniform.c mlkem-native_ml-kem-1024_ref/symmetric-shake.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c)
+    target_compile_options(ml_kem_1024_ref PUBLIC -DMLKEM_K=4)
+    target_include_directories(ml_kem_1024_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_ref)
     target_include_directories(ml_kem_1024_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_1024_ref PUBLIC -DKYBER_K=4)
+    target_compile_options(ml_kem_1024_ref PUBLIC -DMLKEM_K=4)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_ref>)
 endif()
 
-if(OQS_ENABLE_KEM_ml_kem_1024_avx2)
-    add_library(ml_kem_1024_avx2 OBJECT pqcrystals-kyber-standard_ml-kem-1024_avx2/basemul.S pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.c pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.c pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.S pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.c pqcrystals-kyber-standard_ml-kem-1024_avx2/invntt.S pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.c pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.S pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.c pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.c pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.c pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.S pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.c)
-    target_include_directories(ml_kem_1024_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqcrystals-kyber-standard_ml-kem-1024_avx2)
-    target_include_directories(ml_kem_1024_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_1024_avx2 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
-    target_compile_options(ml_kem_1024_avx2 PUBLIC -DKYBER_K=4)
-    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_avx2>)
-endif()
-
 set(ML_KEM_OBJS ${_ML_KEM_OBJS} PARENT_SCOPE)
diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c
index bc533aef9e..c4babc3195 100644
--- a/src/kem/ml_kem/kem_ml_kem_1024.c
+++ b/src/kem/ml_kem/kem_ml_kem_1024.c
@@ -30,62 +30,20 @@ OQS_KEM *OQS_KEM_ml_kem_1024_new(void) {
 	return kem;
 }
 
-extern int pqcrystals_ml_kem_1024_ref_keypair(uint8_t *pk, uint8_t *sk);
-extern int pqcrystals_ml_kem_1024_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int pqcrystals_ml_kem_1024_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
-extern int pqcrystals_ml_kem_1024_avx2_keypair(uint8_t *pk, uint8_t *sk);
-extern int pqcrystals_ml_kem_1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int pqcrystals_ml_kem_1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-#endif
+extern int PQCP_MLKEM_NATIVE_MLKEM1024_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCP_MLKEM_NATIVE_MLKEM1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int PQCP_MLKEM_NATIVE_MLKEM1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_1024_avx2_keypair(public_key, secret_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_1024_ref_keypair(public_key, secret_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_1024_ref_keypair(public_key, secret_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM1024_keypair(public_key, secret_key);
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_1024_avx2_enc(ciphertext, shared_secret, public_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_1024_ref_enc(ciphertext, shared_secret, public_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_1024_ref_enc(ciphertext, shared_secret, public_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM1024_enc(ciphertext, shared_secret, public_key);
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_1024_avx2_dec(shared_secret, ciphertext, secret_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_1024_ref_dec(shared_secret, ciphertext, secret_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_1024_ref_dec(shared_secret, ciphertext, secret_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM1024_dec(shared_secret, ciphertext, secret_key);
 }
 
 #endif
diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c
index f2dcde53d2..8f52c2a3b9 100644
--- a/src/kem/ml_kem/kem_ml_kem_512.c
+++ b/src/kem/ml_kem/kem_ml_kem_512.c
@@ -30,62 +30,20 @@ OQS_KEM *OQS_KEM_ml_kem_512_new(void) {
 	return kem;
 }
 
-extern int pqcrystals_ml_kem_512_ref_keypair(uint8_t *pk, uint8_t *sk);
-extern int pqcrystals_ml_kem_512_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int pqcrystals_ml_kem_512_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
-extern int pqcrystals_ml_kem_512_avx2_keypair(uint8_t *pk, uint8_t *sk);
-extern int pqcrystals_ml_kem_512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int pqcrystals_ml_kem_512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-#endif
+extern int PQCP_MLKEM_NATIVE_MLKEM512_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCP_MLKEM_NATIVE_MLKEM512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int PQCP_MLKEM_NATIVE_MLKEM512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_512_avx2_keypair(public_key, secret_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_512_ref_keypair(public_key, secret_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_512_ref_keypair(public_key, secret_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM512_keypair(public_key, secret_key);
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_512_avx2_enc(ciphertext, shared_secret, public_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_512_ref_enc(ciphertext, shared_secret, public_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_512_ref_enc(ciphertext, shared_secret, public_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM512_enc(ciphertext, shared_secret, public_key);
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_512_avx2_dec(shared_secret, ciphertext, secret_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_512_ref_dec(shared_secret, ciphertext, secret_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_512_ref_dec(shared_secret, ciphertext, secret_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM512_dec(shared_secret, ciphertext, secret_key);
 }
 
 #endif
diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c
index 14eb6ba404..fd7de0a0ff 100644
--- a/src/kem/ml_kem/kem_ml_kem_768.c
+++ b/src/kem/ml_kem/kem_ml_kem_768.c
@@ -30,62 +30,20 @@ OQS_KEM *OQS_KEM_ml_kem_768_new(void) {
 	return kem;
 }
 
-extern int pqcrystals_ml_kem_768_ref_keypair(uint8_t *pk, uint8_t *sk);
-extern int pqcrystals_ml_kem_768_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int pqcrystals_ml_kem_768_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
-extern int pqcrystals_ml_kem_768_avx2_keypair(uint8_t *pk, uint8_t *sk);
-extern int pqcrystals_ml_kem_768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int pqcrystals_ml_kem_768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-#endif
+extern int PQCP_MLKEM_NATIVE_MLKEM768_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCP_MLKEM_NATIVE_MLKEM768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int PQCP_MLKEM_NATIVE_MLKEM768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_768_avx2_keypair(public_key, secret_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_768_ref_keypair(public_key, secret_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_768_ref_keypair(public_key, secret_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM768_keypair(public_key, secret_key);
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_768_avx2_enc(ciphertext, shared_secret, public_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_768_ref_enc(ciphertext, shared_secret, public_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_768_ref_enc(ciphertext, shared_secret, public_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM768_enc(ciphertext, shared_secret, public_key);
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
-#if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
-#if defined(OQS_DIST_BUILD)
-	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
-#endif /* OQS_DIST_BUILD */
-		return (OQS_STATUS) pqcrystals_ml_kem_768_avx2_dec(shared_secret, ciphertext, secret_key);
-#if defined(OQS_DIST_BUILD)
-	} else {
-		return (OQS_STATUS) pqcrystals_ml_kem_768_ref_dec(shared_secret, ciphertext, secret_key);
-	}
-#endif /* OQS_DIST_BUILD */
-#else
-	return (OQS_STATUS) pqcrystals_ml_kem_768_ref_dec(shared_secret, ciphertext, secret_key);
-#endif
+	return (OQS_STATUS) PQCP_MLKEM_NATIVE_MLKEM768_dec(shared_secret, ciphertext, secret_key);
 }
 
 #endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/LICENSE b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/LICENSE
similarity index 100%
rename from src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/LICENSE
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/LICENSE
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h
new file mode 100644
index 0000000000..94597323f1
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef API_H
+#define API_H
+
+#include <stdint.h>
+
+#define PQCP_MLKEM_NATIVE_MLKEM512_SECRETKEYBYTES 1632
+#define PQCP_MLKEM_NATIVE_MLKEM512_PUBLICKEYBYTES 800
+#define PQCP_MLKEM_NATIVE_MLKEM512_CIPHERTEXTBYTES 768
+#define PQCP_MLKEM_NATIVE_MLKEM512_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM512_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM512_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM512_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                              const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM512_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM512_enc_derand(uint8_t *ct, uint8_t *ss,
+                                          const uint8_t *pk,
+                                          const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM512_dec(uint8_t *ss, const uint8_t *ct,
+                                   const uint8_t *sk);
+
+#define PQCP_MLKEM_NATIVE_MLKEM768_SECRETKEYBYTES 2400
+#define PQCP_MLKEM_NATIVE_MLKEM768_PUBLICKEYBYTES 1184
+#define PQCP_MLKEM_NATIVE_MLKEM768_CIPHERTEXTBYTES 1088
+#define PQCP_MLKEM_NATIVE_MLKEM768_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM768_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM768_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM768_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                              const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM768_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM768_enc_derand(uint8_t *ct, uint8_t *ss,
+                                          const uint8_t *pk,
+                                          const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM768_dec(uint8_t *ss, const uint8_t *ct,
+                                   const uint8_t *sk);
+
+#define PQCP_MLKEM_NATIVE_MLKEM1024_SECRETKEYBYTES 3168
+#define PQCP_MLKEM_NATIVE_MLKEM1024_PUBLICKEYBYTES 1568
+#define PQCP_MLKEM_NATIVE_MLKEM1024_CIPHERTEXTBYTES 1568
+#define PQCP_MLKEM_NATIVE_MLKEM1024_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM1024_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM1024_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM1024_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                               const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM1024_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM1024_enc_derand(uint8_t *ct, uint8_t *ss,
+                                           const uint8_t *pk,
+                                           const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM1024_enc(uint8_t *ct, uint8_t *ss,
+                                    const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM1024_dec(uint8_t *ss, const uint8_t *ct,
+                                    const uint8_t *sk);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_native.h
new file mode 100644
index 0000000000..b7e921323a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_native.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_ARITH_NATIVE_H
+#define MLKEM_ARITH_NATIVE_H
+
+#include <stdint.h>
+#include "config.h"
+#include "params.h"
+
+#if defined(MLKEM_USE_NATIVE)
+
+#include "poly.h"
+#include "polyvec.h"
+#include "profile.h"
+
+/*
+ * MLKEM native arithmetic interface
+ *
+ * This is the C<->native arithmetic interface used in this repository
+ * to allow for the drop-in of native code for performance critical
+ * components of ML-KEM.
+ *
+ * A _profile_ is a specific implementation of (part of) this interface.
+ * The active profile (if any) is determined in profile.h.
+ *
+ * To add a function to a profile, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * These functions are meant to be trivial wrappers around
+ * the chosen native implementation. They are static inline
+ * to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(poly *);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for
+ *              NTT, invNTT, basemul, mulcache, and to/from bytes.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(poly *);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(poly *);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(poly *);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(poly *);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
+                                                const poly *poly);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomial vectors in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial vector operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial vector operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    poly *r, const polyvec *a, const polyvec *b,
+    const polyvec_mulcache *b_cache);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const poly *a);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: De-serialization of a polynomial from its
+ *              byte-array representation (the counterpart
+ *              of poly_tobytes_native).
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(poly *a,
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_USE_NATIVE */
+#endif /* MLKEM_ARITH_NATIVE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
new file mode 100644
index 0000000000..073f3c81d7
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "cbd.h"
+#include <stdint.h>
+#include "params.h"
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+#if MLKEM_ETA1 == 3
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+#endif
+
+/*************************************************
+ * Name:        cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, (8 * i - 1), 2)))
+  {
+    int j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j - 1, 2)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+/*************************************************
+ * Name:        cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+#if MLKEM_ETA1 == 3
+static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), 3)))
+  {
+    int j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j - 1, 3)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif
+
+void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+{
+#if MLKEM_ETA1 == 2
+  cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  cbd3(r, buf);
+#else
+#error "This implementation requires eta1 in {2,3}"
+#endif
+}
+
+void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+{
+#if MLKEM_ETA2 == 2
+  cbd2(r, buf);
+#else
+#error "This implementation requires eta2 = 2"
+#endif
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
new file mode 100644
index 0000000000..4dc8635bb5
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef CBD_H
+#define CBD_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA1))
+);
+
+#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h
new file mode 100644
index 0000000000..317a26421b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/***************************************************
+ * Basic replacements for __CPROVER_XXX contracts
+ ***************************************************/
+
+#include "common.h"
+
+#ifndef CBMC
+
+#define STATIC_INLINE_TESTABLE static INLINE
+#define STATIC_TESTABLE static
+
+#define __contract__(x)
+#define __loop__(x)
+#define cassert(x, y)
+
+#else /* CBMC _is_ defined, therefore we're doing proof */
+
+/* expose certain procedures to CBMC proofs that are static otherwise */
+#define STATIC_TESTABLE
+#define STATIC_INLINE_TESTABLE
+
+#define __contract__(x) x
+#define __loop__(x) x
+
+/* https://diffblue.github.io/cbmc/contracts-assigns.html */
+#define assigns(...) __CPROVER_assigns(__VA_ARGS__)
+
+/* https://diffblue.github.io/cbmc/contracts-requires-ensures.html */
+#define requires(...) __CPROVER_requires(__VA_ARGS__)
+#define ensures(...) __CPROVER_ensures(__VA_ARGS__)
+/* https://diffblue.github.io/cbmc/contracts-loops.html */
+#define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
+#define decreases(...) __CPROVER_decreases(__VA_ARGS__)
+/* cassert to avoid confusion with in-built assert */
+#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define assume(...) __CPROVER_assume(__VA_ARGS__)
+
+/***************************************************
+ * Macros for "expression" forms that may appear
+ * _inside_ top-level contracts.
+ ***************************************************/
+
+/*
+ * function return value - useful inside ensures
+ * https://diffblue.github.io/cbmc/contracts-functions.html
+ */
+#define return_value (__CPROVER_return_value)
+
+/*
+ * assigns l-value targets
+ * https://diffblue.github.io/cbmc/contracts-assigns.html
+ */
+#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
+#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
+#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
+
+/*
+ * Pointer-related predicates
+ * https://diffblue.github.io/cbmc/contracts-memory-predicates.html
+ */
+#define memory_no_alias(...) __CPROVER_is_fresh(__VA_ARGS__)
+#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
+#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+
+/*
+ * History variables
+ * https://diffblue.github.io/cbmc/contracts-history-variables.html
+ */
+#define old(...) __CPROVER_old(__VA_ARGS__)
+#define loop_entry(...) __CPROVER_loop_entry(__VA_ARGS__)
+
+/*
+ * Quantifiers
+ * Note that the range on qvar is _inclusive_ between qvar_lb .. qvar_ub
+ * https://diffblue.github.io/cbmc/contracts-quantifiers.html
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define forall(type, qvar, qvar_lb, qvar_ub, predicate)           \
+  __CPROVER_forall                                                \
+  {                                                               \
+    type qvar;                                                    \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) ==> (predicate)  \
+  }
+
+#define EXISTS(type, qvar, qvar_lb, qvar_ub, predicate)         \
+  __CPROVER_exists                                              \
+  {                                                             \
+    type qvar;                                                  \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) && (predicate) \
+  }
+/* clang-format on */
+
+/***************************************************
+ * Convenience macros for common contract patterns
+ ***************************************************/
+
+/*
+ * Boolean-value predicate that asserts that "all values of array_var are in
+ * range value_lb .. value_ub (inclusive)"
+ * Example:
+ *  array_bound(a->coeffs, 0, MLKEM_N-1, -(MLKEM_Q - 1), MLKEM_Q - 1)
+ * expands to
+ *  __CPROVER_forall { int k; (0 <= k && k <= MLKEM_N-1) ==> ( (-(MLKEM_Q -
+ *  1) <= a->coeffs[k]) && (a->coeffs[k] <= (MLKEM_Q - 1))) }
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define CBMC_CONCAT_(left, right) left##right
+#define CBMC_CONCAT(left, right) CBMC_CONCAT_(left, right)
+
+#define array_bound_core(indextype, qvar, qvar_lb, qvar_ub, array_var, \
+                         value_lb, value_ub)                           \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    indextype qvar;                                                    \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) ==>                   \
+        (((value_lb) <= (array_var[(qvar)])) &&                        \
+        ((array_var[(qvar)]) <= (value_ub)))                           \
+  }
+
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+  array_bound_core(int, CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+                   (qvar_ub), (array_var), (value_lb), (value_ub))
+
+
+/* Wrapper around array_bound operating on absolute values */
+#define array_abs_bound(arr, lb, ub, k) \
+  array_bound((arr), (lb), (ub), (-(k)), (k))
+/* clang-format on */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
new file mode 100644
index 0000000000..94c29ed927
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMMON_H
+#define COMMON_H
+
+
+/*
+ * C90 does not have the inline compiler directive yet.
+ * We don't use it in C90 builds.
+ * However, in that case the compiler warns about some inline functions in
+ * header files not being used in every compilation unit that includes that
+ * header. To work around it we silence that warning in that case using
+ * __attribute__((unused)).
+ */
+
+/* Do not use inline for C90 builds */
+#if !defined(inline)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define INLINE inline
+#define ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define INLINE __inline
+#define ALWAYS_INLINE __forceinline
+#else
+#define INLINE __attribute__((unused))
+#define ALWAYS_INLINE
+#endif
+
+#else
+#define INLINE inline
+#define ALWAYS_INLINE __attribute__((always_inline))
+#endif
+
+
+/*
+ * C90 does not have the restrict compiler directive yet.
+ * We don't use it in C90 builds.
+ */
+#if !defined(restrict)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define RESTRICT restrict
+#else
+#define RESTRICT
+#endif
+
+#else
+
+#define RESTRICT restrict
+#endif
+
+#define DEFAULT_ALIGN 32
+#if defined(_WIN32)
+#define ALIGN __declspec(align(DEFAULT_ALIGN))
+#define asm __asm
+#else
+#define asm __asm__
+#define ALIGN __attribute__((aligned(DEFAULT_ALIGN)))
+#endif
+
+#define MLKEM_CONCAT_(left, right) left##right
+#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
new file mode 100644
index 0000000000..370a141a65
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#include "cpucap.h"
+
+#if defined(MLKEM_USE_NATIVE) /* translate into a per-architecture flag */
+
+#if defined(SYS_AARCH64)
+#define MLKEM_USE_NATIVE_AARCH64
+#endif /* SYS_AARCH64 */
+
+#if defined(SYS_X86_64)
+#define MLKEM_USE_NATIVE_X86_64
+#endif /* SYS_X86_64 */
+
+#endif /* MLKEM_USE_NATIVE */
+#endif /* CONFIG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cpucap.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cpucap.h
new file mode 100644
index 0000000000..cfcbbc3fe9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cpucap.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CPUCAP_H
+#define CPUCAP_H
+
+/* Check if we're compiling for an AArch64 system. _M_ARM64 is set by MSVC. */
+#if defined(__AARCH64EL__) || defined(_M_ARM64)
+#define SYS_AARCH64
+#endif
+
+#if defined(__x86_64__) /* NOTE(review): MSVC's _M_X64 is not checked here */
+#define SYS_X86_64
+#if defined(__AVX2__)
+#define SYS_X86_64_AVX2
+#endif
+#endif /* __x86_64__ */
+
+/* Check endianness */
+#if defined(__BYTE_ORDER__)
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define SYS_LITTLE_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define SYS_BIG_ENDIAN
+#else /* __BYTE_ORDER__ */
+#error "__BYTE_ORDER__ defined, but don't recognize value."
+#endif /* __BYTE_ORDER__ */
+#endif /* defined(__BYTE_ORDER__) */
+
+/* If FORCE_AARCH64 is set, assert that we're indeed on an AArch64 system. */
+#if defined(FORCE_AARCH64) && !defined(SYS_AARCH64)
+#error "FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
+#endif
+
+/* If FORCE_X86_64 is set, assert that we're indeed on an X86_64 system. */
+#if defined(FORCE_X86_64) && !defined(SYS_X86_64)
+#error "FORCE_X86_64 is set, but we don't seem to be on an X86_64 system."
+#endif
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c
new file mode 100644
index 0000000000..aa9b578074
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#define _ISOC99_SOURCE
+#include "debug.h"
+#include <stdio.h>
+
+#if defined(MLKEM_DEBUG)
+
+static char debug_buf[256]; /* message scratch buffer shared by all checks */
+/* Report a failed assertion (val == 0) on stderr and terminate via exit(1). */
+void mlkem_debug_assert(const char *file, int line, const char *description,
+                        const int val)
+{
+  if (val == 0) /* assertion failed: report the location and terminate */
+  {
+    snprintf(debug_buf, sizeof(debug_buf), "Assertion failed: %s (value %d)",
+             description, val);
+    mlkem_debug_print_error(file, line, debug_buf);
+    exit(1);
+  }
+}
+void mlkem_debug_check_bounds(const char *file, int line,
+                              const char *description, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0; /* set once any element violates the bounds */
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      snprintf(debug_buf, sizeof(debug_buf),
+               "%s, index %u, value %d out of bounds (%d,%d)", description, i,
+               (int)val, lower_bound_exclusive, upper_bound_exclusive);
+      mlkem_debug_print_error(file, line, debug_buf);
+      err = 1; /* keep scanning so every offending index gets reported */
+    }
+  }
+
+  if (err == 1) /* terminate only after the whole array has been checked */
+    exit(1);
+}
+
+void mlkem_debug_print_error(const char *file, int line, const char *msg)
+{
+  fprintf(stderr, "[ERROR:%s:%04d] %s\n", file, line, msg);
+  fflush(stderr); /* make the message visible before a caller calls exit() */
+}
+
+#else /* MLKEM_DEBUG */
+/* Declaration only: keeps this unit non-empty without defining a symbol. */
+extern int empty_cu_debug;
+
+#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h
new file mode 100644
index 0000000000..65208771d2
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - description: Textual description of assertion
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+void mlkem_debug_assert(const char *file, int line, const char *description,
+                        const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - description: Textual description of check
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+void mlkem_debug_check_bounds(const char *file, int line,
+                              const char *description, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Print error message to stderr alongside file and line information */
+void mlkem_debug_print_error(const char *file, int line, const char *msg);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ * msg: Message to print on failure
+ *
+ * Currently called CASSERT to avoid clash with CBMC assert.
+ */
+#define CASSERT(val, msg)                                 \
+  do                                                      \
+  {                                                       \
+    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
+  } while (0)
+
+/* Check absolute bounds of scalar
+ * val: Scalar to be checked (evaluated twice; avoid side effects)
+ * abs_bound: Exclusive upper bound on absolute value to check (also twice)
+ * msg: Message to print on failure */
+#define SCALAR_BOUND(val, abs_bound, msg) \
+  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
+
+/* Check that all coefficients in array of int16_t's are non-negative
+ * and below an exclusive upper bound.
+ *
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * high_bound: Exclusive upper bound on value to check (lower bound is 0)
+ * msg: Message to print on failure */
+#define UBOUND(ptr, len, high_bound, msg)                                 \
+  do                                                                      \
+  {                                                                       \
+    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
+                             (len), -1, ((high_bound)));                  \
+  } while (0)
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * abs_bound: Exclusive upper bound on absolute value to check
+ * msg: Message to print on failure */
+#define BOUND(ptr, len, abs_bound, msg)                                   \
+  do                                                                      \
+  {                                                                       \
+    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
+                             (len), -(abs_bound), (abs_bound));           \
+  } while (0)
+
+/* Check absolute bounds on coefficients in polynomial or mulcache
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * abs_bound: Exclusive upper bound on absolute value to check
+ * msg: Message to print on failure */
+#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
+  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
+        msg)
+
+/* Check unsigned bounds on coefficients in polynomial or mulcache
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ * msg: Message to print on failure */
+#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
+  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
+         msg)
+
+/* Check absolute bounds on coefficients in polynomial
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * abs_bound: Exclusive upper bound on absolute value to check */
+#define POLY_BOUND(ptr, abs_bound) \
+  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
+
+/* Check unsigned bounds on coefficients in polynomial
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ */
+#define POLY_UBOUND(ptr, ubound) \
+  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
+
+/* Check absolute bounds on coefficients in vector of polynomials
+ * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
+ * abs_bound: Exclusive upper bound on absolute value to check */
+#define POLYVEC_BOUND(ptr, abs_bound)                                      \
+  do                                                                       \
+  {                                                                        \
+    unsigned _debug_polyvec_bound_idx;                                     \
+    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
+         _debug_polyvec_bound_idx++)                                       \
+      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
+                     "polyvec absolute bound for " #ptr ".vec[i]");        \
+  } while (0)
+
+/* Check unsigned bounds on coefficients in vector of polynomials
+ * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ */
+#define POLYVEC_UBOUND(ptr, ubound)                                        \
+  do                                                                       \
+  {                                                                        \
+    unsigned _debug_polyvec_bound_idx;                                     \
+    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
+         _debug_polyvec_bound_idx++)                                       \
+      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
+                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
+  } while (0)
+
+/* Following AWS-LC to define a C99-compliant static assert: a false cond yields a negative bit-field width. */
+#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
+  typedef struct                                                         \
+  {                                                                      \
+    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
+  } MLKEM_CONCAT(static_assertion_, msg) __attribute__((unused));
+
+#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
+  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
+#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
+#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
+#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
+#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
+
+#else /* MLKEM_DEBUG */
+
+#define CASSERT(val, msg) \
+  do                      \
+  {                       \
+  } while (0)
+#define SCALAR_BOUND(val, abs_bound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define BOUND(ptr, len, abs_bound, msg) \
+  do                                    \
+  {                                     \
+  } while (0)
+#define POLY_BOUND(ptr, abs_bound) \
+  do                               \
+  {                                \
+  } while (0)
+#define POLYVEC_BOUND(ptr, abs_bound) \
+  do                                  \
+  {                                   \
+  } while (0)
+#define POLY_BOUND_MSG(ptr, abs_bound, msg)    \
+  do                                           \
+  {                                            \
+  } while (0)
+#define UBOUND(ptr, len, high_bound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define POLY_UBOUND(ptr, ubound) \
+  do                             \
+  {                              \
+  } while (0)
+#define POLYVEC_UBOUND(ptr, ubound) \
+  do                                \
+  {                                 \
+  } while (0)
+#define POLY_UBOUND_MSG(ptr, ubound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define STATIC_ASSERT(cond, error)
+
+#endif /* MLKEM_DEBUG */
+
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
new file mode 100644
index 0000000000..669460c29c
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "indcpa.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "fips202.h"
+#include "fips202x4.h"
+#include "indcpa.h"
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+
+#include "cbmc.h"
+
+
+/*************************************************
+ * Name:        pack_pk
+ *
+ * Description: Serialize the public key as concatenation of the
+ *              serialized vector of polynomials pk
+ *              and the public seed used to generate the matrix A.
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized public key
+ *              polyvec *pk: pointer to the input public-key polyvec.
+ *                Must have coefficients within [0,..,q-1].
+ *              const uint8_t *seed: pointer to the input public seed
+ **************************************************/
+static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
+                    const uint8_t seed[MLKEM_SYMBYTES])
+{
+  POLYVEC_UBOUND(pk, MLKEM_Q); /* debug-only check of the [0, q-1] contract */
+  polyvec_tobytes(r, pk);
+  memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES); /* seed goes last */
+}
+
+/*************************************************
+ * Name:        unpack_pk
+ *
+ * Description: De-serialize public key from a byte array;
+ *              approximate inverse of pack_pk
+ *
+ * Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
+ *                  Coefficients will be normalized to [0,..,q-1].
+ *              - uint8_t *seed: pointer to output seed to generate matrix A
+ *              - const uint8_t *packedpk: pointer to input serialized public
+ *                  key.
+ **************************************************/
+static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
+                      const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
+{
+  polyvec_frombytes(pk, packedpk);
+  memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES); /* seed is last */
+
+  /*
+   * TODO! We know from the modulus check that this will result in an
+   * unsigned canonical polynomial, but CBMC does not know it. We should
+   * weaken the specification of `unpack_pk()` and all depending functions
+   * to work with the weaker 4096-bound, so that the proofs go through
+   * without the need of this redundant call to polyvec_reduce().
+   */
+  polyvec_reduce(pk);
+}
+
+/*************************************************
+ * Name:        pack_sk
+ *
+ * Description: Serialize the secret key
+ *
+ * Arguments:   - uint8_t *r: pointer to output serialized secret key
+ *              - polyvec *sk: pointer to input vector of polynomials (secret
+ *                  key). Must have coefficients within [0,..,q-1].
+ **************************************************/
+static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
+{
+  POLYVEC_UBOUND(sk, MLKEM_Q); /* debug-only check, same range as pack_pk */
+  polyvec_tobytes(r, sk);
+}
+
+/*************************************************
+ * Name:        unpack_sk
+ *
+ * Description: De-serialize the secret key; inverse of pack_sk
+ *
+ * Arguments:   - polyvec *sk: pointer to output vector of polynomials
+ *                  (secret key)
+ *              - const uint8_t *packedsk: pointer to input serialized
+ *                  secret key
+ **************************************************/
+static void unpack_sk(polyvec *sk,
+                      const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  polyvec_frombytes(sk, packedsk);
+  polyvec_reduce(sk);
+}
+
+/*************************************************
+ * Name:        pack_ciphertext
+ *
+ * Description: Serialize the ciphertext as concatenation of the
+ *              compressed and serialized vector of polynomials b
+ *              and the compressed and serialized polynomial v
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized ciphertext
+ *              polyvec *b: pointer to the input vector of polynomials b
+ *              poly *v: pointer to the input polynomial v
+ **************************************************/
+static void pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], polyvec *b, poly *v)
+{
+  polyvec_compress_du(r, b);
+  poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
+}
+
+/*************************************************
+ * Name:        unpack_ciphertext
+ *
+ * Description: De-serialize and decompress ciphertext from a byte array;
+ *              approximate inverse of pack_ciphertext
+ *
+ * Arguments:   - polyvec *b: pointer to the output vector of polynomials b
+ *              - poly *v: pointer to the output polynomial v
+ *              - const uint8_t *c: pointer to the input serialized ciphertext
+ **************************************************/
+static void unpack_ciphertext(polyvec *b, poly *v,
+                              const uint8_t c[MLKEM_INDCPA_BYTES])
+{
+  polyvec_decompress_du(b, c); /* b occupies the start of the ciphertext */
+  poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU); /* v follows b */
+}
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS /* may be overridden by the build */
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + SHAKE128_RATE) / SHAKE128_RATE)
+#endif /* presumably: expected XOF bytes for MLKEM_N samples, in whole blocks */
+
+/*
+ * Generate four A matrix entries, one from each of the four seeds,
+ * using rejection sampling on the output of a 4-way XOF (SHAKE128).
+ */
+STATIC_TESTABLE
+void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  shake128x4incctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* Each seed is MLKEM_SYMBYTES + 2 bytes long, as stated in the contract */
+  shake128x4_absorb_once(&statex, seed[0], seed[1], seed[2], seed[3],
+                    MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                           &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = SHAKE128_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3] - 1, 0, (MLKEM_Q - 1))))
+  {
+    shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  shake128x4_inc_ctx_release(&statex);
+}
+
+/*
+ * Generate a single A matrix entry from a seed, using rejection
+ * sampling on the output of a XOF.
+ */
+STATIC_TESTABLE
+void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+{
+  shake128incctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  shake128_absorb_once(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entry with high probability. */
+  shake128_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = SHAKE128_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(0 <= ctr && ctr <= MLKEM_N)
+    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr - 1,
+                                          0, (MLKEM_Q - 1))))
+  {
+    shake128_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  shake128_inc_ctx_release(&state);
+}
+
+/*************************************************
+ * Name:        gen_matrix
+ *
+ * Description: Deterministically generate matrix A (or the transpose of A)
+ *              from a seed. Entries of the matrix are polynomials that look
+ *              uniformly random. Performs rejection sampling on output of
+ *              a XOF
+ *
+ * Arguments:   - polyvec *a: pointer to output matrix A
+ *              - const uint8_t *seed: pointer to input seed
+ *              - int transposed: boolean deciding whether A or A^T is generated
+ **************************************************/
+/* Not static for benchmarking */
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+{
+  int i;
+  unsigned int j;
+  /*
+   * We generate four separate seed arrays rather than a single one to work
+   * around limitations in CBMC function contracts dealing with disjoint slices
+   * of the same parent object.
+   */
+
+  ALIGN uint8_t seed0[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed1[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed2[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed3[MLKEM_SYMBYTES + 2];
+  uint8_t *seedxy[4];
+  seedxy[0] = seed0;
+  seedxy[1] = seed1;
+  seedxy[2] = seed2;
+  seedxy[3] = seed3;
+
+  for (j = 0; j < KECCAK_WAY; j++)
+  {
+    memcpy(seedxy[j], seed, MLKEM_SYMBYTES); /* the two index bytes are set below */
+  }
+
+  for (i = 0; i < (MLKEM_K * MLKEM_K / KECCAK_WAY) * KECCAK_WAY;
+       i += KECCAK_WAY)
+  {
+    uint8_t x, y;
+
+    for (j = 0; j < KECCAK_WAY; j++)
+    {
+      x = (i + j) / MLKEM_K;
+      y = (i + j) % MLKEM_K;
+      if (transposed)
+      {
+        seedxy[j][MLKEM_SYMBYTES + 0] = x;
+        seedxy[j][MLKEM_SYMBYTES + 1] = y;
+      }
+      else
+      {
+        seedxy[j][MLKEM_SYMBYTES + 0] = y;
+        seedxy[j][MLKEM_SYMBYTES + 1] = x;
+      }
+    }
+
+    /*
+     * This call writes across polyvec boundaries for K=2 and K=3.
+     * This is intentional and safe.
+     */
+    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+  }
+
+  /* For left over polynomial, we use single keccak. */
+  if (i < MLKEM_K * MLKEM_K)
+  {
+    uint8_t x, y;
+    x = i / MLKEM_K;
+    y = i % MLKEM_K;
+
+    if (transposed)
+    {
+      seed0[MLKEM_SYMBYTES + 0] = x;
+      seed0[MLKEM_SYMBYTES + 1] = y;
+    }
+    else
+    {
+      seed0[MLKEM_SYMBYTES + 0] = y;
+      seed0[MLKEM_SYMBYTES + 1] = x;
+    }
+
+    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    i++;
+  }
+
+  cassert(i == MLKEM_K * MLKEM_K,
+          "gen_matrix: failed to generate whole matrix");
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+  /*
+   * The public matrix is generated in NTT domain. If the native backend
+   * uses a custom order in NTT domain, permute A accordingly.
+   */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    for (j = 0; j < MLKEM_K; j++)
+    {
+      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+    }
+  }
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+/*************************************************
+ * Name:        matvec_mul
+ *
+ * Description: Computes matrix-vector product in NTT domain,
+ *              via Montgomery multiplication.
+ *
+ * Arguments:   - polyvec *out: Pointer to output polynomial vector
+ *              - polyvec a[MLKEM_K]: Input matrix. Must be in NTT domain
+ *                  and have coefficients of absolute value < MLKEM_Q.
+ *              - polyvec *v: Input polynomial vector. Must be in NTT domain.
+ *              - polyvec *vc: Mulcache for v, computed via
+ *                  polyvec_mulcache_compute().
+ **************************************************/
+STATIC_TESTABLE
+void matvec_mul(polyvec *out, const polyvec a[MLKEM_K], const polyvec *v,
+                const polyvec_mulcache *vc)
+__contract__(
+  requires(memory_no_alias(out, sizeof(polyvec)))
+  requires(memory_no_alias(a, sizeof(polyvec) * MLKEM_K))
+  requires(memory_no_alias(v, sizeof(polyvec)))
+  requires(memory_no_alias(vc, sizeof(polyvec_mulcache)))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+  forall(int, k1, 0, MLKEM_K - 1,
+    array_abs_bound(a[k0].vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))))
+  assigns(object_whole(out)))
+{
+  int i;
+  for (i = 0; i < MLKEM_K; i++) /* one output polynomial per row a[i] */
+  __loop__(
+    assigns(i, object_whole(out))
+    invariant(i >= 0 && i <= MLKEM_K))
+  {
+    polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
+  }
+}
+
+/*************************************************
+ * Name:        indcpa_keypair_derand
+ *
+ * Description: Generates public and private key for the CPA-secure
+ *              public-key encryption scheme underlying ML-KEM
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                             (of length MLKEM_INDCPA_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                             (of length MLKEM_INDCPA_SECRETKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                             (of length MLKEM_SYMBYTES bytes)
+ **************************************************/
+/* Check that the arithmetic in indcpa_keypair_derand() does not overflow */
+STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_keypair_bound_0)
+
+void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[MLKEM_SYMBYTES])
+{
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  const uint8_t *publicseed = buf;
+  const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
+  polyvec a[MLKEM_K], e, pkpv, skpv;
+  polyvec_mulcache skpv_cache;
+
+  ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+  /* Concatenate coins with MLKEM_K for domain separation of security levels */
+  memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+  coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
+
+  hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
+
+  gen_matrix(a, publicseed, 0 /* no transpose */);
+
+#if MLKEM_K == 2
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, e.vec + 0, e.vec + 1,
+                        noiseseed, 0, 1, 2, 3);
+#elif MLKEM_K == 3
+  /*
+   * Only the first three output buffers are needed.
+   * The last parameter is a dummy that's overwritten later.
+   */
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2,
+                        pkpv.vec + 0 /* irrelevant */, noiseseed, 0, 1, 2,
+                        0xFF /* irrelevant */);
+  /* Same here */
+  poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2,
+                        pkpv.vec + 0 /* irrelevant */, noiseseed, 3, 4, 5,
+                        0xFF /* irrelevant */);
+#elif MLKEM_K == 4
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3,
+                        noiseseed, 0, 1, 2, 3);
+  poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed,
+                        4, 5, 6, 7);
+#endif
+
+  polyvec_ntt(&skpv);
+  polyvec_ntt(&e);
+
+  polyvec_mulcache_compute(&skpv_cache, &skpv);
+  matvec_mul(&pkpv, a, &skpv, &skpv_cache);
+  polyvec_tomont(&pkpv);
+
+  /* Arithmetic cannot overflow, see static assertion at the top */
+  polyvec_add(&pkpv, &e);
+  polyvec_reduce(&pkpv);
+  polyvec_reduce(&skpv);
+
+  pack_sk(sk, &skpv);
+  pack_pk(pk, &pkpv, publicseed);
+}
+
+/*************************************************
+ * Name:        indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure
+ *              public-key encryption scheme underlying Kyber.
+ *
+ * Arguments:   - uint8_t *c: pointer to output ciphertext
+ *                            (of length MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: pointer to input message
+ *                                  (of length MLKEM_INDCPA_MSGBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                                   (of length MLKEM_INDCPA_PUBLICKEYBYTES)
+ *              - const uint8_t *coins: pointer to input random coins (of length
+ *                  MLKEM_SYMBYTES) used as seed to derive all randomness
+ **************************************************/
+
+/* Check that the arithmetic in indcpa_enc() does not overflow */
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
+              indcpa_enc_bound_1)
+
+void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[MLKEM_SYMBYTES])
+{
+  ALIGN uint8_t seed[MLKEM_SYMBYTES];
+  polyvec sp, pkpv, ep, at[MLKEM_K], b;
+  poly v, k, epp;
+  polyvec_mulcache sp_cache;
+
+  unpack_pk(&pkpv, seed, pk);
+  poly_frommsg(&k, m);
+  gen_matrix(at, seed, 1 /* transpose */);
+
+#if MLKEM_K == 2
+  poly_getnoise_eta1122_4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1,
+                           coins, 0, 1, 2, 3);
+  poly_getnoise_eta2(&epp, coins, 4);
+#elif MLKEM_K == 3
+  /*
+   * In this call, only the first three output buffers are needed.
+   * The last parameter is a dummy that's overwritten later.
+   */
+  poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, &b.vec[0], coins, 0,
+                        1, 2, 0xFF);
+  /* The fourth output buffer in this call _is_ used. */
+  poly_getnoise_eta2_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, &epp, coins, 3, 4,
+                        5, 6);
+#elif MLKEM_K == 4
+  poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins,
+                        0, 1, 2, 3);
+  poly_getnoise_eta2_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins,
+                        4, 5, 6, 7);
+  poly_getnoise_eta2(&epp, coins, 8);
+#endif
+
+  polyvec_ntt(&sp);
+
+  polyvec_mulcache_compute(&sp_cache, &sp);
+  matvec_mul(&b, at, &sp, &sp_cache);
+  polyvec_basemul_acc_montgomery_cached(&v, &pkpv, &sp, &sp_cache);
+
+  polyvec_invntt_tomont(&b);
+  poly_invntt_tomont(&v);
+
+  /* Arithmetic cannot overflow, see static assertion at the top */
+  polyvec_add(&b, &ep);
+  poly_add(&v, &epp);
+  poly_add(&v, &k);
+
+  polyvec_reduce(&b);
+  poly_reduce(&v);
+
+  pack_ciphertext(c, &b, &v);
+}
+
+/* Check that the arithmetic in indcpa_dec() does not overflow */
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
+
+void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  polyvec b, skpv;
+  poly v, sb;
+
+  unpack_ciphertext(&b, &v, c);
+  unpack_sk(&skpv, sk);
+
+  polyvec_ntt(&b);
+  polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
+  poly_invntt_tomont(&sb);
+
+  /* Arithmetic cannot overflow, see static assertion above indcpa_dec() */
+  poly_sub(&v, &sb);
+  poly_reduce(&v);
+
+  poly_tomsg(m, &v);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
new file mode 100644
index 0000000000..3f57eb1295
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef INDCPA_H
+#define INDCPA_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+#include "polyvec.h"
+
+
+#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec) * MLKEM_K))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires(transposed == 0 || transposed == 1)
+  assigns(object_whole(a))
+  ensures(forall(int, x, 0, MLKEM_K - 1, forall(int, y, 0, MLKEM_K - 1,
+  array_bound(a[x].vec[y].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))));
+);
+
+#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+/*************************************************
+ * Name:        indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure public-key
+ *              encryption scheme underlying Kyber.
+ *
+ * Arguments:   - uint8_t *c: output ciphertext (MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: input message (MLKEM_INDCPA_MSGBYTES bytes)
+ *              - const uint8_t *pk: input public key
+ *                                   (MLKEM_INDCPA_PUBLICKEYBYTES bytes)
+ *              - const uint8_t *coins: input randomness
+ *                                      (MLKEM_SYMBYTES bytes)
+ **************************************************/
+void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(c))
+);
+
+#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  assigns(object_whole(m))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c
new file mode 100644
index 0000000000..f84ee3f3da
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "kem.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "indcpa.h"
+#include "params.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#if defined(CBMC)
+/* Redeclaration with contract needed for CBMC only */
+int memcmp(const void *str1, const void *str2, size_t n)
+__contract__(
+  requires(memory_no_alias(str1, n))
+  requires(memory_no_alias(str2, n))
+);
+#endif
+
+/*************************************************
+ * Name:        check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS203,
+ *              i.e., ensures that coefficients are in [0,q-1].
+ *              Described in Section 7.2 of FIPS203.
+ *
+ * Arguments:   - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 on failure
+ **************************************************/
+static int check_pk(const uint8_t pk[MLKEM_PUBLICKEYBYTES])
+{
+  polyvec p;
+  uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+  polyvec_frombytes(&p, pk);
+  polyvec_reduce(&p);
+  polyvec_tobytes(p_reencoded, &p);
+  /* Data is public, so a variable-time memcmp() is OK */
+  if (memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS203,
+ *              i.e., ensures that
+ *              sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *              Described in Section 7.3 of FIPS203.
+ *
+ * Arguments:   - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 on failure
+ **************************************************/
+static int check_sk(const uint8_t sk[MLKEM_SECRETKEYBYTES])
+{
+  uint8_t test[MLKEM_SYMBYTES];
+  /*
+   * The parts of `sk` being hashed and compared here are public, so
+   * no secret information is leaked through the runtime or the return value
+   * of this function.
+   */
+  hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES, MLKEM_PUBLICKEYBYTES);
+  if (memcmp(sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
+             MLKEM_SYMBYTES))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        crypto_kem_keypair_derand
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *              - uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with 2*MLKEM_SYMBYTES
+ *                random bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins)
+{
+  indcpa_keypair_derand(pk, sk, coins);
+  memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_PUBLICKEYBYTES);
+  hash_h(sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
+         MLKEM_PUBLICKEYBYTES);
+  /* Value z for pseudo-random output on reject */
+  memcpy(sk + MLKEM_SECRETKEYBYTES - MLKEM_SYMBYTES, coins + MLKEM_SYMBYTES,
+         MLKEM_SYMBYTES);
+  return 0;
+}
+
+int crypto_kem_keypair(uint8_t *pk, uint8_t *sk)
+{
+  ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+  randombytes(coins, 2 * MLKEM_SYMBYTES);
+  crypto_kem_keypair_derand(pk, sk, coins);
+  return 0;
+}
+
+int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk,
+                          const uint8_t *coins)
+{
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  /* Will contain key, coins */
+  ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+
+  if (check_pk(pk))
+  {
+    return -1;
+  }
+
+  memcpy(buf, coins, MLKEM_SYMBYTES);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_PUBLICKEYBYTES);
+  hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
+
+  /* coins are in kr+MLKEM_SYMBYTES */
+  indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+
+  memcpy(ss, kr, MLKEM_SYMBYTES);
+  return 0;
+}
+
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+{
+  ALIGN uint8_t coins[MLKEM_SYMBYTES];
+  randombytes(coins, MLKEM_SYMBYTES);
+  return crypto_kem_enc_derand(ct, ss, pk, coins);
+}
+
+int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+{
+  uint8_t fail;
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  /* Will contain key, coins */
+  ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+  ALIGN uint8_t cmp[MLKEM_CIPHERTEXTBYTES + MLKEM_SYMBYTES];
+  const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+
+  if (check_sk(sk))
+  {
+    return -1;
+  }
+
+  indcpa_dec(buf, ct, sk);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  memcpy(buf + MLKEM_SYMBYTES, sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+         MLKEM_SYMBYTES);
+  hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
+
+  /* coins are in kr+MLKEM_SYMBYTES */
+  indcpa_enc(cmp, buf, pk, kr + MLKEM_SYMBYTES);
+
+  fail = ct_memcmp(ct, cmp, MLKEM_CIPHERTEXTBYTES);
+
+  /* Compute rejection key */
+  rkprf(ss, sk + MLKEM_SECRETKEYBYTES - MLKEM_SYMBYTES, ct);
+
+  /* Copy true key to return buffer if fail is 0 */
+  ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+
+  return 0;
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h
new file mode 100644
index 0000000000..6a33be7c7e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef KEM_H
+#define KEM_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+
+#define CRYPTO_SECRETKEYBYTES MLKEM_SECRETKEYBYTES
+#define CRYPTO_PUBLICKEYBYTES MLKEM_PUBLICKEYBYTES
+#define CRYPTO_CIPHERTEXTBYTES MLKEM_CIPHERTEXTBYTES
+#define CRYPTO_BYTES MLKEM_SSBYTES
+
+#if (MLKEM_K == 2)
+#define CRYPTO_ALGNAME "Kyber512"
+#elif (MLKEM_K == 3)
+#define CRYPTO_ALGNAME "Kyber768"
+#elif (MLKEM_K == 4)
+#define CRYPTO_ALGNAME "Kyber1024"
+#endif
+
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE(keypair_derand)
+int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define crypto_kem_keypair MLKEM_NAMESPACE(keypair)
+/*************************************************
+ * Name:        crypto_kem_keypair
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_kem_keypair(uint8_t *pk, uint8_t *sk)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define crypto_kem_enc_derand MLKEM_NAMESPACE(enc_derand)
+/*************************************************
+ * Name:        crypto_kem_enc_derand
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with MLKEM_SYMBYTES random
+ *                bytes)
+ *
+ * Returns 0 on success, and -1 if the public key modulus check (see Section 7.2
+ * of FIPS203) fails.
+ **************************************************/
+int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk,
+                          const uint8_t *coins)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(ct))
+  assigns(object_whole(ss))
+);
+
+#define crypto_kem_enc MLKEM_NAMESPACE(enc)
+/*************************************************
+ * Name:        crypto_kem_enc
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 if the public key modulus check (see Section 7.2
+ * of FIPS203) fails.
+ **************************************************/
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  assigns(object_whole(ct))
+  assigns(object_whole(ss))
+);
+
+#define crypto_kem_dec MLKEM_NAMESPACE(dec)
+/*************************************************
+ * Name:        crypto_kem_dec
+ *
+ * Description: Generates shared secret for given
+ *              cipher text and private key
+ *
+ * Arguments:   - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *ct: pointer to input cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
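+ * Returns 0 on success, and -1 if the secret key hash check (see Section 7.3 of
+ * FIPS203) fails, in which case ss is left unmodified.
+ * If the ciphertext fails the re-encryption check, 0 is still returned and
+ * ss contains a pseudo-random rejection value.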
+ **************************************************/
+int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+__contract__(
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  assigns(object_whole(ss))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
new file mode 100644
index 0000000000..1844ca19fd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "ntt.h"
+#include <stdint.h>
+#include "params.h"
+#include "reduce.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+#include "ntt.h"
+
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - root: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+STATIC_TESTABLE
+void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start, int len,
+                         int bound)
+__contract__(
+  requires(0 <= start && start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start - 1, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N - 1, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len - 1, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N - 1, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  int j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /* 
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j - 1,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len - 1, bound))
+    invariant(array_abs_bound(r, start + len, j + len - 1,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N - 1,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+STATIC_TESTABLE
+void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N - 1, layer * MLKEM_Q - 1))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N - 1, (layer + 1) * MLKEM_Q - 1)))
+{
+  int start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(0 <= start && start < MLKEM_N + 2 * len)
+    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start - 1, (layer * MLKEM_Q - 1) + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N - 1, layer * MLKEM_Q - 1)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q - 1);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ * REF-CHANGE: Removed indirection poly_ntt -> ntt()
+ * and integrated polynomial reduction into the NTT.
+ */
+
+
+void poly_ntt(poly *p)
+{
+  int len, layer;
+  int16_t *r;
+  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, layer * MLKEM_Q - 1)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+/* Check that bound for native NTT implies contractual bound */
+STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
+
+void poly_ntt(poly *p)
+{
+  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  ntt_native(p);
+  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Check that bound for reference invNTT implies contractual bound */
+#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
+STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
+
+/* Compute one layer of inverse NTT */
+STATIC_TESTABLE
+void invntt_layer(int16_t *r, int len, int layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+{
+  int start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q))
+    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    int j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  int j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j - 1, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+/* Check that bound for native invNTT implies contractual bound */
+STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
+
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p);
+  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+/*************************************************
+ * Name:        basemul_cached
+ *
+ * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
+ *              used for multiplication of elements in Rq in NTT domain
+ *
+ *              Bounds:
+ *              - a is assumed to be < q in absolute value.
+ *              - Return value < 3/2 q in absolute value
+ *
+ * Arguments:   - int16_t r[2]: pointer to the output polynomial
+ *              - const int16_t a[2]: pointer to the first factor
+ *              - const int16_t b[2]: pointer to the second factor
+ *              - int16_t b_cached: Cached precomputation of b[1] * zeta
+ **************************************************/
+void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                    int16_t b_cached)
+{
+  int32_t t0, t1;
+
+  BOUND(a, 2, MLKEM_Q, "basemul input bound");
+
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
+
+  /* |ti| < 2 * q * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
+
+  /* |r[i]| < 3/2 q */
+  BOUND(r, 2, 3 * MLKEM_Q / 2, "basemul output bound");
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
new file mode 100644
index 0000000000..0f7b30624b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+#include "arith_native.h"
+#include "cbmc.h"
+#include "params.h"
+#include "poly.h"
+#include "reduce.h"
+
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_Q - 1))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, NTT_BOUND - 1))
+);
+
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, INVNTT_BOUND - 1))
+);
+
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+/************************************************************
+ * Name: basemul_cached
+ *
+ * Description: Computes a representative modulo q of
+ *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
+ *
+ *              If b_cached is b1*zeta, this represents the
+ *              product of (a0 + a1*X) and (b0 + b1*X) in
+ *              Fq[X]/(X^2 - zeta).
+ *
+ * Arguments: - r: Pointer to output polynomial
+ *                   Upon return, coefficients are bound by
+ *                   3*(q+1)/2 in absolute value.
+ *            - a: Pointer to first input polynomial
+ *                   Must be coefficient-wise < q in absolute value.
+ *            - b: Pointer to second input polynomial
+ *                   Can have arbitrary int16_t coefficients
+ *            - b_cached: Some precomputed value, typically derived from
+ *                   b1 and a twiddle factor. Can be an arbitrary int16_t.
+ ************************************************************/
+void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                    int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_abs_bound(a, 0, 1, MLKEM_Q - 1))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 1, (3 * HALF_Q - 1)))
+);
+
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
new file mode 100644
index 0000000000..c5ba1aa656
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include "common.h"
+#include "cpucap.h"
+
+#define KECCAK_WAY 4
+
+
+#ifndef MLKEM_K
+#define MLKEM_K 3 /* Change this for different security strengths */
+#endif
+
+/* Don't change parameters below this line */
+#if (MLKEM_K == 2)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM512_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM512_##s
+#elif (MLKEM_K == 3)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM768_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM768_##s
+#elif (MLKEM_K == 4)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM1024_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM1024_##s
+#else
+#error "MLKEM_K must be in {2,3,4}"
+#endif
+
+#define MLKEM_N 256
+#define MLKEM_Q 3329
+
+#define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
+#define MLKEM_SSBYTES 32  /* size in bytes of shared key */
+
+#define MLKEM_POLYBYTES 384
+#define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
+
+#if MLKEM_K == 2
+#define MLKEM_ETA1 3
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 3
+#define MLKEM_ETA1 2
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 4
+#define MLKEM_ETA1 2
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#endif
+
+#define MLKEM_ETA2 2
+
+#define MLKEM_INDCPA_MSGBYTES (MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_PUBLICKEYBYTES (MLKEM_POLYVECBYTES + MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_SECRETKEYBYTES (MLKEM_POLYVECBYTES)
+#define MLKEM_INDCPA_BYTES \
+  (MLKEM_POLYVECCOMPRESSEDBYTES_DU + MLKEM_POLYCOMPRESSEDBYTES_DV)
+
+#define MLKEM_PUBLICKEYBYTES (MLKEM_INDCPA_PUBLICKEYBYTES)
+/* 2 * 32 bytes of additional space to save H(pk) and the rejection value z */
+#define MLKEM_SECRETKEYBYTES                                   \
+  (MLKEM_INDCPA_SECRETKEYBYTES + MLKEM_INDCPA_PUBLICKEYBYTES + \
+   2 * MLKEM_SYMBYTES)
+#define MLKEM_CIPHERTEXTBYTES (MLKEM_INDCPA_BYTES)
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
new file mode 100644
index 0000000000..93a663c12b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly.h"
+#include <stdint.h>
+#include <string.h>
+#include "cbd.h"
+#include "cbmc.h"
+#include "fips202x4.h"
+#include "ntt.h"
+#include "params.h"
+#include "reduce.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+
+void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+{
+  int j;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  {
+    int k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k >= 0 && k <= 8)
+      invariant(forall(int, r, 0, k - 1, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  {
+    int k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k >= 0 && k <= 4)
+      invariant(forall(int, r, 0, k - 1, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
+#endif
+}
+
+
+void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+{
+  int j;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j - 1, 0, (MLKEM_Q - 1))))
+  {
+    int k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(0 <= k && k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k - 1, 0, (MLKEM_Q - 1))))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j - 1, 0, (MLKEM_Q - 1))))
+  {
+    int k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(0 <= k && k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k - 1, 0, (MLKEM_Q - 1))))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
+#endif
+}
+
+void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+{
+  int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(t, 0, (j-1), 0, 15)))
+    {
+      /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(t, 0, (j-1), 0, 31)))
+    {
+      /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * REF-CHANGE: Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
+#endif
+}
+
+void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+{
+  int i;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    /* REF-CHANGE: Hoist scalar decompression into separate function */
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, (8 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    int j;
+    uint8_t t[8];
+    const int offset = i * 5;
+    /*
+     * REF-CHANGE: Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, (8 * i + j - 1), 0, (MLKEM_Q - 1))))
+    {
+      /* REF-CHANGE: Hoist scalar decompression into separate function */
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
+#endif
+
+  POLY_UBOUND(r, MLKEM_Q);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  POLY_UBOUND(a, MLKEM_Q);
+  poly_tobytes_native(r, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, 4095)))
+  {
+    /* REF-CHANGE: Introduce some locals for better readability */
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  POLY_UBOUND(r, 4096);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  int i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, (8 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    int j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, (8 * i + j - 1), 0, (MLKEM_Q - 1))))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+}
+
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf[KECCAK_WAY][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  shake256x4(buf[0], buf[1], buf[2], buf[3], MLKEM_ETA1 * MLKEM_N / 4,
+             extkey[0], extkey[1], extkey[2], extkey[3], MLKEM_SYMBYTES + 1);
+  poly_cbd_eta1(r0, buf[0]);
+  poly_cbd_eta1(r1, buf[1]);
+  poly_cbd_eta1(r2, buf[2]);
+  poly_cbd_eta1(r3, buf[3]);
+
+  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
+  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
+  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
+  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
+}
+
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  prf(buf, sizeof(buf), seed, nonce);
+  poly_cbd_eta2(r, buf);
+  /* Sampled with eta2, so check against the eta2 bound (not eta1) */
+  POLY_BOUND_MSG(r, MLKEM_ETA2 + 1, "poly_getnoise_eta2 output");
+}
+
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+  shake256x4(buf1[0], buf1[1], buf2[0], buf2[1], MLKEM_ETA1 * MLKEM_N / 4,
+             extkey[0], extkey[1], extkey[2], extkey[3], MLKEM_SYMBYTES + 1);
+#else
+  shake256(buf1[0], sizeof(buf1[0]), extkey[0], sizeof(extkey[0]));
+  shake256(buf1[1], sizeof(buf1[1]), extkey[1], sizeof(extkey[1]));
+  shake256(buf2[0], sizeof(buf2[0]), extkey[2], sizeof(extkey[2]));
+  shake256(buf2[1], sizeof(buf2[1]), extkey[3], sizeof(extkey[3]));
+#endif
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
+  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
+  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
+  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
+}
+
+void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
+                                    const poly_mulcache *b_cache)
+{
+  int i;
+  POLY_BOUND(b_cache, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), (3 * HALF_Q - 1))))
+  {
+    basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
+                   b_cache->coeffs[2 * i]);
+    basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
+                   &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
+  }
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+void poly_tomont(poly *r)
+{
+  int i;
+  const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs ,0, (i - 1), (MLKEM_Q - 1))))
+  {
+    r->coeffs[i] = fqmul(r->coeffs[i], f);
+  }
+
+  POLY_BOUND(r, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
+void poly_tomont(poly *r)
+{
+  poly_tomont_native(r);
+  POLY_BOUND(r, MLKEM_Q);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+void poly_reduce(poly *r)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(array_bound(r->coeffs, 0, (i - 1), 0, (MLKEM_Q - 1))))
+  {
+    /* Barrett reduction, giving signed canonical representative */
+    int16_t t = barrett_reduce(r->coeffs[i]);
+    /* Conditional addition to get unsigned canonical representative */
+    r->coeffs[i] = scalar_signed_to_unsigned_q(t);
+  }
+
+  POLY_UBOUND(r, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
+void poly_reduce(poly *r)
+{
+  poly_reduce_native(r);
+  POLY_UBOUND(r, MLKEM_Q);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+void poly_add(poly *r, const poly *b)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(forall(int, k0, i, MLKEM_N - 1, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(int, k1, 0, i - 1, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
+  {
+    r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+  }
+}
+
+void poly_sub(poly *r, const poly *b)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(forall(int, k0, i, MLKEM_N - 1, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(int, k1, 0, i - 1, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
+  {
+    r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+  }
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  {
+    x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
+    x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+  }
+  POLY_BOUND(x, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+{
+  poly_mulcache_compute_native(x, a);
+  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+}
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
new file mode 100644
index 0000000000..35990684b6
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_H
+#define POLY_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+#include "reduce.h"
+#include "verify.h"
+
+/* Absolute exclusive upper bound for the output of the inverse NTT */
+#define INVNTT_BOUND (8 * MLKEM_Q)
+
+/* Absolute exclusive upper bound for the output of the forward NTT */
+#define NTT_BOUND (8 * MLKEM_Q)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N];
+} ALIGN poly;
+
+/*
+ * INTERNAL representation of precomputed data speeding up
+ * the base multiplication of two polynomials in NTT domain.
+ */
+/*
+ * REF-CHANGE: This structure does not exist in the reference
+ * implementation.
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N >> 1];
+} poly_mulcache;
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q)
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine is carried out in 64-bit
+ * arithmetic, where it cannot wrap around for any 16-bit input u.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine is carried out in 64-bit
+ * arithmetic, where it cannot wrap around for any 16-bit input u.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
+  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
+  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
+
+  /* and therefore cast to uint16_t is safe. */
+  return (uint16_t)c;
+}
+
+#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DU)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+);
+
+#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+);
+
+#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
+ *bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, 4095))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(msg))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+    (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+     r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA2)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#define poly_basemul_montgomery_cached \
+  MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
+/*************************************************
+ * Name:        poly_basemul_montgomery_cached
+ *
+ * Description: Multiplication of two polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *
+ *              The result is coefficient-wise bound by 3/2 q in absolute
+ *              value.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ *              - const poly_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial. Can be computed
+ *                  via poly_mulcache_compute().
+ **************************************************/
+void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
+                                    const poly_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(memory_no_alias(b_cache, sizeof(poly_mulcache)))
+  requires(array_abs_bound(a->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (3 * HALF_Q - 1)))
+);
+
+#define poly_tomont MLKEM_NAMESPACE(poly_tomont)
+/*************************************************
+ * Name:        poly_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+void poly_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))
+);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define poly_mulcache_compute MLKEM_NAMESPACE(poly_mulcache_compute)
+/************************************************************
+ * Name: poly_mulcache_compute
+ *
+ * Description: Computes the mulcache for a polynomial in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(poly_mulcache)))
+  requires(memory_no_alias(a, sizeof(poly)))
+  assigns(object_whole(x))
+);
+
+#define poly_reduce MLKEM_NAMESPACE(poly_reduce)
+/*************************************************
+ * Name:        poly_reduce
+ *
+ * Description: Converts polynomial to _unsigned canonical_ representatives.
+ *
+ *              The input coefficients can be arbitrary integers in int16_t.
+ *              The output coefficients are in [0,1,...,MLKEM_Q-1].
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+/*
+ * REF-CHANGE: The semantics of poly_reduce() is different in
+ * the reference implementation, which requires
+ * signed canonical output data. Unsigned canonical
+ * outputs are better suited to the only remaining
+ * use of poly_reduce() in the context of (de)serialization.
+ */
+void poly_reduce(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+);
+
+#define poly_add MLKEM_NAMESPACE(poly_add)
+/************************************************************
+ * Name: poly_add
+ *
+ * Description: Adds two polynomials in place
+ *
+ * Arguments: - r: Pointer to input-output polynomial to be added to.
+ *            - b: Pointer to input polynomial that should be added
+ *                 to r. Must be disjoint from r.
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ ************************************************************/
+/*
+ * REF-CHANGE:
+ * The reference implementation uses a 3-argument poly_add.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+void poly_add(poly *r, const poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(forall(int, k0, 0, MLKEM_N - 1, (int32_t) r->coeffs[k0] + b->coeffs[k0] <= INT16_MAX))
+  requires(forall(int, k1, 0, MLKEM_N - 1, (int32_t) r->coeffs[k1] + b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(int, k, 0, MLKEM_N - 1, r->coeffs[k] == old(*r).coeffs[k] + b->coeffs[k]))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define poly_sub MLKEM_NAMESPACE(poly_sub)
+/*************************************************
+ * Name:        poly_sub
+ *
+ * Description: Subtract two polynomials; no modular reduction is performed
+ *
+ * Arguments: - poly *r:       Pointer to input-output polynomial to be
+ *                             subtracted from.
+ *            - const poly *b: Pointer to second input polynomial
+ **************************************************/
+/*
+ * REF-CHANGE:
+ * The reference implementation uses a 3-argument poly_sub.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+void poly_sub(poly *r, const poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(forall(int, k0, 0, MLKEM_N - 1, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
+  requires(forall(int, k1, 0, MLKEM_N - 1, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(int, k, 0, MLKEM_N - 1, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
+  assigns(object_whole(r))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
new file mode 100644
index 0000000000..5e4dd0c5c4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "polyvec.h"
+#include <stdint.h>
+#include "arith_native.h"
+#include "config.h"
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+
+#include "debug/debug.h"
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned int k; /* index of the polynomial being compressed */
+  POLYVEC_UBOUND(a, MLKEM_Q);
+  /* Polynomial k is written to its own DU-sized slice of r. */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_compress_du(&r[k * MLKEM_POLYCOMPRESSEDBYTES_DU], a->vec + k);
+  }
+}
+
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned int k; /* index of the polynomial being decompressed */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_decompress_du(r->vec + k, &a[k * MLKEM_POLYCOMPRESSEDBYTES_DU]);
+  }
+  /* Runs after all MLKEM_K polynomials have been written. */
+  POLYVEC_UBOUND(r, MLKEM_Q);
+}
+
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned int k; /* index of the polynomial being serialized */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_tobytes(&r[k * MLKEM_POLYBYTES], a->vec + k);
+  }
+}
+
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned int i; /* unsigned, matching the sibling polyvec loops */
+  for (i = 0; i < MLKEM_K; i++) /* one MLKEM_POLYBYTES chunk of a each */
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+}
+
+void polyvec_ntt(polyvec *r)
+{
+  unsigned int k; /* index of the polynomial being transformed */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_ntt(r->vec + k);
+  }
+}
+
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned int k; /* index of the polynomial being transformed */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_invntt_tomont(r->vec + k);
+  }
+}
+
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *              - b is assumed to be the output of a forward NTT and
+ *                thus coefficient-wise bound by NTT_BOUND
+ *              - b_cache is assumed to be coefficient-wise bound by
+ *                MLKEM_Q.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *            - const polyvec_mulcache *b_cache: mulcache for b
+ **************************************************/
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned int i; /* unsigned, like the other polyvec loops in this file */
+  poly t;         /* product of the i-th pair before it is added into r */
+
+  POLYVEC_BOUND(a, MLKEM_Q);
+  POLYVEC_BOUND(b, NTT_BOUND);
+  POLYVEC_BOUND(b_cache, MLKEM_Q);
+
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+    /* abs bounds: < (i+1) * 3/2 * q */
+  }
+
+  /*
+   * Those bounds are true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus best to omit
+   * them from the spec to not unnecessarily constrain native implementations.
+   */
+  cassert(
+      array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_K * (3 * HALF_Q - 1)),
+      "polyvec_basemul_acc_montgomery_cached output bounds");
+  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
+  POLY_BOUND(r, MLKEM_K * 3 * HALF_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  POLYVEC_BOUND(a, MLKEM_Q);
+  POLYVEC_BOUND(b, NTT_BOUND);
+  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16; the mulcache for b is computed here.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache cache; /* temporary mulcache for b */
+  polyvec_mulcache_compute(&cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &cache);
+}
+
+/*************************************************
+ * Name:        polyvec_mulcache_compute
+ *
+ * Description: Precompute values speeding up
+ *              base multiplications of polynomials
+ *              in NTT domain, one mulcache per vector entry.
+ *
+ * Arguments: - polyvec_mulcache *x: pointer to output cache.
+ *            - const polyvec *a: pointer to input vector of polynomials
+ **************************************************/
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned int k; /* index shared by the cache entry and its polynomial */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_mulcache_compute(x->vec + k, a->vec + k);
+  }
+}
+
+
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Reduces every polynomial of the vector in place by calling
+ *              poly_reduce() on each entry (Barrett reduction of each
+ *              coefficient; for details see the comments in reduce.c).
+ *
+ * Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+ **************************************************/
+void polyvec_reduce(polyvec *r)
+{
+  unsigned int k; /* index of the polynomial being reduced */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_reduce(r->vec + k);
+  }
+}
+
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned int i; /* unsigned, matching the sibling polyvec loops */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]); /* r->vec[i] += b->vec[i] */
+  }
+}
+
+void polyvec_tomont(polyvec *r)
+{
+  unsigned int k; /* index of the polynomial being converted */
+  for (k = 0; k < MLKEM_K; ++k)
+  {
+    poly_tomont(r->vec + k);
+  }
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
new file mode 100644
index 0000000000..7771fd3b28
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+typedef struct
+{
+  poly vec[MLKEM_K]; /* MLKEM_K polynomials, indexed 0 .. MLKEM_K-1 */
+} ALIGN polyvec;
+
+/* REF-CHANGE: This struct does not exist in the reference implementation */
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K]; /* one mulcache per polynomial of a polyvec */
+} polyvec_mulcache;
+
+#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(a->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(a->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output coefficients are unsigned and in 0 .. 4095.
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+        array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, 4095)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (NTT_BOUND - 1))))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (INVNTT_BOUND - 1))))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a,
+                                    const polyvec *b);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+/* Input is coefficient-wise < q in absolute value */
+  requires(forall(int, k1, 0, MLKEM_K - 1,
+ array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output polynomial vector
+ **************************************************/
+/*
+ * REF-CHANGE: The semantics of polyvec_reduce() is different in
+ *             the reference implementation, which requires
+ *             signed canonical output data. Unsigned canonical
+ *             outputs are better suited to the only remaining
+ *             use of poly_reduce() in the context of (de)serialization.
+ */
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+);
+
+#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(int, j0, 0, MLKEM_K - 1,
+          forall(int, k0, 0, MLKEM_N - 1,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(int, j1, 0, MLKEM_K - 1,
+          forall(int, k1, 0, MLKEM_N - 1,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.c
new file mode 100644
index 0000000000..db7baf0f56
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "reduce.h"
+#include <stdint.h>
+#include "params.h"
+
+/* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+static const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
+{
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q
+ *
+ *              Bounds: For any C such that |a| < q * C, the return value
+ *              has absolute value < q (C/2^16 + 1/2).
+ *
+ *              Notable special cases:
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < q * C with a signed-canonical value ( < q/2 ) has
+ *                absolute value < q * (0.0254 * C + 1/2).
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < q * C with a value t of |t| < q has absolute value
+ *                < q * (0.0508 * C + 1/2).
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < C with a value of abs < q has absolute value
+ *                < q (C/2^16 + 1/2).
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
+{
+  /*
+   * Bounds on paper, for the return value r:
+   * - Case |a| < q * C, for some C
+   * |r| <= |a|/2^16 + |t|*q/2^16
+   *      < q * C / 2^16 + q/2
+   *      = q (C/2^16 + 1/2)
+   * - Case |a| < (q/2) * C * q, for some C
+   * Replace C -> C * q in the above and estimate
+   * q / 2^17 < 0.0254.
+   */
+
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
+
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
+
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+
+  return (int16_t)r;
+}
+
+int16_t montgomery_reduce(int32_t a)
+{
+  int16_t res;
+  SCALAR_BOUND(a, 2 * MLKEM_Q * 32768, "montgomery_reduce input");
+
+  res = montgomery_reduce_generic(a);
+
+  SCALAR_BOUND(res, (3 * (MLKEM_Q + 1)) / 2, "montgomery_reduce output");
+  return res;
+}
+
+int16_t fqmul(int16_t a, int16_t b)
+{
+  int16_t res;
+  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+
+  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  return res;
+}
+
+/*
+ * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+ * multiplier is round_to_nearest(2**26/MLKEM_Q)
+ */
+#define BPOWER 26
+static const int32_t barrett_multiplier =
+    ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
+
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+int16_t barrett_reduce(int16_t a)
+{
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
+
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  return (int16_t)(a - t * MLKEM_Q);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
new file mode 100644
index 0000000000..2a486cf3ec
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "debug/debug.h"
+#include "params.h"
+
+#define MONT (-1044)               /* 2^16 mod q */
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
+
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * q * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 3/2 q in absolute value.
+ **************************************************/
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * MLKEM_Q * 32768))
+  requires(a <  (2 * MLKEM_Q * 32768))
+  ensures(return_value > -(3 * HALF_Q) && return_value < (3 * HALF_Q))
+);
+
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+);
+
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+#define fqmul MLKEM_NAMESPACE(fqmul)
+int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+);
+
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
new file mode 100644
index 0000000000..4e8a5ce9b2
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "params.h"
+
+#include "arith_native.h"
+#include "rej_uniform.h"
+
+/*************************************************
+ * Name:        rej_uniform_scalar
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr - 1, 0, (MLKEM_Q - 1))))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+    return offset + (unsigned)ret;
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
new file mode 100644
index 0000000000..aeb9cc3eb0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef REJ_UNIFORM_H
+#define REJ_UNIFORM_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "params.h"
+
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * REF-CHANGE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset - 1, 0, (MLKEM_Q - 1)))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value - 1, 0, (MLKEM_Q - 1)))
+);
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric-shake.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric-shake.c
new file mode 100644
index 0000000000..5dd8c10d92
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric-shake.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "fips202.h"
+#include "params.h"
+#include "symmetric.h"
+
+void mlkem_shake256_prf(uint8_t *out, size_t outlen,
+                        const uint8_t key[MLKEM_SYMBYTES], uint8_t nonce)
+{
+  uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, key, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+
+  shake256(out, outlen, extkey, sizeof(extkey));
+}
+
+void mlkem_shake256_rkprf(uint8_t out[MLKEM_SSBYTES],
+                          const uint8_t key[MLKEM_SYMBYTES],
+                          const uint8_t input[MLKEM_CIPHERTEXTBYTES])
+{
+  shake256incctx s;
+
+  shake256_inc_init(&s);
+  shake256_inc_absorb(&s, key, MLKEM_SYMBYTES);
+  shake256_inc_absorb(&s, input, MLKEM_CIPHERTEXTBYTES);
+  shake256_inc_finalize(&s);
+  shake256_inc_squeeze(out, MLKEM_SSBYTES, &s);
+  shake256_inc_ctx_release(&s);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h
new file mode 100644
index 0000000000..202741a7b3
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SYMMETRIC_H
+#define SYMMETRIC_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+
+#include "fips202.h"
+
+#include "cbmc.h"
+
+#define mlkem_shake256_prf MLKEM_NAMESPACE(mlkem_shake256_prf)
+/*************************************************
+ * Name:        mlkem_shake256_prf
+ *
+ * Ref:         FIPS-203 Section 4.1. Function PRF (eq 4.3)
+ *
+ * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
+ *              and then generates outlen bytes of SHAKE256 output
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - size_t outlen: number of requested output bytes
+ *              - const uint8_t *key: pointer to the key (of length
+ *                MLKEM_SYMBYTES)
+ *              - uint8_t nonce: single-byte nonce (public PRF input)
+ *
+ *              out and key may NOT be aliased.
+ **************************************************/
+void mlkem_shake256_prf(uint8_t *out, size_t outlen,
+                        const uint8_t key[MLKEM_SYMBYTES], uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(out, outlen))
+  requires(memory_no_alias(key, MLKEM_SYMBYTES))
+  assigns(memory_slice(out, outlen))
+);
+
+#define mlkem_shake256_rkprf MLKEM_NAMESPACE(mlkem_shake256_rkprf)
+/*************************************************
+ * Name:        mlkem_shake256_rkprf
+ *
+ * Ref:         FIPS-203 Section 4.1. Hash function J
+ *
+ * Description: Usage of SHAKE256 as a PRF, concatenates key with input
+ *              and then generates MLKEM_SSBYTES bytes of SHAKE256 output
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - const uint8_t *key: pointer to the key (of length
+ *                MLKEM_SYMBYTES)
+ *              - const uint8_t *input: pointer to the input (of length
+ *                MLKEM_CIPHERTEXTBYTES)
+ *
+ *              out, key, and input may NOT be aliased.
+ **************************************************/
+void mlkem_shake256_rkprf(uint8_t out[MLKEM_SSBYTES],
+                          const uint8_t key[MLKEM_SYMBYTES],
+                          const uint8_t input[MLKEM_CIPHERTEXTBYTES])
+__contract__(
+  requires(memory_no_alias(out, MLKEM_SSBYTES))
+  requires(memory_no_alias(key, MLKEM_SYMBYTES))
+  requires(memory_no_alias(input, MLKEM_CIPHERTEXTBYTES))
+  assigns(memory_slice(out, MLKEM_SSBYTES))
+);
+
+
+/* Macros denoting FIPS-203 specific Hash functions */
+
+/* Hash function H, FIPS-203 4.1 (eq 4.4) */
+#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
+
+/* Hash function G, FIPS-203 4.1 (eq 4.5) */
+#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
+
+/* Macros denoting FIPS-203 specific PRFs */
+#define prf(OUT, OUTBYTES, KEY, NONCE) \
+  mlkem_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
+#define rkprf(OUT, KEY, INPUT) mlkem_shake256_rkprf(OUT, KEY, INPUT)
+
+#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c
new file mode 100644
index 0000000000..b5b71e023e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "verify.h"
+
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+/*
+ * Masking value used in constant-time functions from
+ * verify.h to block the compiler's range analysis and
+ * thereby reduce the risk of compiler-introduced branches.
+ */
+volatile uint64_t ct_opt_blocker_u64 = 0;
+
+#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+int MLKEM_NAMESPACE(empty_cu_verify); /* namespaced like ct_opt_blocker_u64 */
+
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h
new file mode 100644
index 0000000000..5c62223c3d
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef VERIFY_H
+#define VERIFY_H
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+
+/* Constant-time comparisons and conditional operations
+
+   We reduce the risk for compilation into variable-time code
+   through the use of 'value barriers'.
+
+   Functionally, a value barrier is a no-op. To the compiler, however,
+   it constitutes an arbitrary modification of its input, and therefore
+   hinders value propagation and range analysis.
+
+   We consider two approaches to implement a value barrier:
+   - An empty inline asm block which marks the target value as clobbered.
+   - XOR'ing with the value of a volatile global that's set to 0.
+
+   The first approach is cheap because it only prevents the compiler
+   from reasoning about the value of the variable past the barrier,
+   but does not directly generate additional instructions.
+
+   The second approach generates redundant loads and XOR operations
+   and therefore comes at a higher runtime cost. However, it appears
+   more robust towards optimization, as compilers should never drop
+   a volatile load.
+
+   We use the empty-ASM value barrier for GCC and clang, and fall
+   back to the global volatile barrier otherwise.
+
+   The global value barrier can be forced by setting MLKEM_NO_ASM_VALUE_BARRIER.
+
+*/
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(CBMC) && \
+    !defined(MLKEM_NO_ASM_VALUE_BARRIER)
+#define MLKEM_USE_ASM_VALUE_BARRIER
+#endif
+
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+
+/*
+ * Declaration of global volatile that the global value barrier
+ * is loading from and masking with.
+ */
+#define ct_opt_blocker_u64 MLKEM_NAMESPACE(ct_opt_blocker_u64)
+extern volatile uint64_t ct_opt_blocker_u64;
+
+/* Helper functions for obtaining masks of various sizes */
+STATIC_INLINE_TESTABLE uint8_t get_optblocker_u8(void)
+__contract__(ensures(return_value == 0)) { return (uint8_t)ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE uint32_t get_optblocker_u32(void)
+__contract__(ensures(return_value == 0)) { return ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE int32_t get_optblocker_i32(void)
+__contract__(ensures(return_value == 0)) { return (int32_t)ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE uint32_t value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_u32()); }
+
+STATIC_INLINE_TESTABLE int32_t value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_i32()); }
+
+STATIC_INLINE_TESTABLE uint8_t value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_u8()); }
+
+#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+STATIC_INLINE_TESTABLE uint32_t value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b))
+{
+  asm("" : "+r"(b));
+  return b;
+}
+
+STATIC_INLINE_TESTABLE int32_t value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b))
+{
+  asm("" : "+r"(b));
+  return b;
+}
+
+STATIC_INLINE_TESTABLE uint8_t value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b))
+{
+  asm("" : "+r"(b));
+  return b;
+}
+
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+/*
+ * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
+ * overflow, which is fully defined behaviour in C. It is thus safe to disable
+ * this warning.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/*************************************************
+ * Name:        ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments:   uint16_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint16_t ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+  uint32_t tmp = value_barrier_u32(-((uint32_t)x));
+  tmp >>= 16;
+  return tmp;
+}
+
+/*************************************************
+ * Name:        ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments:   uint8_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+  uint32_t tmp = value_barrier_u32(-((uint32_t)x));
+  tmp >>= 24;
+  return tmp;
+}
+
+/* Put unsigned overflow warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*
+ * The ct_cmask_neg_i16 function below makes deliberate use of
+ * signed to unsigned integer conversion, which is fully defined
+ * behaviour in C. It is thus safe to disable this warning.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        ct_cmask_neg_i16
+ *
+ * Description: Return 0 if input is non-negative, and -1 otherwise.
+ *
+ * Arguments:   int16_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint16_t ct_cmask_neg_i16(int16_t x)
+__contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
+{
+  int32_t tmp = value_barrier_i32((int32_t)x);
+  tmp >>= 16;
+  return (int16_t)tmp;
+}
+
+/* Put unsigned-to-signed warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*
+ * The ct_csel_xxx functions below make deliberate use of unsigned
+ * to signed integer conversion, which is implementation-defined
+ * behaviour. Here, we assume that uint16_t -> int16_t is inverse
+ * to int16_t -> uint16_t.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        ct_sel_int16
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   int16_t a:       First alternative
+ *              int16_t b:       Second alternative
+ *              uint16_t cond:   Condition variable.
+ **************************************************/
+STATIC_INLINE_TESTABLE int16_t ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  uint16_t au = a, bu = b;
+  uint16_t res = bu ^ (ct_cmask_nonzero_u16(cond) & (au ^ bu));
+  return (int16_t)res;
+}
+
+/* Put unsigned-to-signed warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        ct_sel_uint8
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   uint8_t a:       First alternative
+ *              uint8_t b:       Second alternative
+ *              uint8_t cond:    Condition variable.
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_sel_uint8(uint8_t a, uint8_t b, uint8_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  return b ^ (ct_cmask_nonzero_u8(cond) & (a ^ b));
+}
+
+/*************************************************
+ * Name:        ct_memcmp
+ *
+ * Description: Compare two arrays for equality in constant time.
+ *
+ * Arguments:   const uint8_t *a: pointer to first byte array
+ *              const uint8_t *b: pointer to second byte array
+ *              size_t len:       length of the byte arrays
+ *
+ * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_memcmp(const uint8_t *a, const uint8_t *b,
+                                         const size_t len)
+__contract__(
+  requires(memory_no_alias(a, len))
+  requires(memory_no_alias(b, len))
+  requires(len <= INT_MAX)
+  ensures((return_value == 0) == forall(int, i, 0, ((int)len - 1), (a[i] == b[i]))))
+{
+  uint8_t r = 0, s = 0;
+
+  /*
+   * Switch to a _signed_ ilen value, so that our loop counter
+   * can also be signed, and thus (i - 1) in the loop invariant
+   * can yield -1 as required.
+   */
+  const int ilen = (int)len;
+  int i;
+
+  for (i = 0; i < ilen; i++)
+  __loop__(
+    invariant(i >= 0 && i <= ilen)
+    invariant((r == 0) == (forall(int, k, 0, (i - 1), (a[k] == b[k])))))
+  {
+    r |= a[i] ^ b[i];
+    /* s is useless, but prevents the loop from being aborted once r=0xff. */
+    s ^= a[i] ^ b[i];
+  }
+
+  /*
+   * - Convert r into a mask; this may not be necessary, but is an additional
+   *   safeguard
+   *   towards leaking information about a and b.
+   * - XOR twice with s, separated by a value barrier, to prevent the compiler
+   *   from dropping the s computation in the loop.
+   */
+  return (value_barrier_u8(ct_cmask_nonzero_u8(r) ^ s) ^ s);
+}
+
+/*************************************************
+ * Name:        ct_cmov_zero
+ *
+ * Description: Copy len bytes from x to r if b is zero;
+ *              don't modify r if b is non-zero.
+ *              assumes two's complement representation of negative integers.
+ *              Runs in constant time.
+ *
+ * Arguments:   uint8_t *r:       pointer to output byte array
+ *              const uint8_t *x: pointer to input byte array
+ *              size_t len:       Amount of bytes to be copied
+ *              uint8_t b:        Condition value.
+ **************************************************/
+STATIC_INLINE_TESTABLE
+void ct_cmov_zero(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
+__contract__(
+  requires(memory_no_alias(r, len))
+  requires(memory_no_alias(x, len))
+  assigns(memory_slice(r, len)))
+{
+  size_t i;
+  for (i = 0; i < len; i++)
+  __loop__(invariant(i <= len))
+  {
+    r[i] = ct_sel_uint8(r[i], x[i], b);
+  }
+}
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
new file mode 100644
index 0000000000..f52b2ff5ad
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogenerate_files.py
+ *          Do not modify it directly.
+ */
+
+#include "ntt.h"
+
+/*
+ * Table of zeta values used in the reference NTT and inverse NTT.
+ * See autogenerate_files.py for details.
+ */
+const int16_t zetas[128] = {
+    -1044, -758,  -359,  -1517, 1493,  1422,  287,   202,  -171,  622,   1577,
+    182,   962,   -1202, -1474, 1468,  573,   -1325, 264,  383,   -829,  1458,
+    -1602, -130,  -681,  1017,  732,   608,   -1542, 411,  -205,  -1571, 1223,
+    652,   -552,  1015,  -1293, 1491,  -282,  -1544, 516,  -8,    -320,  -666,
+    -1618, -1162, 126,   1469,  -853,  -90,   -271,  830,  107,   -1421, -247,
+    -951,  -398,  961,   -1508, -725,  448,   -1065, 677,  -1275, -1103, 430,
+    555,   843,   -1251, 871,   1550,  105,   422,   587,  177,   -235,  -291,
+    -460,  1574,  1653,  -246,  778,   1159,  -147,  -777, 1483,  -602,  1119,
+    -1590, 644,   -872,  349,   418,   329,   -156,  -75,  817,   1097,  603,
+    610,   1322,  -1285, -1465, 384,   -1215, -136,  1218, -1335, -874,  220,
+    -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
+    -308,  996,   991,   958,   -1460, 1522,  1628,
+};
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/LICENSE b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/LICENSE
similarity index 100%
rename from src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/LICENSE
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_ref/LICENSE
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h
new file mode 100644
index 0000000000..94597323f1
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef API_H
+#define API_H
+
+#include <stdint.h>
+
+#define PQCP_MLKEM_NATIVE_MLKEM512_SECRETKEYBYTES 1632
+#define PQCP_MLKEM_NATIVE_MLKEM512_PUBLICKEYBYTES 800
+#define PQCP_MLKEM_NATIVE_MLKEM512_CIPHERTEXTBYTES 768
+#define PQCP_MLKEM_NATIVE_MLKEM512_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM512_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM512_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM512_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                              const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM512_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM512_enc_derand(uint8_t *ct, uint8_t *ss,
+                                          const uint8_t *pk,
+                                          const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM512_dec(uint8_t *ss, const uint8_t *ct,
+                                   const uint8_t *sk);
+
+#define PQCP_MLKEM_NATIVE_MLKEM768_SECRETKEYBYTES 2400
+#define PQCP_MLKEM_NATIVE_MLKEM768_PUBLICKEYBYTES 1184
+#define PQCP_MLKEM_NATIVE_MLKEM768_CIPHERTEXTBYTES 1088
+#define PQCP_MLKEM_NATIVE_MLKEM768_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM768_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM768_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM768_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                              const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM768_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM768_enc_derand(uint8_t *ct, uint8_t *ss,
+                                          const uint8_t *pk,
+                                          const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM768_dec(uint8_t *ss, const uint8_t *ct,
+                                   const uint8_t *sk);
+
+#define PQCP_MLKEM_NATIVE_MLKEM1024_SECRETKEYBYTES 3168
+#define PQCP_MLKEM_NATIVE_MLKEM1024_PUBLICKEYBYTES 1568
+#define PQCP_MLKEM_NATIVE_MLKEM1024_CIPHERTEXTBYTES 1568
+#define PQCP_MLKEM_NATIVE_MLKEM1024_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM1024_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM1024_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM1024_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                               const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM1024_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM1024_enc_derand(uint8_t *ct, uint8_t *ss,
+                                           const uint8_t *pk,
+                                           const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM1024_enc(uint8_t *ct, uint8_t *ss,
+                                    const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM1024_dec(uint8_t *ss, const uint8_t *ct,
+                                    const uint8_t *sk);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_native.h
new file mode 100644
index 0000000000..b7e921323a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_native.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_ARITH_NATIVE_H
+#define MLKEM_ARITH_NATIVE_H
+
+#include <stdint.h>
+#include "config.h"
+#include "params.h"
+
+#if defined(MLKEM_USE_NATIVE)
+
+#include "poly.h"
+#include "polyvec.h"
+#include "profile.h"
+
+/*
+ * MLKEM native arithmetic interface
+ *
+ * This is the C<->native arithmetic interface used in this repository
+ * to allow for the drop-in of native code for performance critical
+ * components of ML-KEM.
+ *
+ * A _profile_ is a specific implementation of (part of) this interface.
+ * The active profile (if any) is determined in profile.h.
+ *
+ * To add a function to a profile, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around
+ * the chosen native implementation. They are static inline
+ * to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(poly *);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for
+ *              NTT, invNTT, basemul, mulcache and to/from bytes conversion.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(poly *);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(poly *);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(poly *);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(poly *);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
+                                                const poly *poly);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomials in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    poly *r, const polyvec *a, const polyvec *b,
+    const polyvec_mulcache *b_cache);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const poly *a);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: De-serialization of a polynomial from
+ *              a byte array; counterpart of
+ *              poly_tobytes_native.
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(poly *a,
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_USE_NATIVE */
+#endif /* MLKEM_ARITH_NATIVE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
new file mode 100644
index 0000000000..073f3c81d7
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "cbd.h"
+#include <stdint.h>
+#include "params.h"
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+#if MLKEM_ETA1 == 3
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+#endif
+
+/*************************************************
+ * Name:        cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, (8 * i - 1), 2)))
+  {
+    int j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j - 1, 2)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+/*************************************************
+ * Name:        cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+#if MLKEM_ETA1 == 3
+static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), 3)))
+  {
+    int j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j - 1, 3)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif
+
+void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+{
+#if MLKEM_ETA1 == 2
+  cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  cbd3(r, buf);
+#else
+#error "This implementation requires eta1 in {2,3}"
+#endif
+}
+
+void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+{
+#if MLKEM_ETA2 == 2
+  cbd2(r, buf);
+#else
+#error "This implementation requires eta2 = 2"
+#endif
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
new file mode 100644
index 0000000000..4dc8635bb5
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef CBD_H
+#define CBD_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA1))
+);
+
+#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h
new file mode 100644
index 0000000000..317a26421b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/***************************************************
+ * Basic replacements for __CPROVER_XXX contracts
+ ***************************************************/
+
+#include "common.h"
+
+#ifndef CBMC
+
+#define STATIC_INLINE_TESTABLE static INLINE
+#define STATIC_TESTABLE static
+
+#define __contract__(x)
+#define __loop__(x)
+#define cassert(x, y)
+
+#else /* CBMC _is_ defined, therefore we're doing proof */
+
+/* expose certain procedures to CBMC proofs that are static otherwise */
+#define STATIC_TESTABLE
+#define STATIC_INLINE_TESTABLE
+
+#define __contract__(x) x
+#define __loop__(x) x
+
+/* https://diffblue.github.io/cbmc/contracts-assigns.html */
+#define assigns(...) __CPROVER_assigns(__VA_ARGS__)
+
+/* https://diffblue.github.io/cbmc/contracts-requires-ensures.html */
+#define requires(...) __CPROVER_requires(__VA_ARGS__)
+#define ensures(...) __CPROVER_ensures(__VA_ARGS__)
+/* https://diffblue.github.io/cbmc/contracts-loops.html */
+#define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
+#define decreases(...) __CPROVER_decreases(__VA_ARGS__)
+/* cassert to avoid confusion with in-built assert */
+#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define assume(...) __CPROVER_assume(__VA_ARGS__)
+
+/***************************************************
+ * Macros for "expression" forms that may appear
+ * _inside_ top-level contracts.
+ ***************************************************/
+
+/*
+ * function return value - useful inside ensures
+ * https://diffblue.github.io/cbmc/contracts-functions.html
+ */
+#define return_value (__CPROVER_return_value)
+
+/*
+ * assigns l-value targets
+ * https://diffblue.github.io/cbmc/contracts-assigns.html
+ */
+#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
+#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
+#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
+
+/*
+ * Pointer-related predicates
+ * https://diffblue.github.io/cbmc/contracts-memory-predicates.html
+ */
+#define memory_no_alias(...) __CPROVER_is_fresh(__VA_ARGS__)
+#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
+#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+
+/*
+ * History variables
+ * https://diffblue.github.io/cbmc/contracts-history-variables.html
+ */
+#define old(...) __CPROVER_old(__VA_ARGS__)
+#define loop_entry(...) __CPROVER_loop_entry(__VA_ARGS__)
+
+/*
+ * Quantifiers
+ * Note that the range on qvar is _inclusive_ between qvar_lb .. qvar_ub
+ * https://diffblue.github.io/cbmc/contracts-quantifiers.html
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define forall(type, qvar, qvar_lb, qvar_ub, predicate)           \
+  __CPROVER_forall                                                \
+  {                                                               \
+    type qvar;                                                    \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) ==> (predicate)  \
+  }
+
+#define EXISTS(type, qvar, qvar_lb, qvar_ub, predicate)         \
+  __CPROVER_exists                                              \
+  {                                                             \
+    type qvar;                                                  \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) && (predicate) \
+  }
+/* clang-format on */
+
+/***************************************************
+ * Convenience macros for common contract patterns
+ ***************************************************/
+
+/*
+ * Boolean-valued predicate that asserts that "all values of array_var are in
+ * range value_lb .. value_ub (inclusive)"
+ * Example:
+ *  array_bound(a->coeffs, 0, MLKEM_N-1, -(MLKEM_Q - 1), MLKEM_Q - 1)
+ * expands to
+ *  __CPROVER_forall { int k; (0 <= k && k <= MLKEM_N-1) ==> ( (-(MLKEM_Q -
+ *  1) <= a->coeffs[k]) && (a->coeffs[k] <= (MLKEM_Q - 1))) }
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define CBMC_CONCAT_(left, right) left##right
+#define CBMC_CONCAT(left, right) CBMC_CONCAT_(left, right)
+
+#define array_bound_core(indextype, qvar, qvar_lb, qvar_ub, array_var, \
+                         value_lb, value_ub)                           \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    indextype qvar;                                                    \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) ==>                   \
+        (((value_lb) <= (array_var[(qvar)])) &&                        \
+        ((array_var[(qvar)]) <= (value_ub)))                           \
+  }
+
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+  array_bound_core(int, CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+                   (qvar_ub), (array_var), (value_lb), (value_ub))
+
+
+/* Wrapper around array_bound operating on absolute values */
+#define array_abs_bound(arr, lb, ub, k) \
+  array_bound((arr), (lb), (ub), (-(k)), (k))
+/* clang-format on */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
new file mode 100644
index 0000000000..94c29ed927
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMMON_H
+#define COMMON_H
+
+
+/*
+ * C90 does not have the inline compiler directive yet.
+ * We don't use it in C90 builds.
+ * However, in that case the compiler warns about some inline functions in
+ * header files not being used in every compilation unit that includes that
+ * header. To work around it we silence that warning in that case using
+ * __attribute__((unused)).
+ */
+
+/* Do not use inline for C90 builds*/
+#if !defined(inline)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define INLINE inline
+#define ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define INLINE __inline
+#define ALWAYS_INLINE __forceinline
+#else
+#define INLINE __attribute__((unused))
+#define ALWAYS_INLINE
+#endif
+
+#else
+#define INLINE inline
+#define ALWAYS_INLINE __attribute__((always_inline))
+#endif
+
+
+/*
+ * C90 does not have the restrict compiler directive yet.
+ * We don't use it in C90 builds.
+ */
+#if !defined(restrict)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define RESTRICT restrict
+#else
+#define RESTRICT
+#endif
+
+#else
+
+#define RESTRICT restrict
+#endif
+
+#define DEFAULT_ALIGN 32
+#if defined(_WIN32)
+#define ALIGN __declspec(align(DEFAULT_ALIGN))
+#define asm __asm
+#else
+#define asm __asm__
+#define ALIGN __attribute__((aligned(DEFAULT_ALIGN)))
+#endif
+
+#define MLKEM_CONCAT_(left, right) left##right
+#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
new file mode 100644
index 0000000000..370a141a65
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#include "cpucap.h"
+
+#if defined(MLKEM_USE_NATIVE)
+
+#if defined(SYS_AARCH64)
+#define MLKEM_USE_NATIVE_AARCH64
+#endif /* SYS_AARCH64 */
+
+#if defined(SYS_X86_64)
+#define MLKEM_USE_NATIVE_X86_64
+#endif /* SYS_X86_64 */
+
+#endif /* MLKEM_USE_NATIVE */
+#endif /* CONFIG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cpucap.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cpucap.h
new file mode 100644
index 0000000000..cfcbbc3fe9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cpucap.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CPUCAP_H
+#define CPUCAP_H
+
+/* Check if we're running on an AArch64 system. _M_ARM64 is set by MSVC. */
+#if defined(__AARCH64EL__) || defined(_M_ARM64)
+#define SYS_AARCH64
+#endif
+
+#if defined(__x86_64__)
+#define SYS_X86_64
+#if defined(__AVX2__)
+#define SYS_X86_64_AVX2
+#endif
+#endif /* __x86_64__ */
+
+/* Check endianness */
+#if defined(__BYTE_ORDER__)
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define SYS_LITTLE_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define SYS_BIG_ENDIAN
+#else /* __BYTE_ORDER__ */
+#error "__BYTE_ORDER__ defined, but don't recognize value."
+#endif /* __BYTE_ORDER__ */
+#endif /* defined(__BYTE_ORDER__) */
+
+/* If FORCE_AARCH64 is set, assert that we're indeed on an AArch64 system. */
+#if defined(FORCE_AARCH64) && !defined(SYS_AARCH64)
+#error "FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
+#endif
+
+/* If FORCE_X86_64 is set, assert that we're indeed on an X86_64 system. */
+#if defined(FORCE_X86_64) && !defined(SYS_X86_64)
+#error "FORCE_X86_64 is set, but we don't seem to be on an X86_64 system."
+#endif
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c
new file mode 100644
index 0000000000..aa9b578074
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#define _ISOC99_SOURCE
+#include "debug.h"
+#include <stdio.h>
+
+#if defined(MLKEM_DEBUG)
+
+static char debug_buf[256];
+
+void mlkem_debug_assert(const char *file, int line, const char *description,
+                        const int val)
+{
+  /* A non-zero value means the assertion holds: nothing to report. */
+  if (val != 0)
+    return;
+  snprintf(debug_buf, sizeof(debug_buf), "Assertion failed: %s (value %d)",
+           description, val);
+  mlkem_debug_print_error(file, line, debug_buf);
+  exit(1);
+}
+void mlkem_debug_check_bounds(const char *file, int line,
+                              const char *description, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int failed = 0;
+  unsigned idx; /* every violating entry is reported before exiting */
+  for (idx = 0; idx < len; idx++)
+  {
+    const int coeff = ptr[idx];
+    if (coeff <= lower_bound_exclusive || coeff >= upper_bound_exclusive)
+    {
+      snprintf(debug_buf, sizeof(debug_buf),
+               "%s, index %u, value %d out of bounds (%d,%d)", description,
+               idx, coeff, lower_bound_exclusive, upper_bound_exclusive);
+      mlkem_debug_print_error(file, line, debug_buf);
+      failed = 1;
+    }
+  }
+
+  if (failed)
+    exit(1);
+}
+
+void mlkem_debug_print_error(const char *file, int line, const char *msg)
+{
+  fprintf(stderr, "[ERROR:%s:%04d] %s\n", file, line, msg);
+  fflush(stderr); /* flush now: callers in this file go on to exit(1) */
+}
+
+#else /* MLKEM_DEBUG */
+
+extern int empty_cu_debug; /* declaration only: non-empty TU, no symbol */
+
+#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h
new file mode 100644
index 0000000000..65208771d2
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check a debug assertion.
+ *
+ *              If val is zero, prints an error message to stderr
+ *              and calls exit(1); otherwise does nothing.
+ *
+ * Arguments:   - file: filename reported in the error message
+ *              - line: line number reported in the error message
+ *              - description: Textual description of assertion
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+void mlkem_debug_assert(const char *file, int line, const char *description,
+                        const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether all values in an array of int16_t lie
+ *              strictly between the two given bounds.
+ *
+ *              Prints an error message to stderr for each violating
+ *              entry and then calls exit(1) if there was any.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - description: Textual description of check
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+void mlkem_debug_check_bounds(const char *file, int line,
+                              const char *description, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Print msg to stderr, prefixed with file and line, then flush stderr */
+void mlkem_debug_print_error(const char *file, int line, const char *msg);
+
+/* Check assertion at the use site, calling exit(1) upon failure
+ *
+ * val: Value that's asserted to be non-zero (passed on as an int)
+ * msg: Message to print on failure
+ *
+ * Currently called CASSERT to avoid clash with CBMC assert.
+ */
+#define CASSERT(val, msg)                                 \
+  do                                                      \
+  {                                                       \
+    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
+  } while (0)
+
+/* Check absolute bounds of scalar: -abs_bound < val < abs_bound
+ * val: Scalar to be checked (expanded twice by this macro)
+ * abs_bound: Exclusive upper bound on absolute value (expanded twice)
+ * msg: Message to print on failure */
+#define SCALAR_BOUND(val, abs_bound, msg) \
+  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
+
+/* Check that all coefficients in array of int16_t's are non-negative
+ * and below an exclusive upper bound.
+ *
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * high_bound: Exclusive upper bound on value to check
+ * msg: Message to print on failure */
+#define UBOUND(ptr, len, high_bound, msg)                                 \
+  do                                                                      \
+  {                                                                       \
+    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
+                             (len), -1, ((high_bound)));                  \
+  } while (0)
+
+/* Check absolute bounds in array of int16_t's: -abs_bound < x < abs_bound
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * abs_bound: Exclusive upper bound on absolute value (expanded twice)
+ * msg: Message to print on failure */
+#define BOUND(ptr, len, abs_bound, msg)                                   \
+  do                                                                      \
+  {                                                                       \
+    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
+                             (len), -(abs_bound), (abs_bound));           \
+  } while (0)
+
+/* Check absolute bounds on all coefficients in polynomial or mulcache
+ * ptr: poly* or poly_mulcache* to check; length is sizeof its coeffs array
+ * abs_bound: Exclusive upper bound on absolute value to check
+ * msg: Message to print on failure */
+#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
+  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
+        msg)
+
+/* Check unsigned bounds on all coefficients in polynomial or mulcache
+ * ptr: poly* or poly_mulcache* to check; length is sizeof its coeffs array
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ * msg: Message to print on failure */
+#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
+  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
+         msg)
+
+/* Check absolute bounds on coefficients in polynomial
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * abs_bound: Exclusive upper bound on absolute value to check */
+#define POLY_BOUND(ptr, abs_bound) \
+  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
+
+/* Check unsigned bounds on coefficients in polynomial
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ */
+#define POLY_UBOUND(ptr, ubound) \
+  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
+
+/* Check absolute bounds on coefficients in all MLKEM_K entries of a vector
+ * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
+ * abs_bound: Exclusive upper bound on absolute value to check */
+#define POLYVEC_BOUND(ptr, abs_bound)                                      \
+  do                                                                       \
+  {                                                                        \
+    unsigned _debug_polyvec_bound_idx;                                     \
+    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
+         _debug_polyvec_bound_idx++)                                       \
+      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
+                     "polyvec absolute bound for " #ptr ".vec[i]");        \
+  } while (0)
+
+/* Check unsigned bounds on coefficients in all MLKEM_K entries of a vector
+ * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ */
+#define POLYVEC_UBOUND(ptr, ubound)                                        \
+  do                                                                       \
+  {                                                                        \
+    unsigned _debug_polyvec_bound_idx;                                     \
+    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
+         _debug_polyvec_bound_idx++)                                       \
+      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
+                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
+  } while (0)
+
+/* Following AWS-LC to define a C99-compliant static assert */
+#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
+  typedef struct                                                         \
+  {                                                                      \
+    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
+  } MLKEM_CONCAT(static_assertion_, msg) __attribute__((unused));
+/* A false cond gives bit-field width -1; names embed __LINE__ and the tag. */
+#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
+  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
+#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
+#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
+#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
+#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
+
+#else /* !MLKEM_DEBUG: checks expand to no-ops; arguments are not evaluated */
+
+#define CASSERT(val, msg) \
+  do                      \
+  {                       \
+  } while (0)
+#define SCALAR_BOUND(val, abs_bound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define BOUND(ptr, len, abs_bound, msg) \
+  do                                    \
+  {                                     \
+  } while (0)
+#define POLY_BOUND(ptr, abs_bound) \
+  do                               \
+  {                                \
+  } while (0)
+#define POLYVEC_BOUND(ptr, abs_bound) \
+  do                                  \
+  {                                   \
+  } while (0)
+#define POLY_BOUND_MSG(ptr, abs_bound, msg) \
+  do                                        \
+  {                                         \
+  } while (0) /* parameter list mirrors the MLKEM_DEBUG definition */
+#define UBOUND(ptr, len, high_bound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define POLY_UBOUND(ptr, ubound) \
+  do                             \
+  {                              \
+  } while (0)
+#define POLYVEC_UBOUND(ptr, ubound) \
+  do                                \
+  {                                 \
+  } while (0)
+#define POLY_UBOUND_MSG(ptr, ubound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define STATIC_ASSERT(cond, error)
+
+#endif /* MLKEM_DEBUG */
+
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
new file mode 100644
index 0000000000..669460c29c
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "indcpa.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "fips202.h"
+#include "fips202x4.h"
+#include "indcpa.h"
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+
+#include "cbmc.h"
+
+
+/*************************************************
+ * Name:        pack_pk
+ *
+ * Description: Serialize the public key as concatenation of the
+ *              serialized vector of polynomials pk
+ *              and the public seed used to generate the matrix A.
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized public key
+ *              polyvec *pk: pointer to the input public-key polyvec.
+ *                Must have coefficients within [0,..,q-1].
+ *              const uint8_t *seed: pointer to the input public seed
+ **************************************************/
+static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
+                    const uint8_t seed[MLKEM_SYMBYTES])
+{
+  POLYVEC_UBOUND(pk, MLKEM_Q); /* debug builds: enforce [0,..,q-1] */
+  polyvec_tobytes(r, pk);
+  memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+}
+
+/*************************************************
+ * Name:        unpack_pk
+ *
+ * Description: De-serialize public key from a byte array into its
+ *              polynomial vector and seed; approximate inverse of pack_pk
+ *
+ * Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
+ *                  Coefficients will be normalized to [0,..,q-1].
+ *              - uint8_t *seed: pointer to output seed to generate matrix A
+ *              - const uint8_t *packedpk: pointer to input serialized public
+ *                  key.
+ **************************************************/
+static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
+                      const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
+{
+  polyvec_frombytes(pk, packedpk);
+  memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+
+  /*
+   * TODO! We know from the modulus check that this will result in an
+   * unsigned canonical polynomial, but CBMC does not know it. We should
+   * weaken the specification of `unpack_pk()` and all depending functions
+   * to work with the weaker 4096-bound, so that the proofs go through
+   * without the need of this redundant call to polyvec_reduce().
+   */
+  polyvec_reduce(pk);
+}
+
+/*************************************************
+ * Name:        pack_sk
+ *
+ * Description: Serialize the secret key
+ *
+ * Arguments:   - uint8_t *r: pointer to output serialized secret key
+ *              - polyvec *sk: pointer to input secret-key polyvec.
+ *                  Must have coefficients within [0,..,q-1].
+ **************************************************/
+static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
+{
+  POLYVEC_UBOUND(sk, MLKEM_Q); /* debug builds: enforce [0,..,q-1] */
+  polyvec_tobytes(r, sk);
+}
+
+/*************************************************
+ * Name:        unpack_sk
+ *
+ * Description: De-serialize the secret key; inverse of pack_sk
+ *
+ * Arguments:   - polyvec *sk: pointer to output secret-key polynomial
+ *                  vector; passed through polyvec_reduce() before return
+ *              - const uint8_t *packedsk: pointer to input serialized
+ *                  secret key
+ **************************************************/
+static void unpack_sk(polyvec *sk,
+                      const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  polyvec_frombytes(sk, packedsk);
+  polyvec_reduce(sk);
+}
+
+/*************************************************
+ * Name:        pack_ciphertext
+ *
+ * Description: Serialize the ciphertext as concatenation of the
+ *              compressed and serialized vector of polynomials b
+ *              and the compressed and serialized polynomial v
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized ciphertext
+ *              polyvec *b: pointer to the input vector of polynomials b
+ *              poly *v: pointer to the input polynomial v
+ **************************************************/
+static void pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], polyvec *b, poly *v)
+{
+  polyvec_compress_du(r, b);
+  poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
+}
+
+/*************************************************
+ * Name:        unpack_ciphertext
+ *
+ * Description: De-serialize and decompress ciphertext from a byte array;
+ *              approximate inverse of pack_ciphertext
+ *
+ * Arguments:   - polyvec *b: pointer to the output vector of polynomials b
+ *              - poly *v: pointer to the output polynomial v, read from
+ *                  offset MLKEM_POLYVECCOMPRESSEDBYTES_DU of c
+ **************************************************/
+static void unpack_ciphertext(polyvec *b, poly *v,
+                              const uint8_t c[MLKEM_INDCPA_BYTES])
+{
+  polyvec_decompress_du(b, c); /* c: input serialized ciphertext */
+  poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
+}
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS /* SHAKE128 blocks squeezed up front */
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + SHAKE128_RATE) / SHAKE128_RATE)
+#endif /* MLKEM_GEN_MATRIX_NBLOCKS */
+
+/*
+ * Fill the four polynomials vec[0..3] from the four seeds seed[0..3] by
+ * rejection sampling on the output of a 4-way XOF (one lane per entry).
+ */
+STATIC_TESTABLE
+void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+
+  /* Per-lane count of coefficients already sampled into vec[0..3] */
+  unsigned int ctr[KECCAK_WAY];
+  shake128x4incctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* Only MLKEM_SYMBYTES + 2 bytes per seed are absorbed (cf. the contract). */
+  shake128x4_absorb_once(&statex, seed[0], seed[1], seed[2], seed[3],
+                    MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                           &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time (for all four lanes) until we're done.
+   */
+  buflen = SHAKE128_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3] - 1, 0, (MLKEM_Q - 1))))
+  {
+    shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  shake128x4_inc_ctx_release(&statex);
+}
+
+/*
+ * Generate a single A matrix entry from a seed of MLKEM_SYMBYTES + 2
+ * bytes, using rejection sampling on the output of a XOF.
+ */
+STATIC_TESTABLE
+void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+{
+  shake128incctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  shake128_absorb_once(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entry with high probability. */
+  shake128_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = SHAKE128_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(0 <= ctr && ctr <= MLKEM_N)
+    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr - 1,
+                                          0, (MLKEM_Q - 1))))
+  {
+    shake128_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  shake128_inc_ctx_release(&state);
+}
+
+/*************************************************
+ * Name:        gen_matrix
+ *
+ * Description: Deterministically generate matrix A (or the transpose of A)
+ *              from a seed. Entries of the matrix are polynomials that look
+ *              uniformly random. Performs rejection sampling on output of
+ *              a XOF
+ *
+ * Arguments:   - polyvec *a: pointer to output matrix A
+ *              - const uint8_t *seed: pointer to input seed
+ *              - int transposed: boolean deciding whether A or A^T is generated
+ **************************************************/
+/* Not static for benchmarking */
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+{
+  int i;
+  unsigned int j;
+  /*
+   * We generate four separate seed arrays rather than a single one to work
+   * around limitations in CBMC function contracts dealing with disjoint slices
+   * of the same parent object.
+   */
+
+  ALIGN uint8_t seed0[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed1[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed2[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed3[MLKEM_SYMBYTES + 2];
+  uint8_t *seedxy[4];
+  seedxy[0] = seed0;
+  seedxy[1] = seed1;
+  seedxy[2] = seed2;
+  seedxy[3] = seed3;
+
+  for (j = 0; j < KECCAK_WAY; j++)
+  {
+    memcpy(seedxy[j], seed, MLKEM_SYMBYTES);
+  }
+
+  for (i = 0; i < (MLKEM_K * MLKEM_K / KECCAK_WAY) * KECCAK_WAY;
+       i += KECCAK_WAY)
+  {
+    uint8_t x, y;
+
+    for (j = 0; j < KECCAK_WAY; j++)
+    {
+      x = (i + j) / MLKEM_K;
+      y = (i + j) % MLKEM_K;
+      if (transposed)
+      {
+        seedxy[j][MLKEM_SYMBYTES + 0] = x;
+        seedxy[j][MLKEM_SYMBYTES + 1] = y;
+      }
+      else
+      {
+        seedxy[j][MLKEM_SYMBYTES + 0] = y;
+        seedxy[j][MLKEM_SYMBYTES + 1] = x;
+      }
+    }
+
+    /*
+     * This call writes across polyvec boundaries for K=2 and K=3.
+     * This is intentional and safe.
+     */
+    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+  }
+
+  /* Left-over entry (K*K not a multiple of KECCAK_WAY): single Keccak. */
+  if (i < MLKEM_K * MLKEM_K)
+  {
+    uint8_t x, y;
+    x = i / MLKEM_K;
+    y = i % MLKEM_K;
+
+    if (transposed)
+    {
+      seed0[MLKEM_SYMBYTES + 0] = x;
+      seed0[MLKEM_SYMBYTES + 1] = y;
+    }
+    else
+    {
+      seed0[MLKEM_SYMBYTES + 0] = y;
+      seed0[MLKEM_SYMBYTES + 1] = x;
+    }
+
+    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    i++;
+  }
+
+  cassert(i == MLKEM_K * MLKEM_K,
+          "gen_matrix: failed to generate whole matrix");
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+  /*
+   * The public matrix is generated in NTT domain. If the native backend
+   * uses a custom order in NTT domain, permute A accordingly.
+   */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    for (j = 0; j < MLKEM_K; j++)
+    {
+      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+    }
+  }
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+/*************************************************
+ * Name:        matvec_mul
+ *
+ * Description: Computes matrix-vector product in NTT domain, one
+ *              cached Montgomery base multiplication per row of a.
+ *
+ * Arguments:   - polyvec *out: Pointer to output polynomial vector
+ *              - polyvec a[MLKEM_K]: Input matrix. Must be in NTT domain
+ *                  and have coefficients of absolute value < MLKEM_Q.
+ *              - polyvec *v: Input polynomial vector. Must be in NTT domain.
+ *              - polyvec *vc: Mulcache for v, computed via
+ *                  polyvec_mulcache_compute().
+ **************************************************/
+STATIC_TESTABLE
+void matvec_mul(polyvec *out, const polyvec a[MLKEM_K], const polyvec *v,
+                const polyvec_mulcache *vc)
+__contract__(
+  requires(memory_no_alias(out, sizeof(polyvec)))
+  requires(memory_no_alias(a, sizeof(polyvec) * MLKEM_K))
+  requires(memory_no_alias(v, sizeof(polyvec)))
+  requires(memory_no_alias(vc, sizeof(polyvec_mulcache)))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+  forall(int, k1, 0, MLKEM_K - 1,
+    array_abs_bound(a[k0].vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))))
+  assigns(object_whole(out)))
+{
+  int i;
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+    assigns(i, object_whole(out))
+    invariant(i >= 0 && i <= MLKEM_K))
+  {
+    polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
+  }
+}
+
+/*************************************************
+ * Name:        indcpa_keypair_derand
+ *
+ * Description: Generates public and private key for the CPA-secure
+ *              public-key encryption scheme underlying ML-KEM
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                             (of length MLKEM_INDCPA_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                             (of length MLKEM_INDCPA_SECRETKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                             (of length MLKEM_SYMBYTES bytes)
+ **************************************************/
+/* Check that the arithmetic in indcpa_keypair_derand() does not overflow */
+STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_keypair_bound_0)
+
+void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[MLKEM_SYMBYTES])
+{
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  const uint8_t *publicseed = buf;
+  const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
+  polyvec a[MLKEM_K], e, pkpv, skpv;
+  polyvec_mulcache skpv_cache;
+
+  ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+  /* Concatenate coins with MLKEM_K for domain separation of security levels */
+  memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+  coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
+
+  hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
+
+  gen_matrix(a, publicseed, 0 /* no transpose */);
+
+#if MLKEM_K == 2
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, e.vec + 0, e.vec + 1,
+                        noiseseed, 0, 1, 2, 3);
+#elif MLKEM_K == 3
+  /*
+   * Only the first three output buffers are needed.
+   * The last parameter is a dummy that's overwritten later.
+   */
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2,
+                        pkpv.vec + 0 /* irrelevant */, noiseseed, 0, 1, 2,
+                        0xFF /* irrelevant */);
+  /* Same here */
+  poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2,
+                        pkpv.vec + 0 /* irrelevant */, noiseseed, 3, 4, 5,
+                        0xFF /* irrelevant */);
+#elif MLKEM_K == 4
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3,
+                        noiseseed, 0, 1, 2, 3);
+  poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed,
+                        4, 5, 6, 7);
+#endif
+
+  polyvec_ntt(&skpv);
+  polyvec_ntt(&e);
+
+  polyvec_mulcache_compute(&skpv_cache, &skpv);
+  matvec_mul(&pkpv, a, &skpv, &skpv_cache);
+  polyvec_tomont(&pkpv);
+
+  /* Arithmetic cannot overflow, see static assertion above this function */
+  polyvec_add(&pkpv, &e);
+  polyvec_reduce(&pkpv);
+  polyvec_reduce(&skpv);
+
+  pack_sk(sk, &skpv);
+  pack_pk(pk, &pkpv, publicseed);
+}
+
+/*************************************************
+ * Name:        indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure
+ *              public-key encryption scheme underlying Kyber.
+ *
+ * Arguments:   - uint8_t *c: pointer to output ciphertext
+ *                            (of length MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: pointer to input message
+ *                                  (of length MLKEM_INDCPA_MSGBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                                   (of length MLKEM_INDCPA_PUBLICKEYBYTES)
+ *              - const uint8_t *coins: pointer to input random coins (of
+ *                  length MLKEM_SYMBYTES) seeding all randomness used
+ **************************************************/
+
+/* Check that the arithmetic in indcpa_enc() does not overflow */
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
+              indcpa_enc_bound_1)
+
+void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[MLKEM_SYMBYTES])
+{
+  ALIGN uint8_t seed[MLKEM_SYMBYTES];
+  polyvec sp, pkpv, ep, at[MLKEM_K], b;
+  poly v, k, epp;
+  polyvec_mulcache sp_cache;
+
+  unpack_pk(&pkpv, seed, pk);
+  poly_frommsg(&k, m);
+  gen_matrix(at, seed, 1 /* transpose */);
+
+#if MLKEM_K == 2
+  poly_getnoise_eta1122_4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1,
+                           coins, 0, 1, 2, 3);
+  poly_getnoise_eta2(&epp, coins, 4);
+#elif MLKEM_K == 3
+  /*
+   * In this call, only the first three output buffers are needed.
+   * The last parameter is a dummy that's overwritten later.
+   */
+  poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, &b.vec[0], coins, 0,
+                        1, 2, 0xFF);
+  /* The fourth output buffer in this call _is_ used. */
+  poly_getnoise_eta2_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, &epp, coins, 3, 4,
+                        5, 6);
+#elif MLKEM_K == 4
+  poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins,
+                        0, 1, 2, 3);
+  poly_getnoise_eta2_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins,
+                        4, 5, 6, 7);
+  poly_getnoise_eta2(&epp, coins, 8);
+#endif
+
+  polyvec_ntt(&sp);
+
+  polyvec_mulcache_compute(&sp_cache, &sp);
+  matvec_mul(&b, at, &sp, &sp_cache);
+  polyvec_basemul_acc_montgomery_cached(&v, &pkpv, &sp, &sp_cache);
+
+  polyvec_invntt_tomont(&b);
+  poly_invntt_tomont(&v);
+
+  /* Arithmetic cannot overflow, see static assertions above this function */
+  polyvec_add(&b, &ep);
+  poly_add(&v, &epp);
+  poly_add(&v, &k);
+
+  polyvec_reduce(&b);
+  poly_reduce(&v);
+
+  pack_ciphertext(c, &b, &v);
+}
+
+/* Check that the arithmetic in indcpa_dec() does not overflow */
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
+/* Decrypt ciphertext c with secret key sk; the message is written to m. */
+void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  polyvec b, skpv;
+  poly v, sb;
+
+  unpack_ciphertext(&b, &v, c);
+  unpack_sk(&skpv, sk);
+
+  polyvec_ntt(&b);
+  polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
+  poly_invntt_tomont(&sb);
+
+  /* Arithmetic cannot overflow, see static assertion above this function */
+  poly_sub(&v, &sb);
+  poly_reduce(&v);
+
+  poly_tomsg(m, &v);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
new file mode 100644
index 0000000000..3f57eb1295
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef INDCPA_H
+#define INDCPA_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+#include "polyvec.h"
+
+
+#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+/* Fills a[0..MLKEM_K-1]; per the contract, coefficients lie in [0, MLKEM_Q-1] */
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec) * MLKEM_K))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires(transposed == 0 || transposed == 1)
+  assigns(object_whole(a))
+  ensures(forall(int, x, 0, MLKEM_K - 1, forall(int, y, 0, MLKEM_K - 1,
+  array_bound(a[x].vec[y].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))))
+);
+
+#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+/*************************************************
+ * Name:        indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure
+ *              public-key encryption scheme underlying Kyber.
+ *
+ * Arguments:   - uint8_t *c: output ciphertext (MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: input message
+ *                                  (MLKEM_INDCPA_MSGBYTES bytes)
+ *              - const uint8_t *pk: input public key
+ *                                   (MLKEM_INDCPA_PUBLICKEYBYTES bytes)
+ *              - const uint8_t *coins: input coins (MLKEM_SYMBYTES bytes)
+ **************************************************/
+void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(c))
+);
+
+#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  assigns(object_whole(m))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c
new file mode 100644
index 0000000000..f84ee3f3da
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "kem.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "indcpa.h"
+#include "params.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#if defined(CBMC)
+/* Redeclaration with contract needed for CBMC only */
+int memcmp(const void *str1, const void *str2, size_t n)
+__contract__(
+  requires(memory_no_alias(str1, n))
+  requires(memory_no_alias(str2, n))
+);
+#endif
+
+/*************************************************
+ * Name:        check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS203 (Section 7.2),
+ *              i.e., ensures that coefficients are in [0,q-1]: pk is decoded,
+ *              passed through polyvec_reduce(), re-encoded and compared.
+ *
+ * Arguments:   - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 if re-encoding does not reproduce the input
+ **************************************************/
+static int check_pk(const uint8_t pk[MLKEM_PUBLICKEYBYTES])
+{
+  polyvec p;
+  uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+  polyvec_frombytes(&p, pk);
+  polyvec_reduce(&p);
+  polyvec_tobytes(p_reencoded, &p);
+  /* Only MLKEM_POLYVECBYTES are compared; pk is public, so memcmp() is OK */
+  if (memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS203,
+ *              i.e., ensures that
+ *              sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *              Described in Section 7.3 of FIPS203.
+ *
+ * Arguments:   - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 on failure
+ **************************************************/
+static int check_sk(const uint8_t sk[MLKEM_SECRETKEYBYTES])
+{
+  uint8_t test[MLKEM_SYMBYTES];
+  /*
+   * The parts of `sk` being hashed and compared here are public, so
+   * no secret information is leaked through the runtime or the return value
+   * of this function.
+   */
+  hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES, MLKEM_PUBLICKEYBYTES);
+  if (memcmp(sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
+             MLKEM_SYMBYTES))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        crypto_kem_keypair_derand
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with
+ *                2*MLKEM_SYMBYTES random bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins)
+{
+  indcpa_keypair_derand(pk, sk, coins);
+  memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_PUBLICKEYBYTES);
+  hash_h(sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
+         MLKEM_PUBLICKEYBYTES);
+  /* Value z for pseudo-random output on reject */
+  memcpy(sk + MLKEM_SECRETKEYBYTES - MLKEM_SYMBYTES, coins + MLKEM_SYMBYTES,
+         MLKEM_SYMBYTES);
+  return 0;
+}
+
+int crypto_kem_keypair(uint8_t *pk, uint8_t *sk)
+{
+  ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+  randombytes(coins, 2 * MLKEM_SYMBYTES);
+  /* Hand the fresh coins to the derandomized keygen and forward its result */
+  return crypto_kem_keypair_derand(pk, sk, coins);
+}
+
+int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk,
+                          const uint8_t *coins)
+{
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES]; /* input coins, then H(pk) */
+  /* Output of hash_g: shared key, followed by the encryption coins */
+  ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+  /* Reject public keys failing the modulus check before writing ct or ss */
+  if (check_pk(pk))
+  {
+    return -1;
+  }
+
+  memcpy(buf, coins, MLKEM_SYMBYTES);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_PUBLICKEYBYTES);
+  hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
+
+  /* coins are in kr+MLKEM_SYMBYTES */
+  indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+  /* The shared secret is the first MLKEM_SYMBYTES bytes of kr */
+  memcpy(ss, kr, MLKEM_SYMBYTES);
+  return 0;
+}
+
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+{
+  ALIGN uint8_t coins[MLKEM_SYMBYTES];
+  randombytes(coins, MLKEM_SYMBYTES);
+  return crypto_kem_enc_derand(ct, ss, pk, coins);
+}
+
+int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+{
+  uint8_t fail;
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  /* Output of hash_g: candidate shared key, followed by encryption coins */
+  ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+  /* NOTE(review): only MLKEM_CIPHERTEXTBYTES bytes of cmp are used below */
+  ALIGN uint8_t cmp[MLKEM_CIPHERTEXTBYTES + MLKEM_SYMBYTES];
+  const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+  /* Secret key hash check; on failure ss is left unwritten */
+  if (check_sk(sk))
+  {
+    return -1;
+  }
+  /* Decrypt the candidate message into the first half of buf */
+  indcpa_dec(buf, ct, sk);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  memcpy(buf + MLKEM_SYMBYTES, sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+         MLKEM_SYMBYTES);
+  hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
+
+  /* coins are in kr+MLKEM_SYMBYTES */
+  indcpa_enc(cmp, buf, pk, kr + MLKEM_SYMBYTES);
+  /* Compare ct with the re-encryption; fail == 0 selects the true key */
+  fail = ct_memcmp(ct, cmp, MLKEM_CIPHERTEXTBYTES);
+
+  /* Compute rejection key */
+  rkprf(ss, sk + MLKEM_SECRETKEYBYTES - MLKEM_SYMBYTES, ct);
+
+  /* Copy true key to return buffer if fail is 0 */
+  ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+  /* 0 is returned even if ct was rejected; ss then keeps the rkprf() output */
+  return 0;
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h
new file mode 100644
index 0000000000..6a33be7c7e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef KEM_H
+#define KEM_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+
+#define CRYPTO_SECRETKEYBYTES MLKEM_SECRETKEYBYTES
+#define CRYPTO_PUBLICKEYBYTES MLKEM_PUBLICKEYBYTES
+#define CRYPTO_CIPHERTEXTBYTES MLKEM_CIPHERTEXTBYTES
+#define CRYPTO_BYTES MLKEM_SSBYTES
+
+#if (MLKEM_K == 2)
+#define CRYPTO_ALGNAME "Kyber512"
+#elif (MLKEM_K == 3)
+#define CRYPTO_ALGNAME "Kyber768"
+#elif (MLKEM_K == 4)
+#define CRYPTO_ALGNAME "Kyber1024"
+#endif
+
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE(keypair_derand)
+int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define crypto_kem_keypair MLKEM_NAMESPACE(keypair)
+/*************************************************
+ * Name:        crypto_kem_keypair
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_kem_keypair(uint8_t *pk, uint8_t *sk)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define crypto_kem_enc_derand MLKEM_NAMESPACE(enc_derand)
+/*************************************************
+ * Name:        crypto_kem_enc_derand
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with MLKEM_SYMBYTES
+ *                random bytes)
+ *
+ * Returns 0 on success, and -1 if the public key modulus check (see Section 7.2
+ * of FIPS203) fails.
+ **************************************************/
+int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk,
+                          const uint8_t *coins)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(ct))
+  assigns(object_whole(ss))
+);
+
+#define crypto_kem_enc MLKEM_NAMESPACE(enc)
+/*************************************************
+ * Name:        crypto_kem_enc
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 if the public key modulus check (see Section 7.2
+ * of FIPS203) fails.
+ **************************************************/
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  assigns(object_whole(ct))
+  assigns(object_whole(ss))
+);
+
+#define crypto_kem_dec MLKEM_NAMESPACE(dec)
+/*************************************************
+ * Name:        crypto_kem_dec
+ *
+ * Description: Generates shared secret for given
+ *              cipher text and private key
+ *
+ * Arguments:   - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *ct: pointer to input cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 if the secret key hash check (see Section 7.3 of
+ * FIPS203) fails; in the latter case, ss is not written.
+ *
+ * If ct fails the re-encryption check, 0 is returned and ss is pseudo-random.
+ **************************************************/
+int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+__contract__(
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  assigns(object_whole(ss))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
new file mode 100644
index 0000000000..1844ca19fd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "ntt.h"
+#include <stdint.h>
+#include "params.h"
+#include "reduce.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+#include "ntt.h"
+
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+STATIC_TESTABLE
+void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start, int len,
+                         int bound)
+__contract__(
+  requires(0 <= start && start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start - 1, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N - 1, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len - 1, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N - 1, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  int j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /* 
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j - 1,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len - 1, bound))
+    invariant(array_abs_bound(r, start + len, j + len - 1,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N - 1,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+STATIC_TESTABLE
+void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N - 1, layer * MLKEM_Q - 1))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N - 1, (layer + 1) * MLKEM_Q - 1)))
+{
+  int start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(0 <= start && start < MLKEM_N + 2 * len)
+    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start - 1, (layer * MLKEM_Q - 1) + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N - 1, layer * MLKEM_Q - 1)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q - 1);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ * REF-CHANGE: Removed indirection poly_ntt -> ntt()
+ * and integrated polynomial reduction into the NTT.
+ */
+
+
+void poly_ntt(poly *p)
+{
+  int len, layer;
+  int16_t *r;
+  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, layer * MLKEM_Q - 1)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+/* Check that bound for native NTT implies contractual bound */
+STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, ntt_bound)
+/* Forward NTT via ntt_native(), with POLY_BOUND_MSG checks before and after */
+void poly_ntt(poly *p)
+{
+  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  ntt_native(p);
+  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Check that bound for reference invNTT implies contractual bound */
+#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
+STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
+
+/* Compute one layer of inverse NTT */
+STATIC_TESTABLE
+void invntt_layer(int16_t *r, int len, int layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+{
+  int start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q))
+    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    int j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  int j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j - 1, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+/* Check that bound for native invNTT implies contractual bound */
+STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
+
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p);
+  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+/*************************************************
+ * Name:        basemul_cached
+ *
+ * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
+ *              used for multiplication of elements in Rq in NTT domain
+ *
+ *              Bounds:
+ *              - a is assumed to be < q in absolute value.
+ *              - Return value < 3/2 q in absolute value
+ *
+ * Arguments:   - int16_t r[2]: pointer to the output polynomial
+ *              - const int16_t a[2]: pointer to the first factor
+ *              - const int16_t b[2]: pointer to the second factor
+ *              - int16_t b_cached: Cached precomputation of b[1] * zeta
+ **************************************************/
+void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                    int16_t b_cached)
+{
+  int32_t t0, t1;
+
+  BOUND(a, 2, MLKEM_Q, "basemul input bound");
+
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
+
+  /* |ti| < 2 * q * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
+
+  /* |r[i]| < 3/2 q */
+  BOUND(r, 2, 3 * MLKEM_Q / 2, "basemul output bound");
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
new file mode 100644
index 0000000000..0f7b30624b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+#include "arith_native.h"
+#include "cbmc.h"
+#include "params.h"
+#include "poly.h"
+#include "reduce.h"
+
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_Q - 1))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, NTT_BOUND - 1))
+);
+
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, INVNTT_BOUND - 1))
+);
+
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+/************************************************************
+ * Name: basemul_cached
+ *
+ * Description: Computes a representative modulo q of
+ *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
+ *
+ *              If b_cached is b1*zeta, this represents the
+ *              product of (a0 + a1*X) and (b0 + b1*X) in
+ *              Fq[X]/(X^2 - zeta).
+ *
+ * Arguments: - r: Pointer to output polynomial
+ *                   Upon return, coefficients are bound by
+ *                   3*(q+1)/2 in absolute value.
+ *            - a: Pointer to first input polynomial
+ *                   Must be coefficient-wise < q in absolute value.
+ *            - b: Pointer to second input polynomial
+ *                   Can have arbitrary int16_t coefficients
+ *            - b_cached: Some precomputed value, typically derived from
+ *                   b1 and a twiddle factor. Can be an arbitrary int16_t.
+ ************************************************************/
+void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                    int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_abs_bound(a, 0, 1, MLKEM_Q - 1))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 1, (3 * HALF_Q - 1)))
+);
+
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
new file mode 100644
index 0000000000..da8041267e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include "common.h"
+#include "cpucap.h"
+
+#define KECCAK_WAY 4
+
+#ifndef MLKEM_K
+#define MLKEM_K 3 /* Change this for different security strengths */
+#endif
+
+/* Don't change parameters below this line */
+#if (MLKEM_K == 2)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM512_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM512_##s
+#elif (MLKEM_K == 3)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM768_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM768_##s
+#elif (MLKEM_K == 4)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM1024_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM1024_##s
+#else
+#error "MLKEM_K must be in {2,3,4}"
+#endif
+
+#define MLKEM_N 256
+#define MLKEM_Q 3329
+
+#define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
+#define MLKEM_SSBYTES 32  /* size in bytes of shared key */
+
+#define MLKEM_POLYBYTES 384
+#define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
+
+#if MLKEM_K == 2
+#define MLKEM_ETA1 3
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 3
+#define MLKEM_ETA1 2
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 4
+#define MLKEM_ETA1 2
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#endif
+
+#define MLKEM_ETA2 2
+
+#define MLKEM_INDCPA_MSGBYTES (MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_PUBLICKEYBYTES (MLKEM_POLYVECBYTES + MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_SECRETKEYBYTES (MLKEM_POLYVECBYTES)
+#define MLKEM_INDCPA_BYTES \
+  (MLKEM_POLYVECCOMPRESSEDBYTES_DU + MLKEM_POLYCOMPRESSEDBYTES_DV)
+
+#define MLKEM_PUBLICKEYBYTES (MLKEM_INDCPA_PUBLICKEYBYTES)
+/* 64 bytes of additional space to save H(pk) and the rejection value z */
+#define MLKEM_SECRETKEYBYTES                                   \
+  (MLKEM_INDCPA_SECRETKEYBYTES + MLKEM_INDCPA_PUBLICKEYBYTES + \
+   2 * MLKEM_SYMBYTES)
+#define MLKEM_CIPHERTEXTBYTES (MLKEM_INDCPA_BYTES)
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
new file mode 100644
index 0000000000..93a663c12b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly.h"
+#include <stdint.h>
+#include <string.h>
+#include "cbd.h"
+#include "cbmc.h"
+#include "fips202x4.h"
+#include "ntt.h"
+#include "params.h"
+#include "reduce.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+
+void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+{
+  int j;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  {
+    int k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k >= 0 && k <= 8)
+      invariant(forall(int, r, 0, k - 1, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  {
+    int k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k >= 0 && k <= 4)
+      invariant(forall(int, r, 0, k - 1, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
+#endif
+}
+
+
+void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+{
+  int j;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j - 1, 0, (MLKEM_Q - 1))))
+  {
+    int k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(0 <= k && k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k - 1, 0, (MLKEM_Q - 1))))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j - 1, 0, (MLKEM_Q - 1))))
+  {
+    int k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(0 <= k && k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k - 1, 0, (MLKEM_Q - 1))))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
+#endif
+}
+
+void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+{
+  int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(t, 0, (j-1), 0, 15)))
+    {
+      /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(t, 0, (j-1), 0, 31)))
+    {
+      /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * REF-CHANGE: Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
+#endif
+}
+
+void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+{
+  int i;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    /* REF-CHANGE: Hoist scalar decompression into separate function */
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, (8 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    int j;
+    uint8_t t[8];
+    const int offset = i * 5;
+    /*
+     * REF-CHANGE: Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, (8 * i + j - 1), 0, (MLKEM_Q - 1))))
+    {
+      /* REF-CHANGE: Hoist scalar decompression into separate function */
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
+#endif
+
+  POLY_UBOUND(r, MLKEM_Q);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  POLY_UBOUND(a, MLKEM_Q);
+  poly_tobytes_native(r, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, 4095)))
+  {
+    /* REF-CHANGE: Introduce some locals for better readability */
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  POLY_UBOUND(r, 4096);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  int i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, (8 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    int j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, (8 * i + j - 1), 0, (MLKEM_Q - 1))))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+}
+
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf[KECCAK_WAY][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  shake256x4(buf[0], buf[1], buf[2], buf[3], MLKEM_ETA1 * MLKEM_N / 4,
+             extkey[0], extkey[1], extkey[2], extkey[3], MLKEM_SYMBYTES + 1);
+  poly_cbd_eta1(r0, buf[0]);
+  poly_cbd_eta1(r1, buf[1]);
+  poly_cbd_eta1(r2, buf[2]);
+  poly_cbd_eta1(r3, buf[3]);
+
+  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
+  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
+  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
+  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
+}
+
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; /* PRF output fed to the CBD */
+  prf(buf, sizeof(buf), seed, nonce);
+  poly_cbd_eta2(r, buf);
+  /* Sampled with eta2, so bound by eta2 (as poly_getnoise_eta1122_4x does) */
+  POLY_BOUND_MSG(r, MLKEM_ETA2 + 1, "poly_getnoise_eta2 output");
+}
+
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+  shake256x4(buf1[0], buf1[1], buf2[0], buf2[1], MLKEM_ETA1 * MLKEM_N / 4,
+             extkey[0], extkey[1], extkey[2], extkey[3], MLKEM_SYMBYTES + 1);
+#else
+  shake256(buf1[0], sizeof(buf1[0]), extkey[0], sizeof(extkey[0]));
+  shake256(buf1[1], sizeof(buf1[1]), extkey[1], sizeof(extkey[1]));
+  shake256(buf2[0], sizeof(buf2[0]), extkey[2], sizeof(extkey[2]));
+  shake256(buf2[1], sizeof(buf2[1]), extkey[3], sizeof(extkey[3]));
+#endif
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
+  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
+  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
+  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
+}
+
+void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
+                                    const poly_mulcache *b_cache)
+{
+  int i;
+  POLY_BOUND(b_cache, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), (3 * HALF_Q - 1))))
+  {
+    basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
+                   b_cache->coeffs[2 * i]);
+    basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
+                   &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
+  }
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+void poly_tomont(poly *r)
+{
+  int i;
+  const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs ,0, (i - 1), (MLKEM_Q - 1))))
+  {
+    r->coeffs[i] = fqmul(r->coeffs[i], f);
+  }
+
+  POLY_BOUND(r, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
+void poly_tomont(poly *r)
+{
+  poly_tomont_native(r);
+  POLY_BOUND(r, MLKEM_Q);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+void poly_reduce(poly *r)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(array_bound(r->coeffs, 0, (i - 1), 0, (MLKEM_Q - 1))))
+  {
+    /* Barrett reduction, giving signed canonical representative */
+    int16_t t = barrett_reduce(r->coeffs[i]);
+    /* Conditional addition to get unsigned canonical representative */
+    r->coeffs[i] = scalar_signed_to_unsigned_q(t);
+  }
+
+  POLY_UBOUND(r, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
+void poly_reduce(poly *r)
+{
+  poly_reduce_native(r);
+  POLY_UBOUND(r, MLKEM_Q);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+void poly_add(poly *r, const poly *b)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(forall(int, k0, i, MLKEM_N - 1, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(int, k1, 0, i - 1, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
+  {
+    r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+  }
+}
+
+void poly_sub(poly *r, const poly *b)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(forall(int, k0, i, MLKEM_N - 1, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(int, k1, 0, i - 1, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
+  {
+    r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+  }
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  {
+    x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
+    x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+  }
+  POLY_BOUND(x, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+{
+  poly_mulcache_compute_native(x, a);
+  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+}
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
new file mode 100644
index 0000000000..35990684b6
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_H
+#define POLY_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+#include "reduce.h"
+#include "verify.h"
+
+/* Absolute exclusive upper bound for the output of the inverse NTT */
+#define INVNTT_BOUND (8 * MLKEM_Q)
+
+/* Absolute exclusive upper bound for the output of the forward NTT */
+#define NTT_BOUND (8 * MLKEM_Q)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N];
+} ALIGN poly;
+
+/*
+ * INTERNAL presentation of precomputed data speeding up
+ * the base multiplication of two polynomials in NTT domain.
+ */
+/*
+ * REF-CHANGE: This structure does not exist in the reference
+ * implementation.
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N >> 1];
+} poly_mulcache;
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The rounding addition in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1; /* 2 * u */
+  d0 *= 645083;         /* floor(2^31 / MLKEM_Q) */
+  d0 += 1u << 30;       /* rounding offset 2^30; may wrap, see above */
+  d0 >>= 31;            /* round(d0 / 2^31), reduced modulo 2 by the wrap */
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The rounding addition in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The rounding addition in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX for large
+ * values of u; it is carried out in uint64_t, where it does not wrap.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF);                    /* reduce modulo 2^10 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX for large
+ * values of u; it is carried out in uint64_t, where it does not wrap.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x7FF);                    /* reduce modulo 2^11 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
+  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
+  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
+
+  /* and therefore cast to uint16_t is safe. */
+  return (uint16_t)c;
+}
+
+#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DU)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+);
+
+#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+);
+
+#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
+ *                                  bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Coefficients must already be in unsigned canonical
+ *              form (see the contract below); signed input is not allowed.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, 4095))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(msg))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA2)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#define poly_basemul_montgomery_cached \
+  MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
+/*************************************************
+ * Name:        poly_basemul_montgomery_cached
+ *
+ * Description: Multiplication of two polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *
+ *              The result is coefficient-wise bound by 3/2 q in absolute
+ *              value.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ *              - const poly_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial. Can be computed
+ *                  via poly_mulcache_compute().
+ **************************************************/
+void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
+                                    const poly_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(memory_no_alias(b_cache, sizeof(poly_mulcache)))
+  requires(array_abs_bound(a->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (3 * HALF_Q - 1)))
+);
+
+#define poly_tomont MLKEM_NAMESPACE(poly_tomont)
+/*************************************************
+ * Name:        poly_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+void poly_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))
+);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define poly_mulcache_compute MLKEM_NAMESPACE(poly_mulcache_compute)
+/************************************************************
+ * Name: poly_mulcache_compute
+ *
+ * Description: Computes the mulcache for a polynomial in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(poly_mulcache)))
+  requires(memory_no_alias(a, sizeof(poly)))
+  assigns(object_whole(x))
+);
+
+#define poly_reduce MLKEM_NAMESPACE(poly_reduce)
+/*************************************************
+ * Name:        poly_reduce
+ *
+ * Description: Converts polynomial to _unsigned canonical_ representatives.
+ *
+ *              The input coefficients can be arbitrary integers in int16_t.
+ *              The output coefficients are in [0,1,...,MLKEM_Q-1].
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+/*
+ * REF-CHANGE: The semantics of poly_reduce() is different in
+ * the reference implementation, which requires
+ * signed canonical output data. Unsigned canonical
+ * outputs are better suited to the only remaining
+ * use of poly_reduce() in the context of (de)serialization.
+ */
+void poly_reduce(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+);
+
+#define poly_add MLKEM_NAMESPACE(poly_add)
+/************************************************************
+ * Name: poly_add
+ *
+ * Description: Adds two polynomials in place
+ *
+ * Arguments: - r: Pointer to input-output polynomial to be added to.
+ *            - b: Pointer to input polynomial that should be added
+ *                 to r. Must be disjoint from r.
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ ************************************************************/
+/*
+ * REF-CHANGE:
+ * The reference implementation uses a 3-argument poly_add.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+void poly_add(poly *r, const poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(forall(int, k0, 0, MLKEM_N - 1, (int32_t) r->coeffs[k0] + b->coeffs[k0] <= INT16_MAX))
+  requires(forall(int, k1, 0, MLKEM_N - 1, (int32_t) r->coeffs[k1] + b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(int, k, 0, MLKEM_N - 1, r->coeffs[k] == old(*r).coeffs[k] + b->coeffs[k]))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define poly_sub MLKEM_NAMESPACE(poly_sub)
+/*************************************************
+ * Name:        poly_sub
+ *
+ * Description: Subtract two polynomials; no modular reduction is performed
+ *
+ * Arguments: - poly *r:       Pointer to input-output polynomial to be
+ *                             subtracted from.
+ *            - const poly *b: Pointer to second input polynomial
+ **************************************************/
+/*
+ * REF-CHANGE:
+ * The reference implementation uses a 3-argument poly_sub.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+void poly_sub(poly *r, const poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(forall(int, k0, 0, MLKEM_N - 1, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
+  requires(forall(int, k1, 0, MLKEM_N - 1, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(int, k, 0, MLKEM_N - 1, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
+  assigns(object_whole(r))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
new file mode 100644
index 0000000000..5e4dd0c5c4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "polyvec.h"
+#include <stdint.h>
+#include "arith_native.h"
+#include "config.h"
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+
+#include "debug/debug.h"
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned int i;
+  POLYVEC_UBOUND(a, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  POLYVEC_UBOUND(r, MLKEM_Q);
+}
+
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+}
+
+void polyvec_ntt(polyvec *r)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_ntt(&r->vec[i]);
+  }
+}
+
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_invntt_tomont(&r->vec[i]);
+  }
+}
+
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *              - b is assumed to be the output of a forward NTT and
+ *                thus coefficient-wise bound by NTT_BOUND
+ *              - b_cache is assumed to be coefficient-wise bound by
+ *                MLKEM_Q.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *            - const polyvec_mulcache *b_cache: mulcache for b
+ **************************************************/
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned int i;
+  poly t;
+
+  POLYVEC_BOUND(a, MLKEM_Q);
+  POLYVEC_BOUND(b, NTT_BOUND);
+  POLYVEC_BOUND(b_cache, MLKEM_Q);
+
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+    /* abs bounds: < (i+1) * 3/2 * q */
+  }
+
+  /*
+   * Those bounds are true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus best to omit
+   * them from the spec to not unnecessarily constrain native implementations.
+   */
+  cassert(
+      array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_K * (3 * HALF_Q - 1)),
+      "polyvec_basemul_acc_montgomery_cached output bounds");
+  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
+  POLY_BOUND(r, MLKEM_K * 3 * HALF_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  POLYVEC_BOUND(a, MLKEM_Q);
+  POLYVEC_BOUND(b, NTT_BOUND);
+  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache;
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+/*************************************************
+ * Name:        polyvec_mulcache_compute
+ *
+ * Description: Precompute values speeding up
+ *              base multiplications of polynomials
+ *              in NTT domain.
+ *
+ * Arguments: - polyvec_mulcache *x: pointer to output cache.
+ *            - const polyvec *a: pointer to input vector of polynomials
+ **************************************************/
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output polynomial
+ **************************************************/
+void polyvec_reduce(polyvec *r)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+}
+
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+void polyvec_tomont(polyvec *r)
+{
+  unsigned int i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tomont(&r->vec[i]);
+  }
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
new file mode 100644
index 0000000000..7771fd3b28
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+/* REF-CHANGE: This struct does not exist in the reference implementation */
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(a->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(a->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output coefficients are unsigned and in [0,..,4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+        array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, 4095)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (NTT_BOUND - 1))))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (INVNTT_BOUND - 1))))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a,
+                                    const polyvec *b);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+/* Input is coefficient-wise < q in absolute value */
+  requires(forall(int, k1, 0, MLKEM_K - 1,
+ array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output polynomial
+ **************************************************/
+/*
+ * REF-CHANGE: The semantics of polyvec_reduce() is different in
+ *             the reference implementation, which requires
+ *             signed canonical output data. Unsigned canonical
+ *             outputs are better suited to the only remaining
+ *             use of poly_reduce() in the context of (de)serialization.
+ */
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+);
+
+#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(int, j0, 0, MLKEM_K - 1,
+          forall(int, k0, 0, MLKEM_N - 1,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(int, j1, 0, MLKEM_K - 1,
+          forall(int, k1, 0, MLKEM_N - 1,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.c
new file mode 100644
index 0000000000..db7baf0f56
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "reduce.h"
+#include <stdint.h>
+#include "params.h"
+
+/* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+static const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
+{
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q
+ *
+ *              Bounds: For any C such that |a| < q * C, the return value
+ *              has absolute value < q (C/2^16 + 1/2).
+ *
+ *              Notable special cases:
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < q * C with a signed-canonical value ( < q/2 ) has
+ *                absolute value < q * (0.0254 * C + 1/2).
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < q * C with a value t of |t| < q has absolute value
+ *                < q * (0.0508 * C + 1/2).
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < C with a value of abs < q has absolute value
+ *                < q (C/2^16 + 1/2).
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
+{
+  /*
+   *Bounds on paper
+   * - Case |a| < q * C, for some C
+   * |t| <= |a|/2^16 + |t|*q/2^16
+   *      < q * C / 2^16 + q/2
+   *      = q (C/2^16 + 1/2)
+   * - Case |a| < (q/2) * C * q, for some C
+   * Replace C -> C * q in the above and estimate
+   * q / 2^17 < 0.0254.
+   */
+
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
+
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
+
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+
+  return (int16_t)r;
+}
+
+int16_t montgomery_reduce(int32_t a)
+{
+  int16_t res;
+  SCALAR_BOUND(a, 2 * MLKEM_Q * 32768, "montgomery_reduce input");
+
+  res = montgomery_reduce_generic(a);
+
+  SCALAR_BOUND(res, (3 * (MLKEM_Q + 1)) / 2, "montgomery_reduce output");
+  return res;
+}
+
+int16_t fqmul(int16_t a, int16_t b)
+{
+  int16_t res;
+  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+
+  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  return res;
+}
+
+/*
+ * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+ * multiplier is round_to_nearest(2**26/MLKEM_Q)
+ */
+#define BPOWER 26
+static const int32_t barrett_multiplier =
+    ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
+
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+int16_t barrett_reduce(int16_t a)
+{
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
+
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  return (int16_t)(a - t * MLKEM_Q);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
new file mode 100644
index 0000000000..2a486cf3ec
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "debug/debug.h"
+#include "params.h"
+
+#define MONT (-1044)               /* 2^16 mod q */
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
+
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * q * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 3/2 q in absolute value.
+ **************************************************/
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * MLKEM_Q * 32768))
+  requires(a <  (2 * MLKEM_Q * 32768))
+  ensures(return_value > -(3 * HALF_Q) && return_value < (3 * HALF_Q))
+);
+
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+);
+
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+#define fqmul MLKEM_NAMESPACE(fqmul)
+int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+);
+
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
new file mode 100644
index 0000000000..4e8a5ce9b2
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "params.h"
+
+#include "arith_native.h"
+#include "rej_uniform.h"
+
+/*************************************************
+ * Name:        rej_uniform_scalar
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the entire input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no
+ * information is provided on how many input bytes have been consumed.
+ **************************************************/
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr - 1, 0, (MLKEM_Q - 1))))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+    return offset + (unsigned)ret;
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
new file mode 100644
index 0000000000..aeb9cc3eb0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef REJ_UNIFORM_H
+#define REJ_UNIFORM_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "params.h"
+
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the entire input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no
+ * information is provided on how many input bytes have been consumed.
+ **************************************************/
+
+/*
+ * REF-CHANGE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset - 1, 0, (MLKEM_Q - 1)))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value - 1, 0, (MLKEM_Q - 1)))
+);
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric-shake.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric-shake.c
new file mode 100644
index 0000000000..5dd8c10d92
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric-shake.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "fips202.h"
+#include "params.h"
+#include "symmetric.h"
+
+void mlkem_shake256_prf(uint8_t *out, size_t outlen,
+                        const uint8_t key[MLKEM_SYMBYTES], uint8_t nonce)
+{
+  uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, key, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+
+  shake256(out, outlen, extkey, sizeof(extkey));
+}
+
+void mlkem_shake256_rkprf(uint8_t out[MLKEM_SSBYTES],
+                          const uint8_t key[MLKEM_SYMBYTES],
+                          const uint8_t input[MLKEM_CIPHERTEXTBYTES])
+{
+  shake256incctx s;
+
+  shake256_inc_init(&s);
+  shake256_inc_absorb(&s, key, MLKEM_SYMBYTES);
+  shake256_inc_absorb(&s, input, MLKEM_CIPHERTEXTBYTES);
+  shake256_inc_finalize(&s);
+  shake256_inc_squeeze(out, MLKEM_SSBYTES, &s);
+  shake256_inc_ctx_release(&s);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h
new file mode 100644
index 0000000000..202741a7b3
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SYMMETRIC_H
+#define SYMMETRIC_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+
+#include "fips202.h"
+
+#include "cbmc.h"
+
+#define mlkem_shake256_prf MLKEM_NAMESPACE(mlkem_shake256_prf)
+/*************************************************
+ * Name:        mlkem_shake256_prf
+ *
+ * Ref:         FIPS-203 Section 4.1. Function PRF (eq 4.3)
+ *
+ * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
+ *              and then generates outlen bytes of SHAKE256 output
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - size_t outlen: number of requested output bytes
+ *              - const uint8_t *key: pointer to the key (of length
+ *                MLKEM_SYMBYTES)
+ *              - uint8_t nonce: single-byte nonce (public PRF input)
+ *
+ *              out and key may NOT be aliased.
+ **************************************************/
+void mlkem_shake256_prf(uint8_t *out, size_t outlen,
+                        const uint8_t key[MLKEM_SYMBYTES], uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(out, outlen))
+  requires(memory_no_alias(key, MLKEM_SYMBYTES))
+  assigns(memory_slice(out, outlen))
+);
+
+#define mlkem_shake256_rkprf MLKEM_NAMESPACE(mlkem_shake256_rkprf)
+/*************************************************
+ * Name:        mlkem_shake256_rkprf
+ *
+ * Ref:         FIPS-203 Section 4.1. Hash function J
+ *
+ * Description: Usage of SHAKE256 as a PRF, concatenates key with input
+ *              and then generates MLKEM_SSBYTES bytes of SHAKE256 output
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - const uint8_t *key: pointer to the key (of length
+ *                MLKEM_SYMBYTES)
+ *              - const uint8_t *input: pointer to the input (of length
+ *                MLKEM_CIPHERTEXTBYTES)
+ *
+ *              out, key, and input may NOT be aliased.
+ **************************************************/
+void mlkem_shake256_rkprf(uint8_t out[MLKEM_SSBYTES],
+                          const uint8_t key[MLKEM_SYMBYTES],
+                          const uint8_t input[MLKEM_CIPHERTEXTBYTES])
+__contract__(
+  requires(memory_no_alias(out, MLKEM_SSBYTES))
+  requires(memory_no_alias(key, MLKEM_SYMBYTES))
+  requires(memory_no_alias(input, MLKEM_CIPHERTEXTBYTES))
+  assigns(memory_slice(out, MLKEM_SSBYTES))
+);
+
+
+/* Macros denoting FIPS-203 specific Hash functions */
+
+/* Hash function H, FIPS-203 4.1 (eq 4.4) */
+#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
+
+/* Hash function G, FIPS-203 4.1 (eq 4.5) */
+#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
+
+/* Macros denoting FIPS-203 specific PRFs */
+#define prf(OUT, OUTBYTES, KEY, NONCE) \
+  mlkem_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
+#define rkprf(OUT, KEY, INPUT) mlkem_shake256_rkprf(OUT, KEY, INPUT)
+
+#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c
new file mode 100644
index 0000000000..b5b71e023e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "verify.h"
+
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+/*
+ * Masking value used in constant-time functions from
+ * verify.h to block the compiler's range analysis and
+ * thereby reduce the risk of compiler-introduced branches.
+ */
+volatile uint64_t ct_opt_blocker_u64 = 0;
+
+#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+/* Placeholder so this CU is non-empty; namespaced like other globals. */
+int MLKEM_NAMESPACE(empty_cu_verify);
+
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h
new file mode 100644
index 0000000000..5c62223c3d
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef VERIFY_H
+#define VERIFY_H
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+
+/* Constant-time comparisons and conditional operations
+
+   We reduce the risk for compilation into variable-time code
+   through the use of 'value barriers'.
+
+   Functionally, a value barrier is a no-op. To the compiler, however,
+   it constitutes an arbitrary modification of its input, and therefore
+   hinders value propagation and range analysis.
+
+   We consider two approaches to implement a value barrier:
+   - An empty inline asm block which marks the target value as clobbered.
+   - XOR'ing with the value of a volatile global that's set to 0.
+
+   The first approach is cheap because it only prevents the compiler
+   from reasoning about the value of the variable past the barrier,
+   but does not directly generate additional instructions.
+
+   The second approach generates redundant loads and XOR operations
+   and therefore comes at a higher runtime cost. However, it appears
+   more robust towards optimization, as compilers should never drop
+   a volatile load.
+
+   We use the empty-ASM value barrier for GCC and clang, and fall
+   back to the global volatile barrier otherwise.
+
+   The global value barrier can be forced by setting MLKEM_NO_ASM_VALUE_BARRIER.
+
+*/
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(CBMC) && \
+    !defined(MLKEM_NO_ASM_VALUE_BARRIER)
+#define MLKEM_USE_ASM_VALUE_BARRIER
+#endif
+
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+
+/*
+ * Declaration of global volatile that the global value barrier
+ * is loading from and masking with.
+ */
+#define ct_opt_blocker_u64 MLKEM_NAMESPACE(ct_opt_blocker_u64)
+extern volatile uint64_t ct_opt_blocker_u64;
+
+/* Helper functions for obtaining masks of various sizes */
+STATIC_INLINE_TESTABLE uint8_t get_optblocker_u8(void)
+__contract__(ensures(return_value == 0)) { return (uint8_t)ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE uint32_t get_optblocker_u32(void)
+__contract__(ensures(return_value == 0)) { return ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE int32_t get_optblocker_i32(void)
+__contract__(ensures(return_value == 0)) { return (int32_t)ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE uint32_t value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_u32()); }
+
+STATIC_INLINE_TESTABLE int32_t value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_i32()); }
+
+STATIC_INLINE_TESTABLE uint8_t value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_u8()); }
+
+#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+STATIC_INLINE_TESTABLE uint32_t value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__("" : "+r"(b)); /* empty asm: b is opaque to the optimizer */
+  return b;
+}
+
+STATIC_INLINE_TESTABLE int32_t value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__("" : "+r"(b)); /* empty asm: b is opaque to the optimizer */
+  return b;
+}
+
+STATIC_INLINE_TESTABLE uint8_t value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__("" : "+r"(b)); /* empty asm: b is opaque to the optimizer */
+  return b;
+}
+
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+/*
+ * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
+ * overflow, which is fully defined behaviour in C. It is thus safe to disable
+ * this warning.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/*************************************************
+ * Name:        ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments:   uint16_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint16_t ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+  uint32_t tmp = value_barrier_u32(-((uint32_t)x));
+  tmp >>= 16;
+  return tmp;
+}
+
+/*************************************************
+ * Name:        ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments:   uint8_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+  uint32_t tmp = value_barrier_u32(-((uint32_t)x));
+  tmp >>= 24;
+  return tmp;
+}
+
+/* Put unsigned overflow warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*
+ * The ct_cmask_neg_i16 function below makes deliberate use of
+ * signed to unsigned integer conversion, which is fully defined
+ * behaviour in C. It is thus safe to disable this warning.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        ct_cmask_neg_i16
+ *
+ * Description: Return 0 if input is non-negative, and -1 otherwise.
+ *
+ * Arguments:   int16_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint16_t ct_cmask_neg_i16(int16_t x)
+__contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
+{
+  int32_t tmp = value_barrier_i32((int32_t)x);
+  tmp >>= 16;
+  return (int16_t)tmp;
+}
+
+/* Put conversion warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*
+ * The ct_csel_xxx functions below make deliberate use of unsigned
+ * to signed integer conversion, which is implementation-defined
+ * behaviour. Here, we assume that uint16_t -> int16_t is inverse
+ * to int16_t -> uint16_t.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        ct_sel_int16
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   int16_t a:       First alternative
+ *              int16_t b:       Second alternative
+ *              uint16_t cond:   Condition variable.
+ **************************************************/
+STATIC_INLINE_TESTABLE int16_t ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  uint16_t au = a, bu = b;
+  uint16_t res = bu ^ (ct_cmask_nonzero_u16(cond) & (au ^ bu));
+  return (int16_t)res;
+}
+
+/* Put unsigned-to-signed warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        ct_sel_uint8
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   uint8_t a:       First alternative
+ *              uint8_t b:       Second alternative
+ *              uint8_t cond:    Condition variable.
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_sel_uint8(uint8_t a, uint8_t b, uint8_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  return b ^ (ct_cmask_nonzero_u8(cond) & (a ^ b));
+}
+
+/*************************************************
+ * Name:        ct_memcmp
+ *
+ * Description: Compare two arrays for equality in constant time.
+ *
+ * Arguments:   const uint8_t *a: pointer to first byte array
+ *              const uint8_t *b: pointer to second byte array
+ *              size_t len:       length of the byte arrays
+ *
+ * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_memcmp(const uint8_t *a, const uint8_t *b,
+                                         const size_t len)
+__contract__(
+  requires(memory_no_alias(a, len))
+  requires(memory_no_alias(b, len))
+  requires(len <= INT_MAX)
+  ensures((return_value == 0) == forall(int, i, 0, ((int)len - 1), (a[i] == b[i]))))
+{
+  uint8_t r = 0, s = 0;
+
+  /*
+   * Switch to a _signed_ ilen value, so that our loop counter
+   * can also be signed, and thus (i - 1) in the loop invariant
+   * can yield -1 as required.
+   */
+  const int ilen = (int)len;
+  int i;
+
+  for (i = 0; i < ilen; i++)
+  __loop__(
+    invariant(i >= 0 && i <= ilen)
+    invariant((r == 0) == (forall(int, k, 0, (i - 1), (a[k] == b[k])))))
+  {
+    r |= a[i] ^ b[i];
+    /* s is useless, but prevents the loop from being aborted once r=0xff. */
+    s ^= a[i] ^ b[i];
+  }
+
+  /*
+   * - Convert r into a mask; this may not be necessary, but is an additional
+   *   safeguard
+   *   against leaking information about a and b.
+   * - XOR twice with s, separated by a value barrier, to prevent the compiler
+   *   from dropping the s computation in the loop.
+   */
+  return (value_barrier_u8(ct_cmask_nonzero_u8(r) ^ s) ^ s);
+}
+
+/*************************************************
+ * Name:        ct_cmov_zero
+ *
+ * Description: Copy len bytes from x to r if b is zero;
+ *              don't modify r if b is non-zero.
+ *              assumes two's complement representation of negative integers.
+ *              Runs in constant time.
+ *
+ * Arguments:   uint8_t *r:       pointer to output byte array
+ *              const uint8_t *x: pointer to input byte array
+ *              size_t len:       Amount of bytes to be copied
+ *              uint8_t b:        Condition value.
+ **************************************************/
+STATIC_INLINE_TESTABLE
+void ct_cmov_zero(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
+__contract__(
+  requires(memory_no_alias(r, len))
+  requires(memory_no_alias(x, len))
+  assigns(memory_slice(r, len)))
+{
+  size_t i;
+  for (i = 0; i < len; i++)
+  __loop__(invariant(i <= len))
+  {
+    r[i] = ct_sel_uint8(r[i], x[i], b);
+  }
+}
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
new file mode 100644
index 0000000000..f52b2ff5ad
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogenerate_files.py
+ *          Do not modify it directly.
+ */
+
+#include "ntt.h"
+
+/*
+ * Table of zeta values used in the reference NTT and inverse NTT.
+ * See autogenerate_files.py for details.
+ */
+const int16_t zetas[128] = {
+    -1044, -758,  -359,  -1517, 1493,  1422,  287,   202,  -171,  622,   1577,
+    182,   962,   -1202, -1474, 1468,  573,   -1325, 264,  383,   -829,  1458,
+    -1602, -130,  -681,  1017,  732,   608,   -1542, 411,  -205,  -1571, 1223,
+    652,   -552,  1015,  -1293, 1491,  -282,  -1544, 516,  -8,    -320,  -666,
+    -1618, -1162, 126,   1469,  -853,  -90,   -271,  830,  107,   -1421, -247,
+    -951,  -398,  961,   -1508, -725,  448,   -1065, 677,  -1275, -1103, 430,
+    555,   843,   -1251, 871,   1550,  105,   422,   587,  177,   -235,  -291,
+    -460,  1574,  1653,  -246,  778,   1159,  -147,  -777, 1483,  -602,  1119,
+    -1590, 644,   -872,  349,   418,   329,   -156,  -75,  817,   1097,  603,
+    610,   1322,  -1285, -1465, 384,   -1215, -136,  1218, -1335, -874,  220,
+    -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
+    -308,  996,   991,   958,   -1460, 1522,  1628,
+};
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/LICENSE b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/LICENSE
similarity index 100%
rename from src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/LICENSE
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_ref/LICENSE
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h
new file mode 100644
index 0000000000..94597323f1
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef API_H
+#define API_H
+
+#include <stdint.h>
+
+#define PQCP_MLKEM_NATIVE_MLKEM512_SECRETKEYBYTES 1632
+#define PQCP_MLKEM_NATIVE_MLKEM512_PUBLICKEYBYTES 800
+#define PQCP_MLKEM_NATIVE_MLKEM512_CIPHERTEXTBYTES 768
+#define PQCP_MLKEM_NATIVE_MLKEM512_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM512_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM512_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM512_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                              const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM512_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM512_enc_derand(uint8_t *ct, uint8_t *ss,
+                                          const uint8_t *pk,
+                                          const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM512_dec(uint8_t *ss, const uint8_t *ct,
+                                   const uint8_t *sk);
+
+#define PQCP_MLKEM_NATIVE_MLKEM768_SECRETKEYBYTES 2400
+#define PQCP_MLKEM_NATIVE_MLKEM768_PUBLICKEYBYTES 1184
+#define PQCP_MLKEM_NATIVE_MLKEM768_CIPHERTEXTBYTES 1088
+#define PQCP_MLKEM_NATIVE_MLKEM768_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM768_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM768_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM768_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                              const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM768_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM768_enc_derand(uint8_t *ct, uint8_t *ss,
+                                          const uint8_t *pk,
+                                          const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM768_dec(uint8_t *ss, const uint8_t *ct,
+                                   const uint8_t *sk);
+
+#define PQCP_MLKEM_NATIVE_MLKEM1024_SECRETKEYBYTES 3168
+#define PQCP_MLKEM_NATIVE_MLKEM1024_PUBLICKEYBYTES 1568
+#define PQCP_MLKEM_NATIVE_MLKEM1024_CIPHERTEXTBYTES 1568
+#define PQCP_MLKEM_NATIVE_MLKEM1024_KEYPAIRCOINBYTES 64
+#define PQCP_MLKEM_NATIVE_MLKEM1024_ENCCOINBYTES 32
+#define PQCP_MLKEM_NATIVE_MLKEM1024_BYTES 32
+
+int PQCP_MLKEM_NATIVE_MLKEM1024_keypair_derand(uint8_t *pk, uint8_t *sk,
+                                               const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM1024_keypair(uint8_t *pk, uint8_t *sk);
+int PQCP_MLKEM_NATIVE_MLKEM1024_enc_derand(uint8_t *ct, uint8_t *ss,
+                                           const uint8_t *pk,
+                                           const uint8_t *coins);
+int PQCP_MLKEM_NATIVE_MLKEM1024_enc(uint8_t *ct, uint8_t *ss,
+                                    const uint8_t *pk);
+int PQCP_MLKEM_NATIVE_MLKEM1024_dec(uint8_t *ss, const uint8_t *ct,
+                                    const uint8_t *sk);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_native.h
new file mode 100644
index 0000000000..b7e921323a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_native.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_ARITH_NATIVE_H
+#define MLKEM_ARITH_NATIVE_H
+
+#include <stdint.h>
+#include "config.h"
+#include "params.h"
+
+#if defined(MLKEM_USE_NATIVE)
+
+#include "poly.h"
+#include "polyvec.h"
+#include "profile.h"
+
+/*
+ * MLKEM native arithmetic interface
+ *
+ * This is the C<->native arithmetic interface used in this repository
+ * to allow for the drop-in of native code for performance critical
+ * components of ML-KEM.
+ *
+ * A _profile_ is a specific implementation of (part of) this interface.
+ * The active profile (if any) is determined in profile.h.
+ *
+ * To add a function to a profile, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around
+ * the chosen native implementation. They are static inline
+ * to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(poly *);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for
+ *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ *
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(poly *);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(poly *);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(poly *);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(poly *);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
+                                                const poly *poly);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomial vectors in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial vector operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial vector operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    poly *r, const polyvec *a, const polyvec *b,
+    const polyvec_mulcache *b_cache);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const poly *a);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: De-serialization of a polynomial:
+ *              reads a byte array of MLKEM_POLYBYTES
+ *              bytes into a polynomial.
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(poly *a,
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_USE_NATIVE */
+#endif /* MLKEM_ARITH_NATIVE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
new file mode 100644
index 0000000000..073f3c81d7
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "cbd.h"
+#include <stdint.h>
+#include "params.h"
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+#if MLKEM_ETA1 == 3
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+#endif
+
+/*************************************************
+ * Name:        cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, (8 * i - 1), 2)))
+  {
+    int j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j - 1, 2)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+/*************************************************
+ * Name:        cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+#if MLKEM_ETA1 == 3
+static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), 3)))
+  {
+    int j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j - 1, 3)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif
+
+void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+{
+#if MLKEM_ETA1 == 2
+  cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  cbd3(r, buf);
+#else
+#error "This implementation requires eta1 in {2,3}"
+#endif
+}
+
+void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+{
+#if MLKEM_ETA2 == 2
+  cbd2(r, buf);
+#else
+#error "This implementation requires eta2 = 2"
+#endif
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
new file mode 100644
index 0000000000..4dc8635bb5
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef CBD_H
+#define CBD_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA1))
+);
+
+#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h
new file mode 100644
index 0000000000..317a26421b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/***************************************************
+ * Basic replacements for __CPROVER_XXX contracts
+ ***************************************************/
+
+#include "common.h"
+
+#ifndef CBMC
+
+#define STATIC_INLINE_TESTABLE static INLINE
+#define STATIC_TESTABLE static
+
+#define __contract__(x)
+#define __loop__(x)
+#define cassert(x, y)
+
+#else /* CBMC _is_ defined, therefore we're doing proof */
+
+/* expose certain procedures to CBMC proofs that are static otherwise */
+#define STATIC_TESTABLE
+#define STATIC_INLINE_TESTABLE
+
+#define __contract__(x) x
+#define __loop__(x) x
+
+/* https://diffblue.github.io/cbmc/contracts-assigns.html */
+#define assigns(...) __CPROVER_assigns(__VA_ARGS__)
+
+/* https://diffblue.github.io/cbmc/contracts-requires-ensures.html */
+#define requires(...) __CPROVER_requires(__VA_ARGS__)
+#define ensures(...) __CPROVER_ensures(__VA_ARGS__)
+/* https://diffblue.github.io/cbmc/contracts-loops.html */
+#define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
+#define decreases(...) __CPROVER_decreases(__VA_ARGS__)
+/* cassert to avoid confusion with in-built assert */
+#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define assume(...) __CPROVER_assume(__VA_ARGS__)
+
+/***************************************************
+ * Macros for "expression" forms that may appear
+ * _inside_ top-level contracts.
+ ***************************************************/
+
+/*
+ * function return value - useful inside ensures
+ * https://diffblue.github.io/cbmc/contracts-functions.html
+ */
+#define return_value (__CPROVER_return_value)
+
+/*
+ * assigns l-value targets
+ * https://diffblue.github.io/cbmc/contracts-assigns.html
+ */
+#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
+#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
+#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
+
+/*
+ * Pointer-related predicates
+ * https://diffblue.github.io/cbmc/contracts-memory-predicates.html
+ */
+#define memory_no_alias(...) __CPROVER_is_fresh(__VA_ARGS__)
+#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
+#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+
+/*
+ * History variables
+ * https://diffblue.github.io/cbmc/contracts-history-variables.html
+ */
+#define old(...) __CPROVER_old(__VA_ARGS__)
+#define loop_entry(...) __CPROVER_loop_entry(__VA_ARGS__)
+
+/*
+ * Quantifiers
+ * Note that the range on qvar is _inclusive_ between qvar_lb .. qvar_ub
+ * https://diffblue.github.io/cbmc/contracts-quantifiers.html
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define forall(type, qvar, qvar_lb, qvar_ub, predicate)           \
+  __CPROVER_forall                                                \
+  {                                                               \
+    type qvar;                                                    \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) ==> (predicate)  \
+  }
+
+#define EXISTS(type, qvar, qvar_lb, qvar_ub, predicate)         \
+  __CPROVER_exists                                              \
+  {                                                             \
+    type qvar;                                                  \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) && (predicate) \
+  }
+/* clang-format on */
+
+/***************************************************
+ * Convenience macros for common contract patterns
+ ***************************************************/
+
+/*
+ * Boolean-value predicate that asserts that "all values of array_var are in
+ * range value_lb .. value_ub (inclusive)"
+ * Example:
+ *  array_bound(a->coeffs, 0, MLKEM_N-1, -(MLKEM_Q - 1), MLKEM_Q - 1)
+ * expands to
+ *  __CPROVER_forall { int k; (0 <= k && k <= MLKEM_N-1) ==> ( (-(MLKEM_Q -
+ *  1) <= a->coeffs[k]) && (a->coeffs[k] <= (MLKEM_Q - 1))) }
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define CBMC_CONCAT_(left, right) left##right
+#define CBMC_CONCAT(left, right) CBMC_CONCAT_(left, right)
+
+#define array_bound_core(indextype, qvar, qvar_lb, qvar_ub, array_var, \
+                         value_lb, value_ub)                           \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    indextype qvar;                                                    \
+    ((qvar_lb) <= (qvar) && (qvar) <= (qvar_ub)) ==>                   \
+        (((value_lb) <= (array_var[(qvar)])) &&                        \
+        ((array_var[(qvar)]) <= (value_ub)))                           \
+  }
+
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+  array_bound_core(int, CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+                   (qvar_ub), (array_var), (value_lb), (value_ub))
+
+
+/* Wrapper around array_bound operating on absolute values */
+#define array_abs_bound(arr, lb, ub, k) \
+  array_bound((arr), (lb), (ub), (-(k)), (k))
+/* clang-format on */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
new file mode 100644
index 0000000000..94c29ed927
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMMON_H
+#define COMMON_H
+
+
+/*
+ * C90 does not have the inline compiler directive yet.
+ * We don't use it in C90 builds.
+ * However, in that case the compiler warns about some inline functions in
+ * header files not being used in every compilation unit that includes that
+ * header. To work around it we silence that warning in that case using
+ * __attribute__((unused)).
+ */
+
+/* Do not use inline for C90 builds*/
+#if !defined(inline)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define INLINE inline
+#define ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define INLINE __inline
+#define ALWAYS_INLINE __forceinline
+#else
+#define INLINE __attribute__((unused))
+#define ALWAYS_INLINE
+#endif
+
+#else
+#define INLINE inline
+#define ALWAYS_INLINE __attribute__((always_inline))
+#endif
+
+
+/*
+ * C90 does not have the restrict compiler directive yet.
+ * We don't use it in C90 builds.
+ */
+#if !defined(restrict)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define RESTRICT restrict
+#else
+#define RESTRICT
+#endif
+
+#else
+
+#define RESTRICT restrict
+#endif
+
+#define DEFAULT_ALIGN 32
+#if defined(_WIN32)
+#define ALIGN __declspec(align(DEFAULT_ALIGN))
+#define asm __asm
+#else
+#define asm __asm__
+#define ALIGN __attribute__((aligned(DEFAULT_ALIGN)))
+#endif
+
+#define MLKEM_CONCAT_(left, right) left##right
+#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
new file mode 100644
index 0000000000..370a141a65
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#include "cpucap.h"
+
+#if defined(MLKEM_USE_NATIVE)
+
+#if defined(SYS_AARCH64)
+#define MLKEM_USE_NATIVE_AARCH64
+#endif /* SYS_AARCH64 */
+
+#if defined(SYS_X86_64)
+#define MLKEM_USE_NATIVE_X86_64
+#endif /* SYS_X86_64 */
+
+#endif /* MLKEM_USE_NATIVE */
+#endif /* CONFIG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cpucap.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cpucap.h
new file mode 100644
index 0000000000..cfcbbc3fe9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cpucap.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CPUCAP_H
+#define CPUCAP_H
+
+/* Check if we're running on an AArch64 system. _M_ARM64 is set by MSVC. */
+#if defined(__AARCH64EL__) || defined(_M_ARM64)
+#define SYS_AARCH64
+#endif
+
+#if defined(__x86_64__)
+#define SYS_X86_64
+#if defined(__AVX2__)
+#define SYS_X86_64_AVX2
+#endif
+#endif /* __x86_64__ */
+
+/* Check endianness */
+#if defined(__BYTE_ORDER__)
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define SYS_LITTLE_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define SYS_BIG_ENDIAN
+#else /* __BYTE_ORDER__ */
+#error "__BYTE_ORDER__ defined, but don't recognize value."
+#endif /* __BYTE_ORDER__ */
+#endif /* defined(__BYTE_ORDER__) */
+
+/* If FORCE_AARCH64 is set, assert that we're indeed on an AArch64 system. */
+#if defined(FORCE_AARCH64) && !defined(SYS_AARCH64)
+#error "FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
+#endif
+
+/* If FORCE_X86_64 is set, assert that we're indeed on an X86_64 system. */
+#if defined(FORCE_X86_64) && !defined(SYS_X86_64)
+#error "FORCE_X86_64 is set, but we don't seem to be on an X86_64 system."
+#endif
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c
new file mode 100644
index 0000000000..aa9b578074
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#define _ISOC99_SOURCE
+#include "debug.h"
+#include <stdio.h>
+
+#if defined(MLKEM_DEBUG)
+
+static char debug_buf[256];
+
+void mlkem_debug_assert(const char *file, int line, const char *description,
+                        const int val)
+{
+  if (val == 0)
+  {
+    snprintf(debug_buf, sizeof(debug_buf), "Assertion failed: %s (value %d)",
+             description, val);
+    mlkem_debug_print_error(file, line, debug_buf);
+    exit(1);
+  }
+}
+void mlkem_debug_check_bounds(const char *file, int line,
+                              const char *description, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      snprintf(debug_buf, sizeof(debug_buf),
+               "%s, index %u, value %d out of bounds (%d,%d)", description, i,
+               (int)val, lower_bound_exclusive, upper_bound_exclusive);
+      mlkem_debug_print_error(file, line, debug_buf);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+void mlkem_debug_print_error(const char *file, int line, const char *msg)
+{
+  fprintf(stderr, "[ERROR:%s:%04d] %s\n", file, line, msg);
+  fflush(stderr);
+}
+
+#else /* MLKEM_DEBUG */
+
+typedef int empty_cu_debug; /* keeps the TU non-empty; exports no symbol */
+
+#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h
new file mode 100644
index 0000000000..65208771d2
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - description: Textual description of assertion
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+void mlkem_debug_assert(const char *file, int line, const char *description,
+                        const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - description: Textual description of check
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+void mlkem_debug_check_bounds(const char *file, int line,
+                              const char *description, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Print error message to stderr alongside file and line information */
+void mlkem_debug_print_error(const char *file, int line, const char *msg);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ * msg: Message to print on failure
+ *
+ * Currently called CASSERT to avoid clash with CBMC assert.
+ */
+#define CASSERT(val, msg)                                 \
+  do                                                      \
+  {                                                       \
+    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
+  } while (0)
+
+/* Check absolute bounds of scalar
+ * val: Scalar to be checked
+ * abs_bound: Exclusive upper bound on absolute value to check
+ * msg: Message to print on failure */
+#define SCALAR_BOUND(val, abs_bound, msg) \
+  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
+
+/* Check that all coefficients in array of int16_t's are non-negative
+ * and below an exclusive upper bound.
+ *
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * high_bound: Exclusive upper bound on value to check (lower bound is 0)
+ * msg: Message to print on failure */
+#define UBOUND(ptr, len, high_bound, msg)                                 \
+  do                                                                      \
+  {                                                                       \
+    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
+                             (len), -1, ((high_bound)));                  \
+  } while (0)
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * abs_bound: Exclusive upper bound on absolute value to check
+ * msg: Message to print on failure */
+#define BOUND(ptr, len, abs_bound, msg)                                   \
+  do                                                                      \
+  {                                                                       \
+    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
+                             (len), -(abs_bound), (abs_bound));           \
+  } while (0)
+
+/* Check absolute bounds on coefficients in polynomial or mulcache
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * abs_bound: Exclusive upper bound on absolute value to check
+ * msg: Message to print on failure */
+#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
+  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
+        msg)
+
+/* Check unsigned bounds on coefficients in polynomial or mulcache
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ * msg: Message to print on failure */
+#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
+  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
+         msg)
+
+/* Check absolute bounds on coefficients in polynomial
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * abs_bound: Exclusive upper bound on absolute value to check */
+#define POLY_BOUND(ptr, abs_bound) \
+  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
+
+/* Check unsigned bounds on coefficients in polynomial
+ * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ */
+#define POLY_UBOUND(ptr, ubound) \
+  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
+
+/* Check absolute bounds on coefficients in vector of polynomials
+ * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
+ * abs_bound: Exclusive upper bound on absolute value to check */
+#define POLYVEC_BOUND(ptr, abs_bound)                                      \
+  do                                                                       \
+  {                                                                        \
+    unsigned _debug_polyvec_bound_idx;                                     \
+    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
+         _debug_polyvec_bound_idx++)                                       \
+      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
+                     "polyvec absolute bound for " #ptr ".vec[i]");        \
+  } while (0)
+
+/* Check unsigned bounds on coefficients in vector of polynomials
+ * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
+ * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
+ */
+#define POLYVEC_UBOUND(ptr, ubound)                                        \
+  do                                                                       \
+  {                                                                        \
+    unsigned _debug_polyvec_bound_idx;                                     \
+    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
+         _debug_polyvec_bound_idx++)                                       \
+      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
+                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
+  } while (0)
+
+/* Following AWS-LC to define a C99-compliant static assert */
+#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
+  typedef struct                                                         \
+  {                                                                      \
+    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
+  } MLKEM_CONCAT(static_assertion_, msg) __attribute__((unused));
+
+#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
+  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
+#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
+#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
+#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
+  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
+#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
+
+#else /* MLKEM_DEBUG */
+
+#define CASSERT(val, msg) \
+  do                      \
+  {                       \
+  } while (0)
+#define SCALAR_BOUND(val, abs_bound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define BOUND(ptr, len, abs_bound, msg) \
+  do                                    \
+  {                                     \
+  } while (0)
+#define POLY_BOUND(ptr, abs_bound) \
+  do                               \
+  {                                \
+  } while (0)
+#define POLYVEC_BOUND(ptr, abs_bound) \
+  do                                  \
+  {                                   \
+  } while (0)
+#define POLY_BOUND_MSG(ptr, abs_bound, msg) \
+  do                                        \
+  {                                         \
+  } while (0) /* no-op when MLKEM_DEBUG is unset */
+#define UBOUND(ptr, len, high_bound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define POLY_UBOUND(ptr, ubound) \
+  do                             \
+  {                              \
+  } while (0)
+#define POLYVEC_UBOUND(ptr, ubound) \
+  do                                \
+  {                                 \
+  } while (0)
+#define POLY_UBOUND_MSG(ptr, ubound, msg) \
+  do                                      \
+  {                                       \
+  } while (0)
+#define STATIC_ASSERT(cond, error)
+
+#endif /* MLKEM_DEBUG */
+
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
new file mode 100644
index 0000000000..d37e818aeb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "indcpa.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "fips202.h"
+#include "fips202x4.h"
+#include "indcpa.h"
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+
+#include "cbmc.h"
+
+
+
+/*************************************************
+ * Name:        pack_pk
+ *
+ * Description: Serialize the public key as concatenation of the
+ *              serialized vector of polynomials pk
+ *              and the public seed used to generate the matrix A.
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized public key
+ *              polyvec *pk: pointer to the input public-key polyvec.
+ *                Must have coefficients within [0,..,q-1].
+ *              const uint8_t *seed: pointer to the input public seed
+ **************************************************/
+static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
+                    const uint8_t seed[MLKEM_SYMBYTES])
+{
+  POLYVEC_UBOUND(pk, MLKEM_Q); /* debug check of the documented [0,q-1] range */
+  polyvec_tobytes(r, pk);
+  memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+}
+
+/*************************************************
+ * Name:        unpack_pk
+ *
+ * Description: De-serialize public key from a byte array;
+ *              approximate inverse of pack_pk
+ *
+ * Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
+ *                  Coefficients will be normalized to [0,..,q-1].
+ *              - uint8_t *seed: pointer to output seed to generate matrix A
+ *              - const uint8_t *packedpk: pointer to input serialized public
+ *                  key.
+ **************************************************/
+static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
+                      const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
+{
+  polyvec_frombytes(pk, packedpk);
+  memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+
+  /*
+   * TODO! We know from the modulus check that this will result in an
+   * unsigned canonical polynomial, but CBMC does not know it. We should
+   * weaken the specification of `unpack_pk()` and all depending functions
+   * to work with the weaker 4096-bound, so that the proofs go through
+   * without the need of this redundant call to polyvec_reduce().
+   */
+  polyvec_reduce(pk);
+}
+
+/*************************************************
+ * Name:        pack_sk
+ *
+ * Description: Serialize the secret key
+ *
+ * Arguments:   - uint8_t *r: pointer to output serialized secret key
+ *              - polyvec *sk: pointer to input vector of polynomials
+ *                  (secret key); checked against [0,..,q-1] in debug builds
+ **************************************************/
+static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
+{
+  POLYVEC_UBOUND(sk, MLKEM_Q); /* debug-only check: 0 <= coeff < q */
+  polyvec_tobytes(r, sk);
+}
+
+/*************************************************
+ * Name:        unpack_sk
+ *
+ * Description: De-serialize the secret key; inverse of pack_sk
+ *
+ * Arguments:   - polyvec *sk: pointer to output vector of polynomials
+ *                  (secret key)
+ *              - const uint8_t *packedsk: pointer to input serialized
+ *                  secret key
+ **************************************************/
+static void unpack_sk(polyvec *sk,
+                      const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  polyvec_frombytes(sk, packedsk);
+  polyvec_reduce(sk);
+}
+
+/*************************************************
+ * Name:        pack_ciphertext
+ *
+ * Description: Serialize the ciphertext as concatenation of the
+ *              compressed and serialized vector of polynomials b
+ *              and the compressed and serialized polynomial v
+ *
+ * Arguments:   uint8_t *r: pointer to the output serialized ciphertext
+ *              polyvec *b: pointer to the input vector of polynomials b
+ *              poly *v: pointer to the input polynomial v
+ **************************************************/
+static void pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], polyvec *b, poly *v)
+{
+  polyvec_compress_du(r, b);
+  poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
+}
+
+/*************************************************
+ * Name:        unpack_ciphertext
+ *
+ * Description: De-serialize and decompress ciphertext from a byte array;
+ *              approximate inverse of pack_ciphertext
+ *
+ * Arguments:   - polyvec *b: pointer to the output vector of polynomials b
+ *              - poly *v: pointer to the output polynomial v
+ *              - const uint8_t *c: pointer to the input serialized ciphertext
+ **************************************************/
+static void unpack_ciphertext(polyvec *b, poly *v,
+                              const uint8_t c[MLKEM_INDCPA_BYTES])
+{
+  polyvec_decompress_du(b, c);
+  poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
+}
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + SHAKE128_RATE) / SHAKE128_RATE)
+#endif
+
+/*
+ * Generate four A matrix entries from a seed, using rejection
+ * sampling on the output of a XOF.
+ */
+STATIC_TESTABLE
+void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  shake128x4incctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* Each seed is MLKEM_SYMBYTES + 2 bytes: public seed plus two index bytes */
+  shake128x4_absorb_once(&statex, seed[0], seed[1], seed[2], seed[3],
+                    MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                           &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block a time until we're done.
+   */
+  buflen = SHAKE128_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2] - 1, 0, (MLKEM_Q - 1)))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3] - 1, 0, (MLKEM_Q - 1))))
+  {
+    shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  shake128x4_inc_ctx_release(&statex);
+}
+
+/*
+ * Generate a single A matrix entry from a (MLKEM_SYMBYTES + 2)-byte seed,
+ * using rejection sampling on the output of a XOF (SHAKE128).
+ */
+STATIC_TESTABLE
+void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+{
+  shake128incctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  shake128_absorb_once(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entry with high probability.
+   */
+  shake128_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block a time until we're done */
+  buflen = SHAKE128_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(0 <= ctr && ctr <= MLKEM_N)
+    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr - 1,
+                                          0, (MLKEM_Q - 1))))
+  {
+    shake128_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  shake128_inc_ctx_release(&state);
+}
+
+/*************************************************
+ * Name:        gen_matrix
+ *
+ * Description: Deterministically generate matrix A (or the transpose of A)
+ *              from a seed. Entries of the matrix are polynomials that look
+ *              uniformly random. Performs rejection sampling on output of
+ *              a XOF
+ *
+ * Arguments:   - polyvec *a: pointer to output matrix A
+ *              - const uint8_t *seed: pointer to input seed
+ *              - int transposed: boolean deciding whether A or A^T is generated
+ **************************************************/
+/* Not static for benchmarking */
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+{
+  int i;
+  unsigned int j;
+  /*
+   * We generate four separate seed arrays rather than a single one to work
+   * around limitations in CBMC function contracts dealing with disjoint slices
+   * of the same parent object.
+   */
+
+  ALIGN uint8_t seed0[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed1[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed2[MLKEM_SYMBYTES + 2];
+  ALIGN uint8_t seed3[MLKEM_SYMBYTES + 2];
+  uint8_t *seedxy[4];
+  seedxy[0] = seed0;
+  seedxy[1] = seed1;
+  seedxy[2] = seed2;
+  seedxy[3] = seed3;
+
+  for (j = 0; j < KECCAK_WAY; j++)
+  {
+    memcpy(seedxy[j], seed, MLKEM_SYMBYTES);
+  }
+
+  for (i = 0; i < (MLKEM_K * MLKEM_K / KECCAK_WAY) * KECCAK_WAY;
+       i += KECCAK_WAY)
+  {
+    uint8_t x, y;
+
+    for (j = 0; j < KECCAK_WAY; j++)
+    {
+      x = (i + j) / MLKEM_K;
+      y = (i + j) % MLKEM_K;
+      if (transposed)
+      {
+        seedxy[j][MLKEM_SYMBYTES + 0] = x;
+        seedxy[j][MLKEM_SYMBYTES + 1] = y;
+      }
+      else
+      {
+        seedxy[j][MLKEM_SYMBYTES + 0] = y;
+        seedxy[j][MLKEM_SYMBYTES + 1] = x;
+      }
+    }
+
+    /*
+     * This call writes across polyvec boundaries for K=2 and K=3.
+     * This is intentional and safe.
+     */
+    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+  }
+
+  /* For left over polynomial, we use single keccak. */
+  if (i < MLKEM_K * MLKEM_K)
+  {
+    uint8_t x, y;
+    x = i / MLKEM_K;
+    y = i % MLKEM_K;
+
+    if (transposed)
+    {
+      seed0[MLKEM_SYMBYTES + 0] = x;
+      seed0[MLKEM_SYMBYTES + 1] = y;
+    }
+    else
+    {
+      seed0[MLKEM_SYMBYTES + 0] = y;
+      seed0[MLKEM_SYMBYTES + 1] = x;
+    }
+
+    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    i++;
+  }
+
+  cassert(i == MLKEM_K * MLKEM_K,
+          "gen_matrix: failed to generate whole matrix");
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+  /*
+   * The public matrix is generated in NTT domain. If the native backend
+   * uses a custom order in NTT domain, permute A accordingly.
+   */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    for (j = 0; j < MLKEM_K; j++)
+    {
+      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+    }
+  }
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+/*************************************************
+ * Name:        matvec_mul
+ *
+ * Description: Computes matrix-vector product in NTT domain,
+ *              via Montgomery multiplication.
+ *
+ * Arguments:   - polyvec *out: Pointer to output polynomial vector
+ *              - polyvec a[MLKEM_K]: Input matrix. Must be in NTT domain
+ *                  and have coefficients of absolute value < MLKEM_Q.
+ *              - polyvec *v: Input polynomial vector. Must be in NTT domain.
+ *              - polyvec *vc: Mulcache for v, computed via
+ *                  polyvec_mulcache_compute().
+ **************************************************/
+STATIC_TESTABLE
+void matvec_mul(polyvec *out, const polyvec a[MLKEM_K], const polyvec *v,
+                const polyvec_mulcache *vc)
+__contract__(
+  requires(memory_no_alias(out, sizeof(polyvec)))
+  requires(memory_no_alias(a, sizeof(polyvec) * MLKEM_K))
+  requires(memory_no_alias(v, sizeof(polyvec)))
+  requires(memory_no_alias(vc, sizeof(polyvec_mulcache)))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+  forall(int, k1, 0, MLKEM_K - 1,
+    array_abs_bound(a[k0].vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))))
+  assigns(object_whole(out)))
+{
+  int i;
+  for (i = 0; i < MLKEM_K; i++)
+  __loop__(
+    assigns(i, object_whole(out))
+    invariant(i >= 0 && i <= MLKEM_K))
+  {
+    polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
+  }
+}
+
+/*************************************************
+ * Name:        indcpa_keypair_derand
+ *
+ * Description: Generates public and private key for the CPA-secure
+ *              public-key encryption scheme underlying ML-KEM
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                             (of length MLKEM_INDCPA_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                             (of length MLKEM_INDCPA_SECRETKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                             (of length MLKEM_SYMBYTES bytes)
+ **************************************************/
+/* Check that the arithmetic in indcpa_keypair_derand() does not overflow */
+STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_keypair_bound_0)
+
+void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[MLKEM_SYMBYTES])
+{
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  const uint8_t *publicseed = buf;
+  const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
+  polyvec a[MLKEM_K], e, pkpv, skpv;
+  polyvec_mulcache skpv_cache;
+
+  ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+  /* Concatenate coins with MLKEM_K for domain separation of security levels */
+  memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+  coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
+
+  hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
+
+  gen_matrix(a, publicseed, 0 /* no transpose */);
+
+#if MLKEM_K == 2
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, e.vec + 0, e.vec + 1,
+                        noiseseed, 0, 1, 2, 3);
+#elif MLKEM_K == 3
+  /*
+   * Only the first three output buffers are needed.
+   * The last parameter is a dummy that's overwritten later.
+   */
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2,
+                        pkpv.vec + 0 /* irrelevant */, noiseseed, 0, 1, 2,
+                        0xFF /* irrelevant */);
+  /* Same here */
+  poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2,
+                        pkpv.vec + 0 /* irrelevant */, noiseseed, 3, 4, 5,
+                        0xFF /* irrelevant */);
+#elif MLKEM_K == 4
+  poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3,
+                        noiseseed, 0, 1, 2, 3);
+  poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed,
+                        4, 5, 6, 7);
+#endif
+
+  polyvec_ntt(&skpv);
+  polyvec_ntt(&e);
+
+  polyvec_mulcache_compute(&skpv_cache, &skpv);
+  matvec_mul(&pkpv, a, &skpv, &skpv_cache);
+  polyvec_tomont(&pkpv);
+
+  /* Arithmetic cannot overflow, see static assertion at the top */
+  polyvec_add(&pkpv, &e);
+  polyvec_reduce(&pkpv);
+  polyvec_reduce(&skpv);
+
+  pack_sk(sk, &skpv);
+  pack_pk(pk, &pkpv, publicseed);
+}
+
+/*************************************************
+ * Name:        indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure
+ *              public-key encryption scheme underlying Kyber.
+ *
+ * Arguments:   - uint8_t *c: pointer to output ciphertext
+ *                            (of length MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: pointer to input message
+ *                                  (of length MLKEM_INDCPA_MSGBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                                   (of length MLKEM_INDCPA_PUBLICKEYBYTES)
+ *              - const uint8_t *coins: pointer to input random coins used as
+ *seed (of length MLKEM_SYMBYTES) to deterministically generate all randomness
+ **************************************************/
+
+/* Check that the arithmetic in indcpa_enc() does not overflow */
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
+              indcpa_enc_bound_1)
+
+void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[MLKEM_SYMBYTES])
+{
+  ALIGN uint8_t seed[MLKEM_SYMBYTES];
+  polyvec sp, pkpv, ep, at[MLKEM_K], b;
+  poly v, k, epp;
+  polyvec_mulcache sp_cache;
+
+  unpack_pk(&pkpv, seed, pk);
+  poly_frommsg(&k, m);
+  gen_matrix(at, seed, 1 /* transpose */);
+
+#if MLKEM_K == 2
+  poly_getnoise_eta1122_4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1,
+                           coins, 0, 1, 2, 3);
+  poly_getnoise_eta2(&epp, coins, 4);
+#elif MLKEM_K == 3
+  /*
+   * In this call, only the first three output buffers are needed.
+   * The last parameter is a dummy that's overwritten later.
+   */
+  poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, &b.vec[0], coins, 0,
+                        1, 2, 0xFF);
+  /* The fourth output buffer in this call _is_ used. */
+  poly_getnoise_eta2_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, &epp, coins, 3, 4,
+                        5, 6);
+#elif MLKEM_K == 4
+  poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins,
+                        0, 1, 2, 3);
+  poly_getnoise_eta2_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins,
+                        4, 5, 6, 7);
+  poly_getnoise_eta2(&epp, coins, 8);
+#endif
+
+  polyvec_ntt(&sp);
+
+  polyvec_mulcache_compute(&sp_cache, &sp);
+  matvec_mul(&b, at, &sp, &sp_cache);
+  polyvec_basemul_acc_montgomery_cached(&v, &pkpv, &sp, &sp_cache);
+
+  polyvec_invntt_tomont(&b);
+  poly_invntt_tomont(&v);
+
+  /* Arithmetic cannot overflow, see static assertion at the top */
+  polyvec_add(&b, &ep);
+  poly_add(&v, &epp);
+  poly_add(&v, &k);
+
+  polyvec_reduce(&b);
+  poly_reduce(&v);
+
+  pack_ciphertext(c, &b, &v);
+}
+
+/* Check that the arithmetic in indcpa_dec() does not overflow */
+STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
+
+void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+{
+  polyvec b, skpv;
+  poly v, sb;
+
+  unpack_ciphertext(&b, &v, c);
+  unpack_sk(&skpv, sk);
+
+  polyvec_ntt(&b);
+  polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
+  poly_invntt_tomont(&sb);
+
+  /* Arithmetic cannot overflow, see static assertion at the top */
+  poly_sub(&v, &sb);
+  poly_reduce(&v);
+
+  poly_tomsg(m, &v);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
new file mode 100644
index 0000000000..3f57eb1295
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef INDCPA_H
+#define INDCPA_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+#include "polyvec.h"
+
+
+#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec) * MLKEM_K))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires(transposed == 0 || transposed == 1)
+  assigns(object_whole(a))
+  ensures(forall(int, x, 0, MLKEM_K - 1, forall(int, y, 0, MLKEM_K - 1,
+  array_bound(a[x].vec[y].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))));
+);
+
+#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+/*************************************************
+ * Name:        indcpa_enc
+ *
+ * Description: Encryption function of the CPA-secure
+ *              public-key encryption scheme underlying Kyber.
+ *
+ * Arguments:   - uint8_t *c: output ciphertext (MLKEM_INDCPA_BYTES bytes)
+ *              - const uint8_t *m: pointer to input message
+ *                                  (of length MLKEM_INDCPA_MSGBYTES)
+ *              - const uint8_t *pk: pointer to input public key
+ *                                   (of length MLKEM_INDCPA_PUBLICKEYBYTES)
+ *              - const uint8_t *coins: input randomness (MLKEM_SYMBYTES bytes)
+ **************************************************/
+void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[MLKEM_SYMBYTES])
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(c))
+);
+
+#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+                const uint8_t c[MLKEM_INDCPA_BYTES],
+                const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
+  requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+  assigns(object_whole(m))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c
new file mode 100644
index 0000000000..f84ee3f3da
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "kem.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "indcpa.h"
+#include "params.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#if defined(CBMC)
+/* Redeclaration with contract needed for CBMC only */
+int memcmp(const void *str1, const void *str2, size_t n)
+__contract__(
+  requires(memory_no_alias(str1, n))
+  requires(memory_no_alias(str2, n))
+);
+#endif
+
+/*************************************************
+ * Name:        check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS203,
+ *              i.e., ensures that coefficients are in [0,q-1].
+ *              Described in Section 7.2 of FIPS203.
+ *
+ * Arguments:   - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 on failure
+ **************************************************/
+static int check_pk(const uint8_t pk[MLKEM_PUBLICKEYBYTES])
+{
+  polyvec p;
+  uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+  polyvec_frombytes(&p, pk);
+  polyvec_reduce(&p);
+  polyvec_tobytes(p_reencoded, &p);
+  /* Data is public, so a variable-time memcmp() is OK */
+  if (memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS203,
+ *              i.e., ensures that
+ *              sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *              Described in Section 7.3 of FIPS203.
+ *
+ * Arguments:   - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 on failure
+ **************************************************/
+static int check_sk(const uint8_t sk[MLKEM_SECRETKEYBYTES])
+{
+  uint8_t test[MLKEM_SYMBYTES]; /* hash of the public key embedded in sk */
+  /*
+   * The parts of `sk` being hashed and compared here are public, so
+   * no secret information is leaked through the runtime or the return value
+   * of this function.
+   */
+  hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES, MLKEM_PUBLICKEYBYTES);
+  if (memcmp(sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
+             MLKEM_SYMBYTES))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        crypto_kem_keypair_derand
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with
+ *                2*MLKEM_SYMBYTES random bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins)
+{
+  indcpa_keypair_derand(pk, sk, coins);
+  memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_PUBLICKEYBYTES);
+  hash_h(sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
+         MLKEM_PUBLICKEYBYTES);
+  /* Value z for pseudo-random output on reject */
+  memcpy(sk + MLKEM_SECRETKEYBYTES - MLKEM_SYMBYTES, coins + MLKEM_SYMBYTES,
+         MLKEM_SYMBYTES);
+  return 0;
+}
+
+int crypto_kem_keypair(uint8_t *pk, uint8_t *sk)
+{
+  ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+  randombytes(coins, 2 * MLKEM_SYMBYTES);
+  crypto_kem_keypair_derand(pk, sk, coins);
+  return 0;
+}
+
+int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk,
+                          const uint8_t *coins)
+{
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES]; /* coins || H(pk) */
+  /* Will contain key, coins (the two halves of the hash_g output) */
+  ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+
+  if (check_pk(pk))
+  {
+    return -1; /* pk failed the modulus check; ct and ss are not written */
+  }
+
+  memcpy(buf, coins, MLKEM_SYMBYTES);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_PUBLICKEYBYTES);
+  hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
+
+  /* Encrypt the first half of buf; coins are in kr+MLKEM_SYMBYTES */
+  indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+
+  memcpy(ss, kr, MLKEM_SYMBYTES); /* shared secret = first half of kr */
+  return 0;
+}
+
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+{
+  ALIGN uint8_t coins[MLKEM_SYMBYTES];
+  randombytes(coins, MLKEM_SYMBYTES);
+  return crypto_kem_enc_derand(ct, ss, pk, coins);
+}
+
+int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+{
+  uint8_t fail;
+  ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
+  /* Will contain key, coins (the two halves of the hash_g output) */
+  ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+  ALIGN uint8_t cmp[MLKEM_CIPHERTEXTBYTES + MLKEM_SYMBYTES];
+  const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+
+  if (check_sk(sk))
+  {
+    return -1; /* public key hash check failed; ss is not written */
+  }
+
+  indcpa_dec(buf, ct, sk); /* decrypted message goes to first half of buf */
+
+  /* Append stored H(pk): multitarget countermeasure + contributory KEM */
+  memcpy(buf + MLKEM_SYMBYTES, sk + MLKEM_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+         MLKEM_SYMBYTES);
+  hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
+
+  /* Re-encrypt into cmp; coins are in kr+MLKEM_SYMBYTES */
+  indcpa_enc(cmp, buf, pk, kr + MLKEM_SYMBYTES);
+
+  fail = ct_memcmp(ct, cmp, MLKEM_CIPHERTEXTBYTES);
+
+  /* Compute rejection key from z (last MLKEM_SYMBYTES bytes of sk) and ct */
+  rkprf(ss, sk + MLKEM_SECRETKEYBYTES - MLKEM_SYMBYTES, ct);
+
+  /* Copy true key to return buffer if fail is 0 */
+  ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+
+  return 0;
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h
new file mode 100644
index 0000000000..6a33be7c7e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef KEM_H
+#define KEM_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+
+#define CRYPTO_SECRETKEYBYTES MLKEM_SECRETKEYBYTES
+#define CRYPTO_PUBLICKEYBYTES MLKEM_PUBLICKEYBYTES
+#define CRYPTO_CIPHERTEXTBYTES MLKEM_CIPHERTEXTBYTES
+#define CRYPTO_BYTES MLKEM_SSBYTES
+
+#if (MLKEM_K == 2)
+#define CRYPTO_ALGNAME "Kyber512"
+#elif (MLKEM_K == 3)
+#define CRYPTO_ALGNAME "Kyber768"
+#elif (MLKEM_K == 4)
+#define CRYPTO_ALGNAME "Kyber1024"
+#endif
+
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE(keypair_derand)
+int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define crypto_kem_keypair MLKEM_NAMESPACE(keypair)
+/*************************************************
+ * Name:        crypto_kem_keypair
+ *
+ * Description: Generates public and private key
+ *              for CCA-secure ML-KEM key encapsulation mechanism
+ *
+ * Arguments:   - uint8_t *pk: pointer to output public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - uint8_t *sk: pointer to output private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_kem_keypair(uint8_t *pk, uint8_t *sk)
+__contract__(
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+);
+
+#define crypto_kem_enc_derand MLKEM_NAMESPACE(enc_derand)
+/*************************************************
+ * Name:        crypto_kem_enc_derand
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *              - const uint8_t *coins: pointer to input randomness
+ *                (an already allocated array filled with MLKEM_SYMBYTES
+ *                random bytes)
+ *
+ * Returns 0 on success, and -1 if the public key modulus check (see Section 7.2
+ * of FIPS203) fails.
+ **************************************************/
+int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk,
+                          const uint8_t *coins)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  requires(memory_no_alias(coins, MLKEM_SYMBYTES))
+  assigns(object_whole(ct))
+  assigns(object_whole(ss))
+);
+
+#define crypto_kem_enc MLKEM_NAMESPACE(enc)
+/*************************************************
+ * Name:        crypto_kem_enc
+ *
+ * Description: Generates cipher text and shared
+ *              secret for given public key
+ *
+ * Arguments:   - uint8_t *ct: pointer to output cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *pk: pointer to input public key
+ *                (an already allocated array of MLKEM_PUBLICKEYBYTES bytes)
+ *
+ * Returns 0 on success, and -1 if the public key modulus check (see Section 7.2
+ * of FIPS203) fails.
+ **************************************************/
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+__contract__(
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(pk, MLKEM_PUBLICKEYBYTES))
+  assigns(object_whole(ct))
+  assigns(object_whole(ss))
+);
+
+#define crypto_kem_dec MLKEM_NAMESPACE(dec)
+/*************************************************
+ * Name:        crypto_kem_dec
+ *
+ * Description: Generates shared secret for given
+ *              cipher text and private key
+ *
+ * Arguments:   - uint8_t *ss: pointer to output shared secret
+ *                (an already allocated array of MLKEM_SSBYTES bytes)
+ *              - const uint8_t *ct: pointer to input cipher text
+ *                (an already allocated array of MLKEM_CIPHERTEXTBYTES bytes)
+ *              - const uint8_t *sk: pointer to input private key
+ *                (an already allocated array of MLKEM_SECRETKEYBYTES bytes)
+ *
+ * Returns -1 if the secret key hash check (see Section 7.3 of FIPS203) fails,
+ * in which case ss is not written; returns 0 otherwise.
+ *
+ * If ct fails the re-encryption check, 0 is returned and ss is pseudo-random.
+ **************************************************/
+int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+__contract__(
+  requires(memory_no_alias(ss, MLKEM_SSBYTES))
+  requires(memory_no_alias(ct, MLKEM_CIPHERTEXTBYTES))
+  requires(memory_no_alias(sk, MLKEM_SECRETKEYBYTES))
+  assigns(object_whole(ss))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
new file mode 100644
index 0000000000..1844ca19fd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "ntt.h"
+#include <stdint.h>
+#include "params.h"
+#include "reduce.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+#include "ntt.h"
+
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+STATIC_TESTABLE
+void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start, int len,
+                         int bound)
+__contract__(
+  requires(0 <= start && start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start - 1, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N - 1, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len - 1, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N - 1, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  int j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /* 
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j - 1,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len - 1, bound))
+    invariant(array_abs_bound(r, start + len, j + len - 1,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N - 1,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+STATIC_TESTABLE
+void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N - 1, layer * MLKEM_Q - 1))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N - 1, (layer + 1) * MLKEM_Q - 1)))
+{
+  int start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(0 <= start && start < MLKEM_N + 2 * len)
+    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start - 1, (layer * MLKEM_Q - 1) + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N - 1, layer * MLKEM_Q - 1)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q - 1);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ * REF-CHANGE: Removed indirection poly_ntt -> ntt()
+ * and integrated polynomial reduction into the NTT.
+ */
+
+
+void poly_ntt(poly *p)
+{
+  int len, layer;
+  int16_t *r;
+  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, layer * MLKEM_Q - 1)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+/* Check that bound for native (forward) NTT implies contractual NTT_BOUND */
+STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, ntt_bound)
+
+void poly_ntt(poly *p)
+{
+  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  ntt_native(p);
+  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Check that bound for reference invNTT implies contractual bound */
+#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
+STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
+
+/* Compute one layer of inverse NTT */
+STATIC_TESTABLE
+void invntt_layer(int16_t *r, int len, int layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+{
+  int start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q))
+    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    int j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  int j, len, layer;
+  const int16_t f = 1441; /* = 2^32 / 128 mod MLKEM_Q */
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j - 1, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the 7 invNTT layers: len = 2, 4, ..., 128 (layer = 7 down to 1) */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N - 1, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+/* Check that bound for native invNTT implies contractual bound */
+STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
+
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p);
+  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+/*************************************************
+ * Name:        basemul_cached
+ *
+ * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
+ *              used for multiplication of elements in Rq in NTT domain
+ *
+ *              Bounds:
+ *              - a is assumed to be < q in absolute value.
+ *              - Return value < 3/2 q in absolute value
+ *
+ * Arguments:   - int16_t r[2]: pointer to the output polynomial
+ *              - const int16_t a[2]: pointer to the first factor
+ *              - const int16_t b[2]: pointer to the second factor
+ *              - int16_t b_cached: Cached precomputation of b[1] * zeta
+ **************************************************/
+void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                    int16_t b_cached)
+{
+  int32_t t0, t1;
+
+  BOUND(a, 2, MLKEM_Q, "basemul input bound");
+
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
+
+  /* |ti| < 2 * q * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
+
+  /* |r[i]| < 3/2 q */
+  BOUND(r, 2, 3 * MLKEM_Q / 2, "basemul output bound");
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
new file mode 100644
index 0000000000..0f7b30624b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+#include "arith_native.h"
+#include "cbmc.h"
+#include "params.h"
+#include "poly.h"
+#include "reduce.h"
+
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *p: pointer to in/output polynomial
+ **************************************************/
+
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_Q - 1))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, NTT_BOUND - 1))
+);
+
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, INVNTT_BOUND - 1))
+);
+
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+/************************************************************
+ * Name: basemul_cached
+ *
+ * Description: Computes a representative modulo q of
+ *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
+ *
+ *              If b_cached is b1*zeta, this represents the
+ *              product of (a0 + a1*X) and (b0 + b1*X) in
+ *              Fq[X]/(X^2 - zeta).
+ *
+ * Arguments: - r: Pointer to output polynomial
+ *                   Upon return, coefficients are bound by
+ *                   3*(q+1)/2 in absolute value.
+ *            - a: Pointer to first input polynomial
+ *                   Must be coefficient-wise < q in absolute value.
+ *            - b: Pointer to second input polynomial
+ *                   Can have arbitrary int16_t coefficients
+ *            - b_cached: Some precomputed value, typically derived from
+ *                   b1 and a twiddle factor. Can be an arbitrary int16_t.
+ ************************************************************/
+void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                    int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_abs_bound(a, 0, 1, MLKEM_Q - 1))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 1, (3 * HALF_Q - 1)))
+);
+
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
new file mode 100644
index 0000000000..da8041267e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include "common.h"
+#include "cpucap.h"
+
+#define KECCAK_WAY 4
+
+#ifndef MLKEM_K
+#define MLKEM_K 3 /* Change this for different security strengths */
+#endif
+
+/* Don't change parameters below this line */
+#if (MLKEM_K == 2)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM512_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM512_##s
+#elif (MLKEM_K == 3)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM768_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM768_##s
+#elif (MLKEM_K == 4)
+#define MLKEM_NAMESPACE(s) PQCP_MLKEM_NATIVE_MLKEM1024_##s
+#define _MLKEM_NAMESPACE(s) _PQCP_MLKEM_NATIVE_MLKEM1024_##s
+#else
+#error "MLKEM_K must be in {2,3,4}"
+#endif
+
+#define MLKEM_N 256
+#define MLKEM_Q 3329
+
+#define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
+#define MLKEM_SSBYTES 32  /* size in bytes of shared key */
+
+#define MLKEM_POLYBYTES 384
+#define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
+
+#if MLKEM_K == 2
+#define MLKEM_ETA1 3
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 3
+#define MLKEM_ETA1 2
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#elif MLKEM_K == 4
+#define MLKEM_ETA1 2
+#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
+#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
+#endif
+
+#define MLKEM_ETA2 2
+
+#define MLKEM_INDCPA_MSGBYTES (MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_PUBLICKEYBYTES (MLKEM_POLYVECBYTES + MLKEM_SYMBYTES)
+#define MLKEM_INDCPA_SECRETKEYBYTES (MLKEM_POLYVECBYTES)
+#define MLKEM_INDCPA_BYTES \
+  (MLKEM_POLYVECCOMPRESSEDBYTES_DU + MLKEM_POLYCOMPRESSEDBYTES_DV)
+
+#define MLKEM_PUBLICKEYBYTES (MLKEM_INDCPA_PUBLICKEYBYTES)
+/* 2 * 32 bytes of additional space to save H(pk) and the rejection value z */
+#define MLKEM_SECRETKEYBYTES                                   \
+  (MLKEM_INDCPA_SECRETKEYBYTES + MLKEM_INDCPA_PUBLICKEYBYTES + \
+   2 * MLKEM_SYMBYTES)
+#define MLKEM_CIPHERTEXTBYTES (MLKEM_INDCPA_BYTES)
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
new file mode 100644
index 0000000000..93a663c12b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly.h"
+#include <stdint.h>
+#include <string.h>
+#include "cbd.h"
+#include "cbmc.h"
+#include "fips202x4.h"
+#include "ntt.h"
+#include "params.h"
+#include "reduce.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#include "arith_native.h"
+#include "debug/debug.h"
+
+void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+{
+  int j;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  {
+    int k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k >= 0 && k <= 8)
+      invariant(forall(int, r, 0, k - 1, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  {
+    int k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k >= 0 && k <= 4)
+      invariant(forall(int, r, 0, k - 1, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
+#endif
+}
+
+
+void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+{
+  int j;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j - 1, 0, (MLKEM_Q - 1))))
+  {
+    int k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(0 <= k && k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k - 1, 0, (MLKEM_Q - 1))))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j - 1, 0, (MLKEM_Q - 1))))
+  {
+    int k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(0 <= k && k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k - 1, 0, (MLKEM_Q - 1))))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
+#endif
+}
+
+void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+{
+  int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(t, 0, (j-1), 0, 15)))
+    {
+      /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * REF-CHANGE: Use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(t, 0, (j-1), 0, 31)))
+    {
+      /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * REF-CHANGE: Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
+#endif
+}
+
+void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+{
+  int i;
+#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    /* REF-CHANGE: Hoist scalar decompression into separate function */
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, (8 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    int j;
+    uint8_t t[8];
+    const int offset = i * 5;
+    /*
+     * REF-CHANGE: Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, (8 * i + j - 1), 0, (MLKEM_Q - 1))))
+    {
+      /* REF-CHANGE: Hoist scalar decompression into separate function */
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+#else
+#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
+#endif
+
+  POLY_UBOUND(r, MLKEM_Q);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /* REF-CHANGE: Precondition change, we assume unsigned canonical data */
+
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  POLY_UBOUND(a, MLKEM_Q);
+  poly_tobytes_native(r, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, 4095)))
+  {
+    /* REF-CHANGE: Introduce some locals for better readability */
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  POLY_UBOUND(r, 4096);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  int i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, (8 * i - 1), 0, (MLKEM_Q - 1))))
+  {
+    int j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, (8 * i + j - 1), 0, (MLKEM_Q - 1))))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+}
+
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  int i;
+  POLY_UBOUND(a, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  {
+    int j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf[KECCAK_WAY][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  shake256x4(buf[0], buf[1], buf[2], buf[3], MLKEM_ETA1 * MLKEM_N / 4,
+             extkey[0], extkey[1], extkey[2], extkey[3], MLKEM_SYMBYTES + 1);
+  poly_cbd_eta1(r0, buf[0]);
+  poly_cbd_eta1(r1, buf[1]);
+  poly_cbd_eta1(r2, buf[2]);
+  poly_cbd_eta1(r3, buf[3]);
+
+  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
+  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
+  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
+  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
+}
+
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; /* PRF output fed to the CBD */
+  prf(buf, sizeof(buf), seed, nonce);
+  poly_cbd_eta2(r, buf);
+  /* r was sampled with poly_cbd_eta2, so bound-check with ETA2, not ETA1 */
+  POLY_BOUND_MSG(r, MLKEM_ETA2 + 1, "poly_getnoise_eta2 output");
+}
+
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+  shake256x4(buf1[0], buf1[1], buf2[0], buf2[1], MLKEM_ETA1 * MLKEM_N / 4,
+             extkey[0], extkey[1], extkey[2], extkey[3], MLKEM_SYMBYTES + 1);
+#else
+  shake256(buf1[0], sizeof(buf1[0]), extkey[0], sizeof(extkey[0]));
+  shake256(buf1[1], sizeof(buf1[1]), extkey[1], sizeof(extkey[1]));
+  shake256(buf2[0], sizeof(buf2[0]), extkey[2], sizeof(extkey[2]));
+  shake256(buf2[1], sizeof(buf2[1]), extkey[3], sizeof(extkey[3]));
+#endif
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
+  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
+  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
+  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
+}
+
+void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
+                                    const poly_mulcache *b_cache)
+{
+  int i;
+  POLY_BOUND(b_cache, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), (3 * HALF_Q - 1))))
+  {
+    basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
+                   b_cache->coeffs[2 * i]);
+    basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
+                   &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
+  }
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+void poly_tomont(poly *r)
+{
+  int i;
+  const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs ,0, (i - 1), (MLKEM_Q - 1))))
+  {
+    r->coeffs[i] = fqmul(r->coeffs[i], f);
+  }
+
+  POLY_BOUND(r, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
+void poly_tomont(poly *r)
+{
+  poly_tomont_native(r);
+  POLY_BOUND(r, MLKEM_Q);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+void poly_reduce(poly *r)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(array_bound(r->coeffs, 0, (i - 1), 0, (MLKEM_Q - 1))))
+  {
+    /* Barrett reduction, giving signed canonical representative */
+    int16_t t = barrett_reduce(r->coeffs[i]);
+    /* Conditional addition to get unsigned canonical representative */
+    r->coeffs[i] = scalar_signed_to_unsigned_q(t);
+  }
+
+  POLY_UBOUND(r, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
+void poly_reduce(poly *r)
+{
+  poly_reduce_native(r);
+  POLY_UBOUND(r, MLKEM_Q);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+void poly_add(poly *r, const poly *b)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(forall(int, k0, i, MLKEM_N - 1, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(int, k1, 0, i - 1, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
+  {
+    r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+  }
+}
+
+void poly_sub(poly *r, const poly *b)
+{
+  int i;
+  for (i = 0; i < MLKEM_N; i++)
+  __loop__(
+    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(forall(int, k0, i, MLKEM_N - 1, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(int, k1, 0, i - 1, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
+  {
+    r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+  }
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+{
+  int i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  {
+    x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
+    x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+  }
+  POLY_BOUND(x, MLKEM_Q);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+{
+  poly_mulcache_compute_native(x, a);
+  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+}
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
new file mode 100644
index 0000000000..35990684b6
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_H
+#define POLY_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+#include "reduce.h"
+#include "verify.h"
+
+/* Absolute exclusive upper bound for the output of the inverse NTT */
+#define INVNTT_BOUND (8 * MLKEM_Q)
+
+/* Absolute exclusive upper bound for the output of the forward NTT */
+#define NTT_BOUND (8 * MLKEM_Q)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N];
+} ALIGN poly;
+
+/*
+ * INTERNAL representation of precomputed data speeding up
+ * the base multiplication of two polynomials in NTT domain.
+ */
+/*
+ * REF-CHANGE: This structure does not exist in the reference
+ * implementation.
+ */
+typedef struct
+{
+  int16_t coeffs[MLKEM_N >> 1];
+} poly_mulcache;
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
+  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
+  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
+
+  /* and therefore cast to uint16_t is safe. */
+  return (uint16_t)c;
+}
+
+#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DU)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+);
+
+#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+);
+
+#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV
+ *                             bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Coefficients must already be in unsigned canonical form;
+ *              negative coefficients violate the contract below.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, 4095))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1)))
+  assigns(object_whole(msg))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive; r3 is a separate object */
+    (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+     r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N - 1, MLKEM_ETA1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N - 1, MLKEM_ETA2)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N - 1, MLKEM_ETA2))
+);
+
+#define poly_basemul_montgomery_cached \
+  MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
+/*************************************************
+ * Name:        poly_basemul_montgomery_cached
+ *
+ * Description: Multiplication of two polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *
+ *              The result is coefficient-wise bound by 3/2 q in absolute
+ *              value.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ *              - const poly_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial. Can be computed
+ *                  via poly_mulcache_compute().
+ **************************************************/
+void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
+                                    const poly_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(memory_no_alias(b_cache, sizeof(poly_mulcache)))
+  requires(array_abs_bound(a->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (3 * HALF_Q - 1)))
+);
+
+#define poly_tomont MLKEM_NAMESPACE(poly_tomont)
+/*************************************************
+ * Name:        poly_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+void poly_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))
+);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define poly_mulcache_compute MLKEM_NAMESPACE(poly_mulcache_compute)
+/************************************************************
+ * Name: poly_mulcache_compute
+ *
+ * Description: Computes the mulcache for a polynomial in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+void poly_mulcache_compute(poly_mulcache *x, const poly *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(poly_mulcache)))
+  requires(memory_no_alias(a, sizeof(poly)))
+  assigns(object_whole(x))
+);
+
+#define poly_reduce MLKEM_NAMESPACE(poly_reduce)
+/*************************************************
+ * Name:        poly_reduce
+ *
+ * Description: Converts polynomial to _unsigned canonical_ representatives.
+ *
+ *              The input coefficients can be arbitrary integers in int16_t.
+ *              The output coefficients are in [0,1,...,MLKEM_Q-1].
+ *
+ * Arguments:   - poly *r: pointer to input/output polynomial
+ **************************************************/
+/*
+ * REF-CHANGE: The semantics of poly_reduce() is different in
+ * the reference implementation, which requires
+ * signed canonical output data. Unsigned canonical
+ * outputs are better suited to the only remaining
+ * use of poly_reduce() in the context of (de)serialization.
+ */
+void poly_reduce(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1)))
+);
+
+#define poly_add MLKEM_NAMESPACE(poly_add)
+/************************************************************
+ * Name: poly_add
+ *
+ * Description: Adds two polynomials in place
+ *
+ * Arguments: - r: Pointer to input-output polynomial to be added to.
+ *            - b: Pointer to input polynomial that should be added
+ *                 to r. Must be disjoint from r.
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ ************************************************************/
+/*
+ * REF-CHANGE:
+ * The reference implementation uses a 3-argument poly_add.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+void poly_add(poly *r, const poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(forall(int, k0, 0, MLKEM_N - 1, (int32_t) r->coeffs[k0] + b->coeffs[k0] <= INT16_MAX))
+  requires(forall(int, k1, 0, MLKEM_N - 1, (int32_t) r->coeffs[k1] + b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(int, k, 0, MLKEM_N - 1, r->coeffs[k] == old(*r).coeffs[k] + b->coeffs[k]))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define poly_sub MLKEM_NAMESPACE(poly_sub)
+/*************************************************
+ * Name:        poly_sub
+ *
+ * Description: Subtract two polynomials; no modular reduction is performed
+ *
+ * Arguments: - poly *r:       Pointer to input-output polynomial to be
+ *                             subtracted from.
+ *            - const poly *b: Pointer to second input polynomial
+ **************************************************/
+/*
+ * REF-CHANGE:
+ * The reference implementation uses a 3-argument poly_sub.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+void poly_sub(poly *r, const poly *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(forall(int, k0, 0, MLKEM_N - 1, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
+  requires(forall(int, k1, 0, MLKEM_N - 1, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
+  ensures(forall(int, k, 0, MLKEM_N - 1, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
+  assigns(object_whole(r))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
new file mode 100644
index 0000000000..5e4dd0c5c4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "polyvec.h"
+#include <stdint.h>
+#include "arith_native.h"
+#include "config.h"
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+
+#include "debug/debug.h"
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned int i;
+  POLYVEC_UBOUND(a, MLKEM_Q); /* NOTE(review): presumably a debug-only check */
+  /* Polynomial i is written to the i-th MLKEM_POLYCOMPRESSEDBYTES_DU bytes. */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned int i; /* index of the polynomial and of its input slot in a */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+  /* NOTE(review): presumably a debug-only check of the output bound. */
+  POLYVEC_UBOUND(r, MLKEM_Q);
+}
+
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* vec[i] is serialized into the i-th MLKEM_POLYBYTES-byte slot of r */
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* vec[i] is read from the i-th MLKEM_POLYBYTES-byte slot of a */
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+}
+
+void polyvec_ntt(polyvec *r)
+{
+  unsigned int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* poly_ntt() is applied to each entry of the vector in turn */
+    poly_ntt(&r->vec[i]);
+  }
+}
+
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* poly_invntt_tomont() is applied to each entry of the vector in turn */
+    poly_invntt_tomont(&r->vec[i]);
+  }
+}
+
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *              - b is assumed to be the output of a forward NTT and
+ *                thus coefficient-wise bound by NTT_BOUND
+ *              - b_cache is assumed to be coefficient-wise bound by
+ *                MLKEM_Q.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *            - const polyvec_mulcache *b_cache: mulcache for b
+ **************************************************/
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  int i;
+  poly t; /* scratch polynomial holding the i-th product before accumulation */
+
+  POLYVEC_BOUND(a, MLKEM_Q);
+  POLYVEC_BOUND(b, NTT_BOUND);
+  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  /* The first product is written directly to r; the rest are added via t. */
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+    /* abs bounds: < (i+1) * 3/2 * q */
+  }
+
+  /*
+   * Those bounds are true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus best to omit
+   * them from the spec to not unnecessarily constrain native implementations.
+   */
+  cassert(
+      array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_K * (3 * HALF_Q - 1)),
+      "polyvec_basemul_acc_montgomery_cached output bounds");
+  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
+  POLY_BOUND(r, MLKEM_K * 3 * HALF_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  POLYVEC_BOUND(a, MLKEM_Q);
+  POLYVEC_BOUND(b, NTT_BOUND);
+  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16. Builds a temporary mulcache for b.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache; /* local cache, recomputed from b on every call */
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+/*************************************************
+ * Name:        polyvec_mulcache_compute
+ *
+ * Description: Precompute values speeding up
+ *              base multiplications of polynomials
+ *              in NTT domain.
+ *
+ * Arguments: - polyvec_mulcache *x: pointer to output cache.
+ *            - const polyvec *a: pointer to input vector of polynomials
+ **************************************************/
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* x->vec[i] receives the mulcache of a->vec[i] */
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *              Per poly_reduce(), outputs lie in [0,1,...,MLKEM_Q-1].
+ * Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+ **************************************************/
+void polyvec_reduce(polyvec *r)
+{
+  unsigned int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+}
+
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* accumulate b->vec[i] into r->vec[i]; no reduction, see poly_add() */
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+void polyvec_tomont(polyvec *r)
+{
+  unsigned int i; /* polynomial index */
+  for (i = 0; i < MLKEM_K; i++)
+  { /* poly_tomont() is applied to each entry of the vector in turn */
+    poly_tomont(&r->vec[i]);
+  }
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
new file mode 100644
index 0000000000..7771fd3b28
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+/* REF-CHANGE: This struct does not exist in the reference implementation */
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(a->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(int, k0, 0, MLKEM_K - 1,
+         array_bound(a->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output coefficients are unsigned and in [0,..,4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+        array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, 4095)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (NTT_BOUND - 1))))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (INVNTT_BOUND - 1))))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a,
+                                    const polyvec *b);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - a is assumed to be coefficient-wise < q in absolute value.
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+/* Input is coefficient-wise < q in absolute value */
+  requires(forall(int, k1, 0, MLKEM_K - 1,
+ array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+/* REF-CHANGE: This function does not exist in the reference implementation */
+#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output polynomial
+ **************************************************/
+/*
+ * REF-CHANGE: The semantics of polyvec_reduce() is different in
+ *             the reference implementation, which requires
+ *             signed canonical output data. Unsigned canonical
+ *             outputs are better suited to the only remaining
+ *             use of poly_reduce() in the context of (de)serialization.
+ */
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, k0, 0, MLKEM_K - 1,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N - 1, 0, (MLKEM_Q - 1))))
+);
+
+#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(int, j0, 0, MLKEM_K - 1,
+          forall(int, k0, 0, MLKEM_N - 1,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(int, j1, 0, MLKEM_K - 1,
+          forall(int, k1, 0, MLKEM_N - 1,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(int, j, 0, MLKEM_K - 1,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))
+);
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.c
new file mode 100644
index 0000000000..db7baf0f56
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "reduce.h"
+#include <stdint.h>
+#include "params.h"
+
+/* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+static const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
+{
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q
+ *
+ *              Bounds: For any C such that |a| < q * C, the return value
+ *              has absolute value < q (C/2^16 + 1/2).
+ *
+ *              Notable special cases:
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < q * C with a signed-canonical value ( < q/2 ) has
+ *                absolute value < q * (0.0254 * C + 1/2).
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < q * C with a value t of |t| < q has absolute value
+ *                < q * (0.0508 * C + 1/2).
+ *              - The Montgomery multiplication of a value of absolute value
+ *                < C with a value of abs < q has absolute value
+ *                < q (C/2^16 + 1/2).
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
+{
+  /*
+   * Bounds on paper
+   * - Case |a| < q * C, for some C
+   * |t| <= |a|/2^16 + |t|*q/2^16
+   *      < q * C / 2^16 + q/2
+   *      = q (C/2^16 + 1/2)
+   * - Case |a| < (q/2) * C * q, for some C
+   * Replace C -> C * q in the above and estimate
+   * q / 2^17 < 0.0254.
+   */
+
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
+
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
+
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+
+  return (int16_t)r;
+}
+
+int16_t montgomery_reduce(int32_t a)
+{
+  int16_t res;
+  SCALAR_BOUND(a, 2 * MLKEM_Q * 32768, "montgomery_reduce input");
+
+  res = montgomery_reduce_generic(a);
+
+  SCALAR_BOUND(res, (3 * (MLKEM_Q + 1)) / 2, "montgomery_reduce output");
+  return res;
+}
+
+int16_t fqmul(int16_t a, int16_t b)
+{
+  int16_t res;
+  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+
+  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  return res;
+}
+
+/*
+ * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+ * multiplier is round_to_nearest(2**26/MLKEM_Q)
+ */
+#define BPOWER 26
+static const int32_t barrett_multiplier =
+    ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
+
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+int16_t barrett_reduce(int16_t a)
+{
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
+
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  return (int16_t)(a - t * MLKEM_Q);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
new file mode 100644
index 0000000000..2a486cf3ec
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "debug/debug.h"
+#include "params.h"
+
+#define MONT -1044                 /* 2^16 mod q */
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
+
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * q * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 3/2 q in absolute value.
+ **************************************************/
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * MLKEM_Q * 32768))
+  requires(a <  (2 * MLKEM_Q * 32768))
+  ensures(return_value > -(3 * HALF_Q) && return_value < (3 * HALF_Q))
+);
+
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+);
+
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+#define fqmul MLKEM_NAMESPACE(fqmul)
+int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+);
+
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
new file mode 100644
index 0000000000..4e8a5ce9b2
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "params.h"
+
+#include "arith_native.h"
+#include "rej_uniform.h"
+
+/*************************************************
+ * Name:        rej_uniform_scalar
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the whole input buffer
+ * is guaranteed to have been consumed. If it is equal to target, nothing
+ * is guaranteed about how many bytes of the input buffer were consumed.
+ **************************************************/
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr - 1, 0, (MLKEM_Q - 1))))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+    return offset + (unsigned)ret;
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
new file mode 100644
index 0000000000..aeb9cc3eb0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef REJ_UNIFORM_H
+#define REJ_UNIFORM_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "params.h"
+
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the whole input buffer
+ * is guaranteed to have been consumed. If it is equal to target, nothing
+ * is guaranteed about how many bytes of the input buffer were consumed.
+ **************************************************/
+
+/*
+ * REF-CHANGE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
+                         const uint8_t *buf, unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset - 1, 0, (MLKEM_Q - 1)))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value - 1, 0, (MLKEM_Q - 1)))
+);
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric-shake.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric-shake.c
new file mode 100644
index 0000000000..5dd8c10d92
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric-shake.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "fips202.h"
+#include "params.h"
+#include "symmetric.h"
+
+void mlkem_shake256_prf(uint8_t *out, size_t outlen,
+                        const uint8_t key[MLKEM_SYMBYTES], uint8_t nonce)
+{
+  uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, key, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+
+  shake256(out, outlen, extkey, sizeof(extkey));
+}
+
+void mlkem_shake256_rkprf(uint8_t out[MLKEM_SSBYTES],
+                          const uint8_t key[MLKEM_SYMBYTES],
+                          const uint8_t input[MLKEM_CIPHERTEXTBYTES])
+{
+  shake256incctx s;
+
+  shake256_inc_init(&s);
+  shake256_inc_absorb(&s, key, MLKEM_SYMBYTES);
+  shake256_inc_absorb(&s, input, MLKEM_CIPHERTEXTBYTES);
+  shake256_inc_finalize(&s);
+  shake256_inc_squeeze(out, MLKEM_SSBYTES, &s);
+  shake256_inc_ctx_release(&s);
+}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h
new file mode 100644
index 0000000000..202741a7b3
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SYMMETRIC_H
+#define SYMMETRIC_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+
+#include "fips202.h"
+
+#include "cbmc.h"
+
+#define mlkem_shake256_prf MLKEM_NAMESPACE(mlkem_shake256_prf)
+/*************************************************
+ * Name:        mlkem_shake256_prf
+ *
+ * Ref:         FIPS-203 Section 4.1. Function PRF (eq 4.3)
+ *
+ * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
+ *              and then generates outlen bytes of SHAKE256 output
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - size_t outlen: number of requested output bytes
+ *              - const uint8_t *key: pointer to the key (of length
+ *                MLKEM_SYMBYTES)
+ *              - uint8_t nonce: single-byte nonce (public PRF input)
+ *
+ *              out and key may NOT be aliased.
+ **************************************************/
+void mlkem_shake256_prf(uint8_t *out, size_t outlen,
+                        const uint8_t key[MLKEM_SYMBYTES], uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(out, outlen))
+  requires(memory_no_alias(key, MLKEM_SYMBYTES))
+  assigns(memory_slice(out, outlen))
+);
+
+#define mlkem_shake256_rkprf MLKEM_NAMESPACE(mlkem_shake256_rkprf)
+/*************************************************
+ * Name:        mlkem_shake256_rkprf
+ *
+ * Ref:         FIPS-203 Section 4.1. Hash function J
+ *
+ * Description: Usage of SHAKE256 as a PRF, concatenates key with input
+ *              and then generates MLKEM_SSBYTES bytes of SHAKE256 output
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - const uint8_t *key: pointer to the key (of length
+ *                MLKEM_SYMBYTES)
+ *              - const uint8_t *input: pointer to the input (of length
+ *                MLKEM_CIPHERTEXTBYTES)
+ *
+ *              out, key, and input may NOT be aliased.
+ **************************************************/
+void mlkem_shake256_rkprf(uint8_t out[MLKEM_SSBYTES],
+                          const uint8_t key[MLKEM_SYMBYTES],
+                          const uint8_t input[MLKEM_CIPHERTEXTBYTES])
+__contract__(
+  requires(memory_no_alias(out, MLKEM_SSBYTES))
+  requires(memory_no_alias(key, MLKEM_SYMBYTES))
+  requires(memory_no_alias(input, MLKEM_CIPHERTEXTBYTES))
+  assigns(memory_slice(out, MLKEM_SSBYTES))
+);
+
+
+/* Macros denoting FIPS-203 specific Hash functions */
+
+/* Hash function H, FIPS-203 4.1 (eq 4.4) */
+#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
+
+/* Hash function G, FIPS-203 4.1 (eq 4.5) */
+#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
+
+/* Macros denoting FIPS-203 specific PRFs */
+#define prf(OUT, OUTBYTES, KEY, NONCE) \
+  mlkem_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
+#define rkprf(OUT, KEY, INPUT) mlkem_shake256_rkprf(OUT, KEY, INPUT)
+
+#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c
new file mode 100644
index 0000000000..b5b71e023e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "verify.h"
+
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+/*
+ * Masking value used in constant-time functions from
+ * verify.h to block the compiler's range analysis and
+ * thereby reduce the risk of compiler-introduced branches.
+ */
+volatile uint64_t ct_opt_blocker_u64 = 0;
+
+#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+int empty_cu_verify;
+
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h
new file mode 100644
index 0000000000..5c62223c3d
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef VERIFY_H
+#define VERIFY_H
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "params.h"
+
+/* Constant-time comparisons and conditional operations
+
+   We reduce the risk for compilation into variable-time code
+   through the use of 'value barriers'.
+
+   Functionally, a value barrier is a no-op. To the compiler, however,
+   it constitutes an arbitrary modification of its input, and therefore
+   hinders value propagation and range analysis.
+
+   We consider two approaches to implement a value barrier:
+   - An empty inline asm block which marks the target value as clobbered.
+   - XOR'ing with the value of a volatile global that's set to 0.
+
+   The first approach is cheap because it only prevents the compiler
+   from reasoning about the value of the variable past the barrier,
+   but does not directly generate additional instructions.
+
+   The second approach generates redundant loads and XOR operations
+   and therefore comes at a higher runtime cost. However, it appears
+   more robust towards optimization, as compilers should never drop
+   a volatile load.
+
+   We use the empty-ASM value barrier for GCC and clang, and fall
+   back to the global volatile barrier otherwise.
+
+   The global value barrier can be forced by setting MLKEM_NO_ASM_VALUE_BARRIER.
+
+*/
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(CBMC) && \
+    !defined(MLKEM_NO_ASM_VALUE_BARRIER)
+#define MLKEM_USE_ASM_VALUE_BARRIER
+#endif
+
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+
+/*
+ * Declaration of global volatile that the global value barrier
+ * is loading from and masking with.
+ */
+#define ct_opt_blocker_u64 MLKEM_NAMESPACE(ct_opt_blocker_u64)
+extern volatile uint64_t ct_opt_blocker_u64;
+
+/* Helper functions for obtaining masks of various sizes */
+STATIC_INLINE_TESTABLE uint8_t get_optblocker_u8(void)
+__contract__(ensures(return_value == 0)) { return (uint8_t)ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE uint32_t get_optblocker_u32(void)
+__contract__(ensures(return_value == 0)) { return ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE int32_t get_optblocker_i32(void) /* signed zero mask */
+__contract__(ensures(return_value == 0)) { return (int32_t)ct_opt_blocker_u64; }
+
+STATIC_INLINE_TESTABLE uint32_t value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_u32()); }
+
+STATIC_INLINE_TESTABLE int32_t value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_i32()); }
+
+STATIC_INLINE_TESTABLE uint8_t value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b)) { return (b ^ get_optblocker_u8()); }
+
+#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+STATIC_INLINE_TESTABLE uint32_t value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b))
+{
+  asm("" : "+r"(b));
+  return b;
+}
+
+STATIC_INLINE_TESTABLE int32_t value_barrier_i32(int32_t b)
+__contract__(ensures(return_value == b))
+{
+  asm("" : "+r"(b));
+  return b;
+}
+
+STATIC_INLINE_TESTABLE uint8_t value_barrier_u8(uint8_t b)
+__contract__(ensures(return_value == b))
+{
+  asm("" : "+r"(b));
+  return b;
+}
+
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+
+/*
+ * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
+ * overflow, which is fully defined behaviour in C. It is thus safe to disable
+ * this warning.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+
+/*************************************************
+ * Name:        ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments:   uint16_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint16_t ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+  uint32_t tmp = value_barrier_u32(-((uint32_t)x));
+  tmp >>= 16;
+  return tmp;
+}
+
+/*************************************************
+ * Name:        ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments:   uint8_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+  uint32_t tmp = value_barrier_u32(-((uint32_t)x));
+  tmp >>= 24;
+  return tmp;
+}
+
+/* Put unsigned overflow warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*
+ * The ct_cmask_neg_i16 function below makes deliberate use of
+ * signed to unsigned integer conversion, which is fully defined
+ * behaviour in C. It is thus safe to disable this warning.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        ct_cmask_neg_i16
+ *
+ * Description: Return 0 if input is non-negative, and 0xFFFF (-1) otherwise.
+ *
+ * Arguments:   int16_t x: Value to be converted into a mask
+ **************************************************/
+STATIC_INLINE_TESTABLE uint16_t ct_cmask_neg_i16(int16_t x)
+__contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
+{
+  int32_t tmp = value_barrier_i32((int32_t)x);
+  tmp >>= 16; /* assumes sign-preserving (arithmetic) right shift */
+  return (int16_t)tmp;
+}
+
+/* Put conversion warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*
+ * The ct_sel_int16 function below makes deliberate use of unsigned
+ * to signed integer conversion, which is implementation-defined
+ * behaviour. Here, we assume that uint16_t -> int16_t is inverse
+ * to int16_t -> uint16_t.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        ct_sel_int16
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   int16_t a:       First alternative
+ *              int16_t b:       Second alternative
+ *              uint16_t cond:   Condition variable.
+ **************************************************/
+STATIC_INLINE_TESTABLE int16_t ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  uint16_t au = a, bu = b;
+  uint16_t res = bu ^ (ct_cmask_nonzero_u16(cond) & (au ^ bu));
+  return (int16_t)res;
+}
+
+/* Put unsigned-to-signed warnings in CBMC back into scope */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/*************************************************
+ * Name:        ct_sel_uint8
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   uint8_t a:       First alternative (chosen if cond != 0)
+ *              uint8_t b:       Second alternative (chosen if cond == 0)
+ *              uint8_t cond:    Condition variable.
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_sel_uint8(uint8_t a, uint8_t b, uint8_t cond)
+__contract__(ensures(return_value == (cond ? a : b)))
+{
+  return b ^ (ct_cmask_nonzero_u8(cond) & (a ^ b));
+}
+
+/*************************************************
+ * Name:        ct_memcmp
+ *
+ * Description: Compare two arrays for equality in constant time.
+ *
+ * Arguments:   const uint8_t *a: pointer to first byte array
+ *              const uint8_t *b: pointer to second byte array
+ *              size_t len:       length of the byte arrays (<= INT_MAX)
+ *
+ * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ **************************************************/
+STATIC_INLINE_TESTABLE uint8_t ct_memcmp(const uint8_t *a, const uint8_t *b,
+                                         const size_t len)
+__contract__(
+  requires(memory_no_alias(a, len))
+  requires(memory_no_alias(b, len))
+  requires(len <= INT_MAX)
+  ensures((return_value == 0) == forall(int, i, 0, ((int)len - 1), (a[i] == b[i]))))
+{
+  uint8_t r = 0, s = 0;
+
+  /*
+   * Switch to a _signed_ ilen value, so that our loop counter
+   * can also be signed, and thus (i - 1) in the loop invariant
+   * can yield -1 as required.
+   */
+  const int ilen = (int)len;
+  int i;
+
+  for (i = 0; i < ilen; i++)
+  __loop__(
+    invariant(i >= 0 && i <= ilen)
+    invariant((r == 0) == (forall(int, k, 0, (i - 1), (a[k] == b[k])))))
+  {
+    r |= a[i] ^ b[i];
+    /* s is useless, but prevents the loop from being aborted once r=0xff. */
+    s ^= a[i] ^ b[i];
+  }
+
+  /*
+   * - Convert r into a mask; this may not be necessary, but is an
+   *   additional safeguard against leaking information about
+   *   a and b.
+   * - XOR twice with s, separated by a value barrier, to prevent the compiler
+   *   from dropping the s computation in the loop.
+   */
+  return (value_barrier_u8(ct_cmask_nonzero_u8(r) ^ s) ^ s);
+}
+
+/*************************************************
+ * Name:        ct_cmov_zero
+ *
+ * Description: Copy len bytes from x to r if b is zero;
+ *              leave r unchanged if b is non-zero.
+ *              assumes two's complement representation of negative integers.
+ *              Runs in constant time.
+ *
+ * Arguments:   uint8_t *r:       pointer to output byte array
+ *              const uint8_t *x: pointer to input byte array
+ *              size_t len:       Amount of bytes to be copied
+ *              uint8_t b:        Condition value.
+ **************************************************/
+STATIC_INLINE_TESTABLE
+void ct_cmov_zero(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
+__contract__(
+  requires(memory_no_alias(r, len))
+  requires(memory_no_alias(x, len))
+  assigns(memory_slice(r, len)))
+{
+  size_t i;
+  for (i = 0; i < len; i++)
+  __loop__(invariant(i <= len))
+  {
+    r[i] = ct_sel_uint8(r[i], x[i], b); /* b ? r[i] : x[i] */
+  }
+}
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
new file mode 100644
index 0000000000..f52b2ff5ad
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogenerate_files.py
+ *          Do not modify it directly.
+ */
+
+#include "ntt.h"
+
+/*
+ * Table of zeta values used in the reference NTT and inverse NTT.
+ * See autogenerate_files.py for details.
+ */
+const int16_t zetas[128] = {
+    -1044, -758,  -359,  -1517, 1493,  1422,  287,   202,  -171,  622,   1577,
+    182,   962,   -1202, -1474, 1468,  573,   -1325, 264,  383,   -829,  1458,
+    -1602, -130,  -681,  1017,  732,   608,   -1542, 411,  -205,  -1571, 1223,
+    652,   -552,  1015,  -1293, 1491,  -282,  -1544, 516,  -8,    -320,  -666,
+    -1618, -1162, 126,   1469,  -853,  -90,   -271,  830,  107,   -1421, -247,
+    -951,  -398,  961,   -1508, -725,  448,   -1065, 677,  -1275, -1103, 430,
+    555,   843,   -1251, 871,   1550,  105,   422,   587,  177,   -235,  -291,
+    -460,  1574,  1653,  -246,  778,   1159,  -147,  -777, 1483,  -602,  1119,
+    -1590, 644,   -872,  349,   418,   329,   -156,  -75,  817,   1097,  603,
+    610,   1322,  -1285, -1465, 384,   -1215, -136,  1218, -1335, -874,  220,
+    -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
+    -308,  996,   991,   958,   -1460, 1522,  1628,
+};
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/align.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/align.h
deleted file mode 100644
index 3463866f37..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/align.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef ALIGN_H
-#define ALIGN_H
-
-#include <stdint.h>
-#include <immintrin.h>
-
-#define ALIGNED_UINT8(N)        \
-    union {                     \
-        uint8_t coeffs[N];      \
-        __m256i vec[(N+31)/32]; \
-    }
-
-#define ALIGNED_INT16(N)        \
-    union {                     \
-        int16_t coeffs[N];      \
-        __m256i vec[(N+15)/16]; \
-    }
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/api.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/api.h
deleted file mode 100644
index a154e80f1d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/api.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stdint.h>
-
-#define pqcrystals_kyber512_SECRETKEYBYTES 1632
-#define pqcrystals_kyber512_PUBLICKEYBYTES 800
-#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
-#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber512_ENCCOINBYTES 32
-#define pqcrystals_kyber512_BYTES 32
-
-#define pqcrystals_kyber512_avx2_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
-#define pqcrystals_kyber512_avx2_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
-#define pqcrystals_kyber512_avx2_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
-#define pqcrystals_kyber512_avx2_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
-#define pqcrystals_kyber512_avx2_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
-#define pqcrystals_kyber512_avx2_BYTES pqcrystals_kyber512_BYTES
-
-int pqcrystals_kyber512_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber512_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber512_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber768_SECRETKEYBYTES 2400
-#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
-#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
-#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber768_ENCCOINBYTES 32
-#define pqcrystals_kyber768_BYTES 32
-
-#define pqcrystals_kyber768_avx2_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
-#define pqcrystals_kyber768_avx2_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
-#define pqcrystals_kyber768_avx2_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
-#define pqcrystals_kyber768_avx2_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
-#define pqcrystals_kyber768_avx2_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
-#define pqcrystals_kyber768_avx2_BYTES pqcrystals_kyber768_BYTES
-
-int pqcrystals_kyber768_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber768_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber768_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
-#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
-#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
-#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber1024_ENCCOINBYTES 32
-#define pqcrystals_kyber1024_BYTES 32
-
-#define pqcrystals_kyber1024_avx2_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
-#define pqcrystals_kyber1024_avx2_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
-#define pqcrystals_kyber1024_avx2_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
-#define pqcrystals_kyber1024_avx2_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
-#define pqcrystals_kyber1024_avx2_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
-#define pqcrystals_kyber1024_avx2_BYTES pqcrystals_kyber1024_BYTES
-
-int pqcrystals_kyber1024_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber1024_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber1024_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/basemul.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/basemul.S
deleted file mode 100644
index 36990639b2..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/basemul.S
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "consts.h"
-
-.macro schoolbook off
-vmovdqa		_16XQINV*2(%rcx),%ymm0
-vmovdqa		(64*\off+ 0)*2(%rsi),%ymm1		# a0
-vmovdqa		(64*\off+16)*2(%rsi),%ymm2		# b0
-vmovdqa		(64*\off+32)*2(%rsi),%ymm3		# a1
-vmovdqa		(64*\off+48)*2(%rsi),%ymm4		# b1
-
-vpmullw		%ymm0,%ymm1,%ymm9			# a0.lo
-vpmullw		%ymm0,%ymm2,%ymm10			# b0.lo
-vpmullw		%ymm0,%ymm3,%ymm11			# a1.lo
-vpmullw		%ymm0,%ymm4,%ymm12			# b1.lo
-
-vmovdqa		(64*\off+ 0)*2(%rdx),%ymm5		# c0
-vmovdqa		(64*\off+16)*2(%rdx),%ymm6		# d0
-
-vpmulhw		%ymm5,%ymm1,%ymm13			# a0c0.hi
-vpmulhw		%ymm6,%ymm1,%ymm1			# a0d0.hi
-vpmulhw		%ymm5,%ymm2,%ymm14			# b0c0.hi
-vpmulhw		%ymm6,%ymm2,%ymm2			# b0d0.hi
-
-vmovdqa		(64*\off+32)*2(%rdx),%ymm7		# c1
-vmovdqa		(64*\off+48)*2(%rdx),%ymm8		# d1
-
-vpmulhw		%ymm7,%ymm3,%ymm15			# a1c1.hi
-vpmulhw		%ymm8,%ymm3,%ymm3			# a1d1.hi
-vpmulhw		%ymm7,%ymm4,%ymm0			# b1c1.hi
-vpmulhw		%ymm8,%ymm4,%ymm4			# b1d1.hi
-
-vmovdqa		%ymm13,(%rsp)
-
-vpmullw		%ymm5,%ymm9,%ymm13			# a0c0.lo
-vpmullw		%ymm6,%ymm9,%ymm9			# a0d0.lo
-vpmullw		%ymm5,%ymm10,%ymm5			# b0c0.lo
-vpmullw		%ymm6,%ymm10,%ymm10			# b0d0.lo
-
-vpmullw		%ymm7,%ymm11,%ymm6			# a1c1.lo
-vpmullw		%ymm8,%ymm11,%ymm11			# a1d1.lo
-vpmullw		%ymm7,%ymm12,%ymm7			# b1c1.lo
-vpmullw		%ymm8,%ymm12,%ymm12			# b1d1.lo
-
-vmovdqa		_16XQ*2(%rcx),%ymm8
-vpmulhw		%ymm8,%ymm13,%ymm13
-vpmulhw		%ymm8,%ymm9,%ymm9
-vpmulhw		%ymm8,%ymm5,%ymm5
-vpmulhw		%ymm8,%ymm10,%ymm10
-vpmulhw		%ymm8,%ymm6,%ymm6
-vpmulhw		%ymm8,%ymm11,%ymm11
-vpmulhw		%ymm8,%ymm7,%ymm7
-vpmulhw		%ymm8,%ymm12,%ymm12
-
-vpsubw		(%rsp),%ymm13,%ymm13			# -a0c0
-vpsubw		%ymm9,%ymm1,%ymm9			# a0d0
-vpsubw		%ymm5,%ymm14,%ymm5			# b0c0
-vpsubw		%ymm10,%ymm2,%ymm10			# b0d0
-
-vpsubw		%ymm6,%ymm15,%ymm6			# a1c1
-vpsubw		%ymm11,%ymm3,%ymm11			# a1d1
-vpsubw		%ymm7,%ymm0,%ymm7			# b1c1
-vpsubw		%ymm12,%ymm4,%ymm12			# b1d1
-
-vmovdqa		(%r9),%ymm0
-vmovdqa		32(%r9),%ymm1
-vpmullw		%ymm0,%ymm10,%ymm2
-vpmullw		%ymm0,%ymm12,%ymm3
-vpmulhw		%ymm1,%ymm10,%ymm10
-vpmulhw		%ymm1,%ymm12,%ymm12
-vpmulhw		%ymm8,%ymm2,%ymm2
-vpmulhw		%ymm8,%ymm3,%ymm3
-vpsubw		%ymm2,%ymm10,%ymm10			# rb0d0
-vpsubw		%ymm3,%ymm12,%ymm12			# rb1d1
-
-vpaddw		%ymm5,%ymm9,%ymm9
-vpaddw		%ymm7,%ymm11,%ymm11
-vpsubw		%ymm13,%ymm10,%ymm13
-vpsubw		%ymm12,%ymm6,%ymm6
-
-vmovdqa		%ymm13,(64*\off+ 0)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+16)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+32)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+48)*2(%rdi)
-.endm
-
-.text
-.global cdecl(basemul_avx)
-cdecl(basemul_avx):
-mov		%rsp,%r8
-and		$-32,%rsp
-sub		$32,%rsp
-
-lea		(_ZETAS_EXP+176)*2(%rcx),%r9
-schoolbook	0
-
-add		$32*2,%r9
-schoolbook	1
-
-add		$192*2,%r9
-schoolbook	2
-
-add		$32*2,%r9
-schoolbook	3
-
-mov		%r8,%rsp
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.c
deleted file mode 100644
index dad473c79e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.c
+++ /dev/null
@@ -1,144 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include "params.h"
-#include "cbd.h"
-
-/*************************************************
-* Name:        cbd2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const __m256i *buf: pointer to aligned input byte array
-**************************************************/
-static void cbd2(poly * restrict r, const __m256i buf[2*KYBER_N/128])
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i mask55 = _mm256_set1_epi32(0x55555555);
-  const __m256i mask33 = _mm256_set1_epi32(0x33333333);
-  const __m256i mask03 = _mm256_set1_epi32(0x03030303);
-  const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);
-
-  for(i = 0; i < KYBER_N/64; i++) {
-    f0 = _mm256_load_si256(&buf[i]);
-
-    f1 = _mm256_srli_epi16(f0, 1);
-    f0 = _mm256_and_si256(mask55, f0);
-    f1 = _mm256_and_si256(mask55, f1);
-    f0 = _mm256_add_epi8(f0, f1);
-
-    f1 = _mm256_srli_epi16(f0, 2);
-    f0 = _mm256_and_si256(mask33, f0);
-    f1 = _mm256_and_si256(mask33, f1);
-    f0 = _mm256_add_epi8(f0, mask33);
-    f0 = _mm256_sub_epi8(f0, f1);
-
-    f1 = _mm256_srli_epi16(f0, 4);
-    f0 = _mm256_and_si256(mask0F, f0);
-    f1 = _mm256_and_si256(mask0F, f1);
-    f0 = _mm256_sub_epi8(f0, mask03);
-    f1 = _mm256_sub_epi8(f1, mask03);
-
-    f2 = _mm256_unpacklo_epi8(f0, f1);
-    f3 = _mm256_unpackhi_epi8(f0, f1);
-
-    f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
-    f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1));
-    f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
-    f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1));
-
-    _mm256_store_si256(&r->vec[4*i+0], f0);
-    _mm256_store_si256(&r->vec[4*i+1], f2);
-    _mm256_store_si256(&r->vec[4*i+2], f1);
-    _mm256_store_si256(&r->vec[4*i+3], f3);
-  }
-}
-
-#if KYBER_ETA1 == 3
-/*************************************************
-* Name:        cbd3
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=3
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const __m256i *buf: pointer to aligned input byte array
-**************************************************/
-static void cbd3(poly * restrict r, const uint8_t buf[3*KYBER_N/4+8])
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i mask249 = _mm256_set1_epi32(0x249249);
-  const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
-  const __m256i mask07 = _mm256_set1_epi32(7);
-  const __m256i mask70 = _mm256_set1_epi32(7 << 16);
-  const __m256i mask3 = _mm256_set1_epi16(3);
-  const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4,
-                                           -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0);
-
-  for(i = 0; i < KYBER_N/32; i++) {
-    f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]);
-    f0 = _mm256_permute4x64_epi64(f0,0x94);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-
-    f1 = _mm256_srli_epi32(f0,1);
-    f2 = _mm256_srli_epi32(f0,2);
-    f0 = _mm256_and_si256(mask249,f0);
-    f1 = _mm256_and_si256(mask249,f1);
-    f2 = _mm256_and_si256(mask249,f2);
-    f0 = _mm256_add_epi32(f0,f1);
-    f0 = _mm256_add_epi32(f0,f2);
-
-    f1 = _mm256_srli_epi32(f0,3);
-    f0 = _mm256_add_epi32(f0,mask6DB);
-    f0 = _mm256_sub_epi32(f0,f1);
-
-    f1 = _mm256_slli_epi32(f0,10);
-    f2 = _mm256_srli_epi32(f0,12);
-    f3 = _mm256_srli_epi32(f0, 2);
-    f0 = _mm256_and_si256(f0,mask07);
-    f1 = _mm256_and_si256(f1,mask70);
-    f2 = _mm256_and_si256(f2,mask07);
-    f3 = _mm256_and_si256(f3,mask70);
-    f0 = _mm256_add_epi16(f0,f1);
-    f1 = _mm256_add_epi16(f2,f3);
-    f0 = _mm256_sub_epi16(f0,mask3);
-    f1 = _mm256_sub_epi16(f1,mask3);
-
-    f2 = _mm256_unpacklo_epi32(f0,f1);
-    f3 = _mm256_unpackhi_epi32(f0,f1);
-
-    f0 = _mm256_permute2x128_si256(f2,f3,0x20);
-    f1 = _mm256_permute2x128_si256(f2,f3,0x31);
-
-    _mm256_store_si256(&r->vec[2*i+0], f0);
-    _mm256_store_si256(&r->vec[2*i+1], f1);
-  }
-}
-#endif
-
-/* buf 32 bytes longer for cbd3 */
-void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1])
-{
-#if KYBER_ETA1 == 2
-  cbd2(r, buf);
-#elif KYBER_ETA1 == 3
-  cbd3(r, (uint8_t *)buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
-
-void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128])
-{
-#if KYBER_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.h
deleted file mode 100644
index 05788e06b4..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/cbd.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include <immintrin.h>
-#include "params.h"
-#include "poly.h"
-
-#define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
-void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1]);
-
-#define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
-void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.c
deleted file mode 100644
index 84e596893d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.c
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "align.h"
-#include "params.h"
-#include "consts.h"
-
-#define Q KYBER_Q
-#define MONT -1044 // 2^16 mod q
-#define QINV -3327 // q^-1 mod 2^16
-#define V 20159 // floor(2^26/q + 0.5)
-#define FHI 1441 // mont^2/128
-#define FLO -10079 // qinv*FHI
-#define MONTSQHI 1353 // mont^2
-#define MONTSQLO 20553 // qinv*MONTSQHI
-#define MASK 4095
-#define SHIFT 32
-
-const qdata_t qdata = {{
-#define _16XQ 0
-  Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,
-
-#define _16XQINV 16
-  QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
-  QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
-
-#define _16XV 32
-  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
-
-#define _16XFLO 48
-  FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
-  FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
-
-#define _16XFHI 64
-  FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
-  FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
-
-#define _16XMONTSQLO 80
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-
-#define _16XMONTSQHI 96
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-
-#define _16XMASK 112
-  MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
-  MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
-
-#define _REVIDXB 128
-  3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
-  3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
-
-#define _REVIDXD 144
-  7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,
-
-#define _ZETAS_EXP 160
-   31498,  31498,  31498,  31498,   -758,   -758,   -758,   -758,
-    5237,   5237,   5237,   5237,   1397,   1397,   1397,   1397,
-   14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
-   14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
-    -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
-    -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
-   13525,  13525,  13525,  13525,  13525,  13525,  13525,  13525,
-  -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
-    1493,   1493,   1493,   1493,   1493,   1493,   1493,   1493,
-    1422,   1422,   1422,   1422,   1422,   1422,   1422,   1422,
-  -20907, -20907, -20907, -20907,  27758,  27758,  27758,  27758,
-   -3799,  -3799,  -3799,  -3799, -15690, -15690, -15690, -15690,
-    -171,   -171,   -171,   -171,    622,    622,    622,    622,
-    1577,   1577,   1577,   1577,    182,    182,    182,    182,
-   -5827,  -5827,  17363,  17363, -26360, -26360, -29057, -29057,
-    5571,   5571,  -1102,  -1102,  21438,  21438, -26242, -26242,
-     573,    573,  -1325,  -1325,    264,    264,    383,    383,
-    -829,   -829,   1458,   1458,  -1602,  -1602,   -130,   -130,
-   -5689,  -6516,   1496,  30967, -23565,  20179,  20710,  25080,
-  -12796,  26616,  16064, -12442,   9134,   -650, -25986,  27837,
-    1223,    652,   -552,   1015,  -1293,   1491,   -282,  -1544,
-     516,     -8,   -320,   -666,  -1618,  -1162,    126,   1469,
-    -335, -11477, -32227,  20494, -27738,    945, -14883,   6182,
-   32010,  10631,  29175, -28762, -18486,  17560, -14430,  -5276,
-   -1103,    555,  -1251,   1550,    422,    177,   -291,   1574,
-    -246,   1159,   -777,   -602,  -1590,   -872,    418,   -156,
-   11182,  13387, -14233, -21655,  13131,  -4587,  23092,   5493,
-  -32502,  30317, -18741,  12639,  20100,  18525,  19529, -12619,
-     430,    843,    871,    105,    587,   -235,   -460,   1653,
-     778,   -147,   1483,   1119,    644,    349,    329,    -75,
-     787,    787,    787,    787,    787,    787,    787,    787,
-     787,    787,    787,    787,    787,    787,    787,    787,
-   -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
-   -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
-   28191,  28191,  28191,  28191,  28191,  28191,  28191,  28191,
-  -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
-     287,    287,    287,    287,    287,    287,    287,    287,
-     202,    202,    202,    202,    202,    202,    202,    202,
-   10690,  10690,  10690,  10690,   1358,   1358,   1358,   1358,
-  -11202, -11202, -11202, -11202,  31164,  31164,  31164,  31164,
-     962,    962,    962,    962,  -1202,  -1202,  -1202,  -1202,
-   -1474,  -1474,  -1474,  -1474,   1468,   1468,   1468,   1468,
-  -28073, -28073,  24313,  24313, -10532, -10532,   8800,   8800,
-   18426,  18426,   8859,   8859,  26675,  26675, -16163, -16163,
-    -681,   -681,   1017,   1017,    732,    732,    608,    608,
-   -1542,  -1542,    411,    411,   -205,   -205,  -1571,  -1571,
-   19883, -28250, -15887,  -8898, -28309,   9075, -30199,  18249,
-   13426,  14017, -29156, -12757,  16832,   4311, -24155, -17915,
-    -853,    -90,   -271,    830,    107,  -1421,   -247,   -951,
-    -398,    961,  -1508,   -725,    448,  -1065,    677,  -1275,
-  -31183,  25435,  -7382,  24391, -20927,  10946,  24214,  16989,
-   10335,  -7934, -22502,  10906,  31636,  28644,  23998, -17422,
-     817,    603,   1322,  -1465,  -1215,   1218,   -874,  -1187,
-   -1185,  -1278,  -1510,   -870,   -108,    996,    958,   1522,
-   20297,   2146,  15355, -32384,  -6280, -14903, -11044,  14469,
-  -21498, -20198,  23210, -17442, -23860, -20257,   7756,  23132,
-    1097,    610,  -1285,    384,   -136,  -1335,    220,  -1659,
-   -1530,    794,   -854,    478,   -308,    991,  -1460,   1628,
-
-#define _16XSHIFT 624
-  SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
-  SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
-}};
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.h
deleted file mode 100644
index f95899cd8e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/consts.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef CONSTS_H
-#define CONSTS_H
-
-#include "params.h"
-
-#define _16XQ            0
-#define _16XQINV        16
-#define _16XV           32
-#define _16XFLO         48
-#define _16XFHI         64
-#define _16XMONTSQLO    80
-#define _16XMONTSQHI    96
-#define _16XMASK       112
-#define _REVIDXB       128
-#define _REVIDXD       144
-#define _ZETAS_EXP     160
-#define	_16XSHIFT      624
-
-/* The C ABI on MacOS exports all symbols with a leading
- * underscore. This means that any symbols we refer to from
- * C files (functions) can't be found, and all symbols we
- * refer to from ASM also can't be found.
- *
- * This define helps us get around this
- */
-#ifdef __ASSEMBLER__
-#if defined(__WIN32__) || defined(__APPLE__)
-#define decorate(s) _##s
-#define cdecl2(s) decorate(s)
-#define cdecl(s) cdecl2(KYBER_NAMESPACE(##s))
-#else
-#define cdecl(s) KYBER_NAMESPACE(##s)
-#endif
-#endif
-
-#ifndef __ASSEMBLER__
-#include "align.h"
-typedef ALIGNED_INT16(640) qdata_t;
-#define qdata KYBER_NAMESPACE(qdata)
-extern const qdata_t qdata;
-#endif
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.S
deleted file mode 100644
index 3bb1ebd3d8..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.S
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "consts.h"
-.include "fq.inc"
-
-.text
-reduce128_avx:
-#load
-vmovdqa		(%rdi),%ymm2
-vmovdqa		32(%rdi),%ymm3
-vmovdqa		64(%rdi),%ymm4
-vmovdqa		96(%rdi),%ymm5
-vmovdqa		128(%rdi),%ymm6
-vmovdqa		160(%rdi),%ymm7
-vmovdqa		192(%rdi),%ymm8
-vmovdqa		224(%rdi),%ymm9
-
-red16		2
-red16		3
-red16		4
-red16		5
-red16		6
-red16		7
-red16		8
-red16		9
-
-#store
-vmovdqa		%ymm2,(%rdi)
-vmovdqa		%ymm3,32(%rdi)
-vmovdqa		%ymm4,64(%rdi)
-vmovdqa		%ymm5,96(%rdi)
-vmovdqa		%ymm6,128(%rdi)
-vmovdqa		%ymm7,160(%rdi)
-vmovdqa		%ymm8,192(%rdi)
-vmovdqa		%ymm9,224(%rdi)
-
-ret
-
-.global cdecl(reduce_avx)
-cdecl(reduce_avx):
-#consts
-vmovdqa		_16XQ*2(%rsi),%ymm0
-vmovdqa		_16XV*2(%rsi),%ymm1
-call		reduce128_avx
-add		$256,%rdi
-call		reduce128_avx
-ret
-
-tomont128_avx:
-#load
-vmovdqa		(%rdi),%ymm3
-vmovdqa		32(%rdi),%ymm4
-vmovdqa		64(%rdi),%ymm5
-vmovdqa		96(%rdi),%ymm6
-vmovdqa		128(%rdi),%ymm7
-vmovdqa		160(%rdi),%ymm8
-vmovdqa		192(%rdi),%ymm9
-vmovdqa		224(%rdi),%ymm10
-
-fqmulprecomp	1,2,3,11
-fqmulprecomp	1,2,4,12
-fqmulprecomp	1,2,5,13
-fqmulprecomp	1,2,6,14
-fqmulprecomp	1,2,7,15
-fqmulprecomp	1,2,8,11
-fqmulprecomp	1,2,9,12
-fqmulprecomp	1,2,10,13
-
-#store
-vmovdqa		%ymm3,(%rdi)
-vmovdqa		%ymm4,32(%rdi)
-vmovdqa		%ymm5,64(%rdi)
-vmovdqa		%ymm6,96(%rdi)
-vmovdqa		%ymm7,128(%rdi)
-vmovdqa		%ymm8,160(%rdi)
-vmovdqa		%ymm9,192(%rdi)
-vmovdqa		%ymm10,224(%rdi)
-
-ret
-
-.global cdecl(tomont_avx)
-cdecl(tomont_avx):
-#consts
-vmovdqa		_16XQ*2(%rsi),%ymm0
-vmovdqa		_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa		_16XMONTSQHI*2(%rsi),%ymm2
-call		tomont128_avx
-add		$256,%rdi
-call		tomont128_avx
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.inc b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.inc
deleted file mode 100644
index 4b7afc3118..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/fq.inc
+++ /dev/null
@@ -1,30 +0,0 @@
-.macro red16 r,rs=0,x=12
-vpmulhw         %ymm1,%ymm\r,%ymm\x
-.if \rs
-vpmulhrsw	%ymm\rs,%ymm\x,%ymm\x
-.else
-vpsraw          $10,%ymm\x,%ymm\x
-.endif
-vpmullw         %ymm0,%ymm\x,%ymm\x
-vpsubw          %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro csubq r,x=12
-vpsubw		%ymm0,%ymm\r,%ymm\r
-vpsraw		$15,%ymm\r,%ymm\x
-vpand		%ymm0,%ymm\x,%ymm\x
-vpaddw		%ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro caddq r,x=12
-vpsraw		$15,%ymm\r,%ymm\x
-vpand		%ymm0,%ymm\x,%ymm\x
-vpaddw		%ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro fqmulprecomp al,ah,b,x=12
-vpmullw		%ymm\al,%ymm\b,%ymm\x
-vpmulhw		%ymm\ah,%ymm\b,%ymm\b
-vpmulhw		%ymm0,%ymm\x,%ymm\x
-vpsubw		%ymm\x,%ymm\b,%ymm\b
-.endm
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.c
deleted file mode 100644
index c4b2b3a89f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.c
+++ /dev/null
@@ -1,568 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "align.h"
-#include "params.h"
-#include "indcpa.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "cbd.h"
-#include "rejsample.h"
-#include "symmetric.h"
-#include "randombytes.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Serialize the public key as concatenation of the
-*              serialized vector of polynomials pk and the
-*              public seed used to generate the matrix A.
-*              The polynomial coefficients in pk are assumed to
-*              lie in the invertal [0,q], i.e. pk must be reduced
-*              by polyvec_reduce().
-*
-* Arguments:   uint8_t *r: pointer to the output serialized public key
-*              polyvec *pk: pointer to the input public-key polyvec
-*              const uint8_t *seed: pointer to the input public seed
-**************************************************/
-static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
-                    polyvec *pk,
-                    const uint8_t seed[KYBER_SYMBYTES])
-{
-  polyvec_tobytes(r, pk);
-  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: De-serialize public key from a byte array;
-*              approximate inverse of pack_pk
-*
-* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
-*              - uint8_t *seed: pointer to output seed to generate matrix A
-*              - const uint8_t *packedpk: pointer to input serialized public key
-**************************************************/
-static void unpack_pk(polyvec *pk,
-                      uint8_t seed[KYBER_SYMBYTES],
-                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
-{
-  polyvec_frombytes(pk, packedpk);
-  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Serialize the secret key.
-*              The polynomial coefficients in sk are assumed to
-*              lie in the invertal [0,q], i.e. sk must be reduced
-*              by polyvec_reduce().
-*
-* Arguments:   - uint8_t *r: pointer to output serialized secret key
-*              - polyvec *sk: pointer to input vector of polynomials (secret key)
-**************************************************/
-static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
-{
-  polyvec_tobytes(r, sk);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: De-serialize the secret key; inverse of pack_sk
-*
-* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
-*              - const uint8_t *packedsk: pointer to input serialized secret key
-**************************************************/
-static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec_frombytes(sk, packedsk);
-}
-
-/*************************************************
-* Name:        pack_ciphertext
-*
-* Description: Serialize the ciphertext as concatenation of the
-*              compressed and serialized vector of polynomials b
-*              and the compressed and serialized polynomial v.
-*              The polynomial coefficients in b and v are assumed to
-*              lie in the invertal [0,q], i.e. b and v must be reduced
-*              by polyvec_reduce() and poly_reduce(), respectively.
-*
-* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
-*              poly *pk: pointer to the input vector of polynomials b
-*              poly *v: pointer to the input polynomial v
-**************************************************/
-static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
-{
-  polyvec_compress(r, b);
-  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        unpack_ciphertext
-*
-* Description: De-serialize and decompress ciphertext from a byte array;
-*              approximate inverse of pack_ciphertext
-*
-* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
-*              - poly *v: pointer to the output polynomial v
-*              - const uint8_t *c: pointer to the input serialized ciphertext
-**************************************************/
-static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
-{
-  polyvec_decompress(b, c);
-  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-}
-
-/*************************************************
-* Name:        rej_uniform
-*
-* Description: Run rejection sampling on uniform random bytes to generate
-*              uniform random integers mod q
-*
-* Arguments:   - int16_t *r: pointer to output array
-*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
-*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
-*              - unsigned int buflen: length of input buffer in bytes
-*
-* Returns number of sampled 16-bit integers (at most len)
-**************************************************/
-static unsigned int rej_uniform(int16_t *r,
-                                unsigned int len,
-                                const uint8_t *buf,
-                                unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos <= buflen - 3) {  // buflen is always at least 3
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(ctr < len && val1 < KYBER_Q)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
-
-#define gen_a(A,B)  gen_matrix(A,B,0)
-#define gen_at(A,B) gen_matrix(A,B,1)
-
-/*************************************************
-* Name:        gen_matrix
-*
-* Description: Deterministically generate matrix A (or the transpose of A)
-*              from a seed. Entries of the matrix are polynomials that look
-*              uniformly random. Performs rejection sampling on output of
-*              a XOF
-*
-* Arguments:   - polyvec *a: pointer to ouptput matrix A
-*              - const uint8_t *seed: pointer to input seed
-*              - int transposed: boolean deciding whether A or A^T is generated
-**************************************************/
-#if KYBER_K == 2
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 0;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 1;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 1;
-  }
-  else {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 0;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 1;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 1;
-  }
-
-  shake128x4_inc_init(&state);
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[1].vec[0].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[1].vec[1].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-
-  poly_nttunpack(&a[0].vec[0]);
-  poly_nttunpack(&a[0].vec[1]);
-  poly_nttunpack(&a[1].vec[0]);
-  poly_nttunpack(&a[1].vec[1]);
-  shake128x4_inc_ctx_release(&state);
-}
-#elif KYBER_K == 3
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-  shake128incctx state1x;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 0;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 2;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 0;
-  }
-  else {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 0;
-    buf[2].coeffs[32] = 2;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 0;
-    buf[3].coeffs[33] = 1;
-  }
-
-  shake128x4_inc_init(&state);
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[0].vec[2].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[1].vec[0].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-
-  poly_nttunpack(&a[0].vec[0]);
-  poly_nttunpack(&a[0].vec[1]);
-  poly_nttunpack(&a[0].vec[2]);
-  poly_nttunpack(&a[1].vec[0]);
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 1;
-    buf[0].coeffs[33] = 1;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 2;
-    buf[2].coeffs[32] = 2;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 2;
-    buf[3].coeffs[33] = 1;
-  }
-  else {
-    buf[0].coeffs[32] = 1;
-    buf[0].coeffs[33] = 1;
-    buf[1].coeffs[32] = 2;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 2;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 2;
-  }
-
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[1].vec[1].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[1].vec[2].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[2].vec[0].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[2].vec[1].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[1].vec[1].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-  shake128x4_inc_ctx_release(&state);
-
-  poly_nttunpack(&a[1].vec[1]);
-  poly_nttunpack(&a[1].vec[2]);
-  poly_nttunpack(&a[2].vec[0]);
-  poly_nttunpack(&a[2].vec[1]);
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  buf[0].coeffs[32] = 2;
-  buf[0].coeffs[33] = 2;
-
-  shake128_inc_init(&state1x);
-  shake128_absorb_once(&state1x, buf[0].coeffs, 34);
-  shake128_squeezeblocks(buf[0].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state1x);
-  ctr0 = rej_uniform_avx(a[2].vec[2].coeffs, buf[0].coeffs);
-  while(ctr0 < KYBER_N) {
-    shake128_squeezeblocks(buf[0].coeffs, 1, &state1x);
-    ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-  }
-  shake128_inc_ctx_release(&state1x);
-
-  poly_nttunpack(&a[2].vec[2]);
-}
-#elif KYBER_K == 4
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int i, ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-  shake128x4_inc_init(&state);
-
-  for(i=0;i<4;i++) {
-    f = _mm256_loadu_si256((__m256i *)seed);
-    _mm256_store_si256(buf[0].vec, f);
-    _mm256_store_si256(buf[1].vec, f);
-    _mm256_store_si256(buf[2].vec, f);
-    _mm256_store_si256(buf[3].vec, f);
-
-    if(transposed) {
-      buf[0].coeffs[32] = i;
-      buf[0].coeffs[33] = 0;
-      buf[1].coeffs[32] = i;
-      buf[1].coeffs[33] = 1;
-      buf[2].coeffs[32] = i;
-      buf[2].coeffs[33] = 2;
-      buf[3].coeffs[32] = i;
-      buf[3].coeffs[33] = 3;
-    }
-    else {
-      buf[0].coeffs[32] = 0;
-      buf[0].coeffs[33] = i;
-      buf[1].coeffs[32] = 1;
-      buf[1].coeffs[33] = i;
-      buf[2].coeffs[32] = 2;
-      buf[2].coeffs[33] = i;
-      buf[3].coeffs[32] = 3;
-      buf[3].coeffs[33] = i;
-    }
-
-    shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-    ctr0 = rej_uniform_avx(a[i].vec[0].coeffs, buf[0].coeffs);
-    ctr1 = rej_uniform_avx(a[i].vec[1].coeffs, buf[1].coeffs);
-    ctr2 = rej_uniform_avx(a[i].vec[2].coeffs, buf[2].coeffs);
-    ctr3 = rej_uniform_avx(a[i].vec[3].coeffs, buf[3].coeffs);
-
-    while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-      shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-      ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-      ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-      ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-      ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-    }
-
-    poly_nttunpack(&a[i].vec[0]);
-    poly_nttunpack(&a[i].vec[1]);
-    poly_nttunpack(&a[i].vec[2]);
-    poly_nttunpack(&a[i].vec[3]);
-  }
-  shake128x4_inc_ctx_release(&state);
-}
-#endif
-
-/*************************************************
-* Name:        indcpa_keypair_derand
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                             (of length KYBER_SYMBYTES bytes)
-**************************************************/
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  const uint8_t *publicseed = buf;
-  const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
-  polyvec a[KYBER_K], e, pkpv, skpv;
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-  buf[KYBER_SYMBYTES] = KYBER_K;
-  hash_g(buf, buf, KYBER_SYMBYTES+1);
-
-  gen_a(a, publicseed);
-
-#if KYBER_K == 2
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3);
-#elif KYBER_K == 3
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, skpv.vec+2, e.vec+0, noiseseed, 0, 1, 2, 3);
-  poly_getnoise_eta1_4x(e.vec+1, e.vec+2, pkpv.vec+0, pkpv.vec+1, noiseseed, 4, 5, 6, 7);
-#elif KYBER_K == 4
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, skpv.vec+2, skpv.vec+3, noiseseed,  0, 1, 2, 3);
-  poly_getnoise_eta1_4x(e.vec+0, e.vec+1, e.vec+2, e.vec+3, noiseseed, 4, 5, 6, 7);
-#endif
-
-  polyvec_ntt(&skpv);
-  polyvec_reduce(&skpv);
-  polyvec_ntt(&e);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++) {
-    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
-    poly_tomont(&pkpv.vec[i]);
-  }
-
-  polyvec_add(&pkpv, &pkpv, &e);
-  polyvec_reduce(&pkpv);
-
-  pack_sk(sk, &skpv);
-  pack_pk(pk, &pkpv, publicseed);
-}
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *c: pointer to output ciphertext
-*                            (of length KYBER_INDCPA_BYTES bytes)
-*              - const uint8_t *m: pointer to input message
-*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
-*              - const uint8_t *coins: pointer to input random coins used as seed
-*                                      (of length KYBER_SYMBYTES) to deterministically
-*                                      generate all randomness
-**************************************************/
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t seed[KYBER_SYMBYTES];
-  polyvec sp, pkpv, ep, at[KYBER_K], b;
-  poly v, k, epp;
-
-  unpack_pk(&pkpv, seed, pk);
-  poly_frommsg(&k, m);
-  gen_at(at, seed);
-
-#if KYBER_K == 2
-  poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3);
-  poly_getnoise_eta2(&epp, coins, 4);
-#elif KYBER_K == 3
-  poly_getnoise_eta1_4x(sp.vec+0, sp.vec+1, sp.vec+2, ep.vec+0, coins, 0, 1, 2 ,3);
-  poly_getnoise_eta1_4x(ep.vec+1, ep.vec+2, &epp, b.vec+0, coins,  4, 5, 6, 7);
-#elif KYBER_K == 4
-  poly_getnoise_eta1_4x(sp.vec+0, sp.vec+1, sp.vec+2, sp.vec+3, coins, 0, 1, 2, 3);
-  poly_getnoise_eta1_4x(ep.vec+0, ep.vec+1, ep.vec+2, ep.vec+3, coins, 4, 5, 6, 7);
-  poly_getnoise_eta2(&epp, coins, 8);
-#endif
-
-  polyvec_ntt(&sp);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++)
-    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
-  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
-
-  polyvec_invntt_tomont(&b);
-  poly_invntt_tomont(&v);
-
-  polyvec_add(&b, &b, &ep);
-  poly_add(&v, &v, &epp);
-  poly_add(&v, &v, &k);
-  polyvec_reduce(&b);
-  poly_reduce(&v);
-
-  pack_ciphertext(c, &b, &v);
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *m: pointer to output decrypted message
-*                            (of length KYBER_INDCPA_MSGBYTES)
-*              - const uint8_t *c: pointer to input ciphertext
-*                                  (of length KYBER_INDCPA_BYTES)
-*              - const uint8_t *sk: pointer to input secret key
-*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec b, skpv;
-  poly v, mp;
-
-  unpack_ciphertext(&b, &v, c);
-  unpack_sk(&skpv, sk);
-
-  polyvec_ntt(&b);
-  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
-  poly_invntt_tomont(&mp);
-
-  poly_sub(&mp, &v, &mp);
-  poly_reduce(&mp);
-
-  poly_tomsg(m, &mp);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.h
deleted file mode 100644
index 6dd5088c85..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/indcpa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define gen_matrix KYBER_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
-
-#define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/invntt.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/invntt.S
deleted file mode 100644
index 76d4189996..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/invntt.S
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "consts.h"
-.include "shuffle.inc"
-.include "fq.inc"
-
-.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
-vpsubw		%ymm\rl0,%ymm\rh0,%ymm12
-vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rl0
-vpsubw		%ymm\rl1,%ymm\rh1,%ymm13
-
-vpmullw		%ymm\zl0,%ymm12,%ymm\rh0
-vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl1
-vpsubw		%ymm\rl2,%ymm\rh2,%ymm14
-
-vpmullw		%ymm\zl0,%ymm13,%ymm\rh1
-vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl2
-vpsubw		%ymm\rl3,%ymm\rh3,%ymm15
-
-vpmullw		%ymm\zl1,%ymm14,%ymm\rh2
-vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl3
-vpmullw		%ymm\zl1,%ymm15,%ymm\rh3
-
-vpmulhw		%ymm\zh0,%ymm12,%ymm12
-vpmulhw		%ymm\zh0,%ymm13,%ymm13
-
-vpmulhw		%ymm\zh1,%ymm14,%ymm14
-vpmulhw		%ymm\zh1,%ymm15,%ymm15
-
-vpmulhw		%ymm0,%ymm\rh0,%ymm\rh0
-
-vpmulhw		%ymm0,%ymm\rh1,%ymm\rh1
-
-vpmulhw		%ymm0,%ymm\rh2,%ymm\rh2
-vpmulhw		%ymm0,%ymm\rh3,%ymm\rh3
-
-#
-
-#
-
-vpsubw		%ymm\rh0,%ymm12,%ymm\rh0
-
-vpsubw		%ymm\rh1,%ymm13,%ymm\rh1
-
-vpsubw		%ymm\rh2,%ymm14,%ymm\rh2
-vpsubw		%ymm\rh3,%ymm15,%ymm\rh3
-.endm
-
-.macro intt_levels0t5 off
-/* level 0 */
-vmovdqa		_16XFLO*2(%rsi),%ymm2
-vmovdqa		_16XFHI*2(%rsi),%ymm3
-
-vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
-vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
-vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
-vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7
-
-fqmulprecomp	2,3,4
-fqmulprecomp	2,3,6
-fqmulprecomp	2,3,5
-fqmulprecomp	2,3,7
-
-vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
-vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
-vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
-vmovdqa         (128*\off+112)*2(%rdi),%ymm11
-
-fqmulprecomp	2,3,8
-fqmulprecomp	2,3,10
-fqmulprecomp	2,3,9
-fqmulprecomp	2,3,11
-
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
-vmovdqa		_REVIDXB*2(%rsi),%ymm12
-vpshufb		%ymm12,%ymm15,%ymm15
-vpshufb		%ymm12,%ymm1,%ymm1
-vpshufb		%ymm12,%ymm2,%ymm2
-vpshufb		%ymm12,%ymm3,%ymm3
-
-butterfly	4,5,8,9,6,7,10,11,15,1,2,3
-
-/* level 1 */
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
-vmovdqa		_REVIDXB*2(%rsi),%ymm1
-vpshufb		%ymm1,%ymm2,%ymm2
-vpshufb		%ymm1,%ymm3,%ymm3
-
-butterfly	4,5,6,7,8,9,10,11,2,2,3,3
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-shuffle1	10,11,8,11
-
-/* level 2 */
-vmovdqa		_REVIDXD*2(%rsi),%ymm12
-vpermd		(_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
-vpermd		(_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
-
-butterfly	3,4,6,8,5,7,9,11,2,2,10,10
-
-vmovdqa		_16XV*2(%rsi),%ymm1
-red16		3
-
-shuffle2	3,4,10,4
-shuffle2	6,8,3,8
-shuffle2	5,7,6,7
-shuffle2	9,11,5,11
-
-/* level 3 */
-vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
-vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
-
-butterfly	10,3,6,5,4,8,7,11,2,2,9,9
-
-shuffle4	10,3,9,3
-shuffle4	6,5,10,5
-shuffle4	4,8,6,8
-shuffle4	7,11,4,11
-
-/* level 4 */
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
-
-butterfly	9,10,6,4,3,5,8,11,2,2,7,7
-
-red16		9
-
-shuffle8	9,10,7,10
-shuffle8	6,4,9,4
-shuffle8	3,5,6,5
-shuffle8	8,11,3,11
-
-/* level 5 */
-vmovdqa		(_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
-vmovdqa		(_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
-
-butterfly	7,9,6,3,10,4,5,11,2,2,8,8
-
-vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
-vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
-vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
-vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
-vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
-vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
-vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
-vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
-.endm
-
-.macro intt_level6 off
-/* level 6 */
-vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
-vmovdqa         (64*\off+128)*2(%rdi),%ymm8
-vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
-vmovdqa         (64*\off+144)*2(%rdi),%ymm9
-vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm2
-
-vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
-vmovdqa         (64*\off+160)*2(%rdi),%ymm10
-vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
-vmovdqa         (64*\off+176)*2(%rdi),%ymm11
-vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm3
-
-butterfly	4,5,6,7,8,9,10,11
-
-.if \off == 0
-red16		4
-.endif
-
-vmovdqa		%ymm4,(64*\off+  0)*2(%rdi)
-vmovdqa		%ymm5,(64*\off+ 16)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+ 32)*2(%rdi)
-vmovdqa		%ymm7,(64*\off+ 48)*2(%rdi)
-vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
-vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
-.endm
-
-.text
-.global cdecl(invntt_avx)
-cdecl(invntt_avx):
-vmovdqa         _16XQ*2(%rsi),%ymm0
-
-intt_levels0t5	0
-intt_levels0t5	1
-
-intt_level6	0
-intt_level6	1
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.c
deleted file mode 100644
index 63abc1029c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "kem.h"
-#include "indcpa.h"
-#include "verify.h"
-#include "symmetric.h"
-#include "randombytes.h"
-/*************************************************
-* Name:        crypto_kem_keypair_derand
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*              - uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair_derand(uint8_t *pk,
-                              uint8_t *sk,
-                              const uint8_t *coins)
-{
-  indcpa_keypair_derand(pk, sk, coins);
-  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  /* Value z for pseudo-random output on reject */
-  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(uint8_t *pk,
-                       uint8_t *sk)
-{
-  uint8_t coins[2*KYBER_SYMBYTES];
-  randombytes(coins, 2*KYBER_SYMBYTES);
-  crypto_kem_keypair_derand(pk, sk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc_derand
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc_derand(uint8_t *ct,
-                          uint8_t *ss,
-                          const uint8_t *pk,
-                          const uint8_t *coins)
-{
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
-
-  memcpy(ss,kr,KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(uint8_t *ct,
-                   uint8_t *ss,
-                   const uint8_t *pk)
-{
-  uint8_t coins[KYBER_SYMBYTES];
-  randombytes(coins, KYBER_SYMBYTES);
-  crypto_kem_enc_derand(ct, ss, pk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *ct: pointer to input cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - const uint8_t *sk: pointer to input private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(uint8_t *ss,
-                   const uint8_t *ct,
-                   const uint8_t *sk)
-{
-  int fail;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
-  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
-
-  indcpa_dec(buf, ct, sk);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
-
-  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
-
-  /* Compute rejection key */
-  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
-
-  /* Copy true key to return buffer if fail is false */
-  cmov(ss,kr,KYBER_SYMBYTES,!fail);
-
-  return 0;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.h
deleted file mode 100644
index 234f11966b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/kem.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KEM_H
-#define KEM_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#if   (KYBER_K == 2)
-#define CRYPTO_ALGNAME "Kyber512"
-#elif (KYBER_K == 3)
-#define CRYPTO_ALGNAME "Kyber768"
-#elif (KYBER_K == 4)
-#define CRYPTO_ALGNAME "Kyber1024"
-#endif
-
-#define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
-int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-
-#define crypto_kem_keypair KYBER_NAMESPACE(keypair)
-int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-#define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
-int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-
-#define crypto_kem_enc KYBER_NAMESPACE(enc)
-int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-
-#define crypto_kem_dec KYBER_NAMESPACE(dec)
-int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.S
deleted file mode 100644
index 0ce7b41297..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.S
+++ /dev/null
@@ -1,189 +0,0 @@
-#include "consts.h"
-.include "shuffle.inc"
-
-.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
-vpmullw		%ymm\zl0,%ymm\rh0,%ymm12
-vpmullw		%ymm\zl0,%ymm\rh1,%ymm13
-
-vpmullw		%ymm\zl1,%ymm\rh2,%ymm14
-vpmullw		%ymm\zl1,%ymm\rh3,%ymm15
-
-vpmulhw		%ymm\zh0,%ymm\rh0,%ymm\rh0
-vpmulhw		%ymm\zh0,%ymm\rh1,%ymm\rh1
-
-vpmulhw		%ymm\zh1,%ymm\rh2,%ymm\rh2
-vpmulhw		%ymm\zh1,%ymm\rh3,%ymm\rh3
-.endm
-
-.macro reduce
-vpmulhw		%ymm0,%ymm12,%ymm12
-vpmulhw		%ymm0,%ymm13,%ymm13
-
-vpmulhw		%ymm0,%ymm14,%ymm14
-vpmulhw		%ymm0,%ymm15,%ymm15
-.endm
-
-.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
-vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rln
-vpsubw		%ymm\rh0,%ymm\rl0,%ymm\rh0
-vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl0
-
-vpsubw		%ymm\rh1,%ymm\rl1,%ymm\rh1
-vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl1
-vpsubw		%ymm\rh2,%ymm\rl2,%ymm\rh2
-
-vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl2
-vpsubw		%ymm\rh3,%ymm\rl3,%ymm\rh3
-
-vpsubw		%ymm12,%ymm\rln,%ymm\rln
-vpaddw		%ymm12,%ymm\rh0,%ymm\rh0
-vpsubw		%ymm13,%ymm\rl0,%ymm\rl0
-
-vpaddw		%ymm13,%ymm\rh1,%ymm\rh1
-vpsubw		%ymm14,%ymm\rl1,%ymm\rl1
-vpaddw		%ymm14,%ymm\rh2,%ymm\rh2
-
-vpsubw		%ymm15,%ymm\rl2,%ymm\rl2
-vpaddw		%ymm15,%ymm\rh3,%ymm\rh3
-.endm
-
-.macro level0 off
-vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm15
-vmovdqa		(64*\off+128)*2(%rdi),%ymm8
-vmovdqa		(64*\off+144)*2(%rdi),%ymm9
-vmovdqa		(64*\off+160)*2(%rdi),%ymm10
-vmovdqa		(64*\off+176)*2(%rdi),%ymm11
-vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm2
-
-mul		8,9,10,11
-
-vmovdqa		(64*\off+  0)*2(%rdi),%ymm4
-vmovdqa		(64*\off+ 16)*2(%rdi),%ymm5
-vmovdqa		(64*\off+ 32)*2(%rdi),%ymm6
-vmovdqa		(64*\off+ 48)*2(%rdi),%ymm7
-
-reduce
-update		3,4,5,6,7,8,9,10,11
-
-vmovdqa		%ymm3,(64*\off+  0)*2(%rdi)
-vmovdqa		%ymm4,(64*\off+ 16)*2(%rdi)
-vmovdqa		%ymm5,(64*\off+ 32)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+ 48)*2(%rdi)
-vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
-vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
-.endm
-
-.macro levels1t6 off
-/* level 1 */
-vmovdqa		(_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
-vmovdqa		(128*\off+ 64)*2(%rdi),%ymm8
-vmovdqa		(128*\off+ 80)*2(%rdi),%ymm9
-vmovdqa		(128*\off+ 96)*2(%rdi),%ymm10
-vmovdqa		(128*\off+112)*2(%rdi),%ymm11
-vmovdqa		(_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
-
-mul		8,9,10,11
-
-vmovdqa		(128*\off+  0)*2(%rdi),%ymm4
-vmovdqa	 	(128*\off+ 16)*2(%rdi),%ymm5
-vmovdqa		(128*\off+ 32)*2(%rdi),%ymm6
-vmovdqa		(128*\off+ 48)*2(%rdi),%ymm7
-
-reduce
-update		3,4,5,6,7,8,9,10,11
-
-/* level 2 */
-shuffle8	5,10,7,10
-shuffle8	6,11,5,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
-
-mul		7,10,5,11
-
-shuffle8	3,8,6,8
-shuffle8	4,9,3,9
-
-reduce
-update		4,6,8,3,9,7,10,5,11
-
-/* level 3 */
-shuffle4	8,5,9,5
-shuffle4	3,11,8,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
-
-mul		9,5,8,11
-
-shuffle4	4,7,3,7
-shuffle4	6,10,4,10
-
-reduce
-update		6,3,7,4,10,9,5,8,11
-
-/* level 4 */
-shuffle2	7,8,10,8
-shuffle2	4,11,7,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
-
-mul		10,8,7,11
-
-shuffle2	6,9,4,9
-shuffle2	3,5,6,5
-
-reduce
-update		3,4,9,6,5,10,8,7,11
-
-/* level 5 */
-shuffle1	9,7,5,7
-shuffle1	6,11,9,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
-
-mul		5,7,9,11
-
-shuffle1	3,10,6,10
-shuffle1	4,8,3,8
-
-reduce
-update		4,6,10,3,8,5,7,9,11
-
-/* level 6 */
-vmovdqa		(_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
-vmovdqa		(_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
-vmovdqa		(_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
-
-mul		10,3,9,11,14,15,8,2
-
-reduce
-update		8,4,6,5,7,10,3,9,11
-
-vmovdqa		%ymm8,(128*\off+  0)*2(%rdi)
-vmovdqa		%ymm4,(128*\off+ 16)*2(%rdi)
-vmovdqa		%ymm10,(128*\off+ 32)*2(%rdi)
-vmovdqa		%ymm3,(128*\off+ 48)*2(%rdi)
-vmovdqa		%ymm6,(128*\off+ 64)*2(%rdi)
-vmovdqa		%ymm5,(128*\off+ 80)*2(%rdi)
-vmovdqa		%ymm9,(128*\off+ 96)*2(%rdi)
-vmovdqa		%ymm11,(128*\off+112)*2(%rdi)
-.endm
-
-.text
-.global cdecl(ntt_avx)
-cdecl(ntt_avx):
-vmovdqa		_16XQ*2(%rsi),%ymm0
-
-level0		0
-level0		1
-
-levels1t6	0
-levels1t6	1
-
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.h
deleted file mode 100644
index a4f48e343b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/ntt.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include <immintrin.h>
-
-#define ntt_avx KYBER_NAMESPACE(ntt_avx)
-void ntt_avx(__m256i *r, const __m256i *qdata);
-#define invntt_avx KYBER_NAMESPACE(invntt_avx)
-void invntt_avx(__m256i *r, const __m256i *qdata);
-
-#define nttpack_avx KYBER_NAMESPACE(nttpack_avx)
-void nttpack_avx(__m256i *r, const __m256i *qdata);
-#define nttunpack_avx KYBER_NAMESPACE(nttunpack_avx)
-void nttunpack_avx(__m256i *r, const __m256i *qdata);
-
-#define basemul_avx KYBER_NAMESPACE(basemul_avx)
-void basemul_avx(__m256i *r,
-                 const __m256i *a,
-                 const __m256i *b,
-                 const __m256i *qdata);
-
-#define ntttobytes_avx KYBER_NAMESPACE(ntttobytes_avx)
-void ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *qdata);
-#define nttfrombytes_avx KYBER_NAMESPACE(nttfrombytes_avx)
-void nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *qdata);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/params.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/params.h
deleted file mode 100644
index ecfabce4a5..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/params.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#ifndef KYBER_K
-#define KYBER_K 3	/* Change this for different security strengths */
-#endif
-
-//#define KYBER_90S	/* Uncomment this if you want the 90S variant */
-
-/* Don't change parameters below this line */
-#if   (KYBER_K == 2)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber512_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_512_avx2_##s
-#endif
-#elif (KYBER_K == 3)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber768_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_768_avx2_##s
-#endif
-#elif (KYBER_K == 4)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber1024_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_1024_avx2_##s
-#endif
-#else
-#error "KYBER_K must be in {2,3,4}"
-#endif
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES		384
-#define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
-
-#if KYBER_K == 2
-#define KYBER_ETA1 3
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 3
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 4
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    160
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
-#endif
-
-#define KYBER_ETA2 2
-
-#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-/* 32 bytes of additional space to save H(pk) */
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
-#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.c
deleted file mode 100644
index 681fd6d23e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.c
+++ /dev/null
@@ -1,519 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "align.h"
-#include "fips202x4.h"
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "consts.h"
-#include "reduce.h"
-#include "cbd.h"
-#include "symmetric.h"
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Compression and subsequent serialization of a polynomial.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce().
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-void poly_compress(uint8_t r[128], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 9);
-  const __m256i mask = _mm256_set1_epi16(15);
-  const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
-  const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0);
-
-  for(i=0;i<KYBER_N/64;i++) {
-    f0 = _mm256_load_si256(&a->vec[4*i+0]);
-    f1 = _mm256_load_si256(&a->vec[4*i+1]);
-    f2 = _mm256_load_si256(&a->vec[4*i+2]);
-    f3 = _mm256_load_si256(&a->vec[4*i+3]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f2 = _mm256_mulhi_epi16(f2,v);
-    f3 = _mm256_mulhi_epi16(f3,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f2 = _mm256_mulhrs_epi16(f2,shift1);
-    f3 = _mm256_mulhrs_epi16(f3,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f2 = _mm256_and_si256(f2,mask);
-    f3 = _mm256_and_si256(f3,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f2 = _mm256_packus_epi16(f2,f3);
-    f0 = _mm256_maddubs_epi16(f0,shift2);
-    f2 = _mm256_maddubs_epi16(f2,shift2);
-    f0 = _mm256_packus_epi16(f0,f2);
-    f0 = _mm256_permutevar8x32_epi32(f0,permdidx);
-    _mm256_storeu_si256((__m256i *)&r[32*i],f0);
-  }
-}
-
-void poly_decompress(poly * restrict r, const uint8_t a[128])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4,
-                                           3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0);
-  const __m256i mask = _mm256_set1_epi32(0x00F0000F);
-  const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_loadl_epi64((__m128i *)&a[8*i]);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-void poly_compress(uint8_t r[160], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 10);
-  const __m256i mask = _mm256_set1_epi16(31);
-  const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
-  const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12);
-  const __m256i shufbidx = _mm256_set_epi8( 8,-1,-1,-1,-1,-1, 4, 3, 2, 1, 0,-1,12,11,10, 9,
-                                           -1,12,11,10, 9, 8,-1,-1,-1,-1,-1 ,4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/32;i++) {
-    f0 = _mm256_load_si256(&a->vec[2*i+0]);
-    f1 = _mm256_load_si256(&a->vec[2*i+1]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f0 = _mm256_maddubs_epi16(f0,shift2);	// a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
-    f0 = _mm256_madd_epi16(f0,shift3);		// a0 a1 b0 b1 a2 a3 b2 b3
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f0 = _mm256_srlv_epi64(f0,sllvdidx);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
-    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
-    memcpy(&r[20*i+16],&t1,4);
-  }
-}
-
-void poly_decompress(poly * restrict r, const uint8_t a[160])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  int16_t ti;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(9,9,9,8,8,8,8,7,7,6,6,6,6,5,5,5,
-                                           4,4,4,3,3,3,3,2,2,1,1,1,1,0,0,0);
-  const __m256i mask = _mm256_set_epi16(248,1984,62,496,3968,124,992,31,
-                                        248,1984,62,496,3968,124,992,31);
-  const __m256i shift = _mm256_set_epi16(128,16,512,64,8,256,32,1024,
-                                         128,16,512,64,8,256,32,1024);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_loadl_epi64((__m128i *)&a[10*i+0]);
-    memcpy(&ti,&a[10*i+8],2);
-    t = _mm_insert_epi16(t,ti,4);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#endif
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial in NTT representation.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce(). The coefficients are orderd as output by
-*              poly_ntt(); the serialized output coefficients are in bitreversed
-*              order.
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYBYTES bytes)
-*              - poly *a: pointer to input polynomial
-**************************************************/
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
-{
-  ntttobytes_avx(r, a->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
-{
-  nttfrombytes_avx(r->vec, a, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly * restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
-{
-#if (KYBER_INDCPA_MSGBYTES != 32)
-#error "KYBER_INDCPA_MSGBYTES must be equal to 32!"
-#endif
-  __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
-  const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3));
-  const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0));
-  const __m256i hqs = _mm256_set1_epi16((KYBER_Q+1)/2);
-
-#define FROMMSG64(i)						\
-  g3 = _mm256_shuffle_epi32(f,0x55*i);				\
-  g3 = _mm256_sllv_epi32(g3,shift);				\
-  g3 = _mm256_shuffle_epi8(g3,idx);				\
-  g0 = _mm256_slli_epi16(g3,12);				\
-  g1 = _mm256_slli_epi16(g3,8);					\
-  g2 = _mm256_slli_epi16(g3,4);					\
-  g0 = _mm256_srai_epi16(g0,15);				\
-  g1 = _mm256_srai_epi16(g1,15);				\
-  g2 = _mm256_srai_epi16(g2,15);				\
-  g3 = _mm256_srai_epi16(g3,15);				\
-  g0 = _mm256_and_si256(g0,hqs);  /* 19 18 17 16  3  2  1  0 */	\
-  g1 = _mm256_and_si256(g1,hqs);  /* 23 22 21 20  7  6  5  4 */	\
-  g2 = _mm256_and_si256(g2,hqs);  /* 27 26 25 24 11 10  9  8 */	\
-  g3 = _mm256_and_si256(g3,hqs);  /* 31 30 29 28 15 14 13 12 */	\
-  h0 = _mm256_unpacklo_epi64(g0,g1);				\
-  h2 = _mm256_unpackhi_epi64(g0,g1);				\
-  h1 = _mm256_unpacklo_epi64(g2,g3);				\
-  h3 = _mm256_unpackhi_epi64(g2,g3);				\
-  g0 = _mm256_permute2x128_si256(h0,h1,0x20);			\
-  g2 = _mm256_permute2x128_si256(h0,h1,0x31);			\
-  g1 = _mm256_permute2x128_si256(h2,h3,0x20);			\
-  g3 = _mm256_permute2x128_si256(h2,h3,0x31);			\
-  _mm256_store_si256(&r->vec[0+2*i+0],g0);	\
-  _mm256_store_si256(&r->vec[0+2*i+1],g1);	\
-  _mm256_store_si256(&r->vec[8+2*i+0],g2);	\
-  _mm256_store_si256(&r->vec[8+2*i+1],g3)
-
-  f = _mm256_loadu_si256((__m256i *)msg);
-  FROMMSG64(0);
-  FROMMSG64(1);
-  FROMMSG64(2);
-  FROMMSG64(3);
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce().
-*
-* Arguments:   - uint8_t *msg: pointer to output message
-*              - poly *a: pointer to input polynomial
-**************************************************/
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly * restrict a)
-{
-  unsigned int i;
-  uint32_t small;
-  __m256i f0, f1, g0, g1;
-  const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1)/2);
-  const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1)/4);
-
-  for(i=0;i<KYBER_N/32;i++) {
-    f0 = _mm256_load_si256(&a->vec[2*i+0]);
-    f1 = _mm256_load_si256(&a->vec[2*i+1]);
-    f0 = _mm256_sub_epi16(hq, f0);
-    f1 = _mm256_sub_epi16(hq, f1);
-    g0 = _mm256_srai_epi16(f0, 15);
-    g1 = _mm256_srai_epi16(f1, 15);
-    f0 = _mm256_xor_si256(f0, g0);
-    f1 = _mm256_xor_si256(f1, g1);
-    f0 = _mm256_sub_epi16(f0, hhq);
-    f1 = _mm256_sub_epi16(f1, hhq);
-    f0 = _mm256_packs_epi16(f0, f1);
-    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
-    small = _mm256_movemask_epi8(f0);
-    memcpy(&msg[4*i], &small, 4);
-  }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  ALIGNED_UINT8(KYBER_ETA1*KYBER_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1
-  prf(buf.coeffs, KYBER_ETA1*KYBER_N/4, seed, nonce);
-  poly_cbd_eta1(r, buf.vec);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  ALIGNED_UINT8(KYBER_ETA2*KYBER_N/4) buf;
-  prf(buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce);
-  poly_cbd_eta2(r, buf.vec);
-}
-
-#ifndef KYBER_90S
-#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
-void poly_getnoise_eta1_4x(poly *r0,
-                           poly *r1,
-                           poly *r2,
-                           poly *r3,
-                           const uint8_t seed[32],
-                           uint8_t nonce0,
-                           uint8_t nonce1,
-                           uint8_t nonce2,
-                           uint8_t nonce3)
-{
-  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
-  __m256i f;
-  shake256x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  buf[0].coeffs[32] = nonce0;
-  buf[1].coeffs[32] = nonce1;
-  buf[2].coeffs[32] = nonce2;
-  buf[3].coeffs[32] = nonce3;
-
-  shake256x4_inc_init(&state);
-  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
-  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
-  shake256x4_inc_ctx_release(&state);
-
-  poly_cbd_eta1(r0, buf[0].vec);
-  poly_cbd_eta1(r1, buf[1].vec);
-  poly_cbd_eta1(r2, buf[2].vec);
-  poly_cbd_eta1(r3, buf[3].vec);
-}
-
-#if KYBER_K == 2
-void poly_getnoise_eta1122_4x(poly *r0,
-                              poly *r1,
-                              poly *r2,
-                              poly *r3,
-                              const uint8_t seed[32],
-                              uint8_t nonce0,
-                              uint8_t nonce1,
-                              uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
-  __m256i f;
-  shake256x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  buf[0].coeffs[32] = nonce0;
-  buf[1].coeffs[32] = nonce1;
-  buf[2].coeffs[32] = nonce2;
-  buf[3].coeffs[32] = nonce3;
-
-  shake256x4_inc_init(&state);
-  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
-  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
-  shake256x4_inc_ctx_release(&state);
-
-  poly_cbd_eta1(r0, buf[0].vec);
-  poly_cbd_eta1(r1, buf[1].vec);
-  poly_cbd_eta2(r2, buf[2].vec);
-  poly_cbd_eta2(r3, buf[3].vec);
-}
-#endif
-#endif
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place.
-*              Input coefficients assumed to be in normal order,
-*              output coefficients are in special order that is natural
-*              for the vectorization. Input coefficients are assumed to be
-*              bounded by q in absolute value, output coefficients are bounded
-*              by 16118 in absolute value.
-*
-* Arguments:   - poly *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r)
-{
-  ntt_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
-*              of a polynomial in place;
-*              Input coefficients assumed to be in special order from vectorized
-*              forward ntt, output in normal order. Input coefficients can be
-*              arbitrary 16-bit integers, output coefficients are bounded by 14870
-*              in absolute value.
-*
-* Arguments:   - poly *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *r)
-{
-  invntt_avx(r->vec, qdata.vec);
-}
-
-void poly_nttunpack(poly *r)
-{
-  nttunpack_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_basemul_montgomery
-*
-* Description: Multiplication of two polynomials in NTT domain.
-*              One of the input polynomials needs to have coefficients
-*              bounded by q, the other polynomial can have arbitrary
-*              coefficients. Output coefficients are bounded by 6656.
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
-{
-  basemul_avx(r->vec, a->vec, b->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_tomont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from normal domain to Montgomery domain
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_tomont(poly *r)
-{
-  tomont_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r)
-{
-  reduce_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials. No modular reduction
-*              is performed.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  __m256i f0, f1;
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_load_si256(&b->vec[i]);
-    f0 = _mm256_add_epi16(f0, f1);
-    _mm256_store_si256(&r->vec[i], f0);
-  }
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials. No modular reduction
-*              is performed.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  __m256i f0, f1;
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_load_si256(&b->vec[i]);
-    f0 = _mm256_sub_epi16(f0, f1);
-    _mm256_store_si256(&r->vec[i], f0);
-  }
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.h
deleted file mode 100644
index 6a9cf71c70..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/poly.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "align.h"
-#include "params.h"
-
-typedef ALIGNED_INT16(KYBER_N) poly;
-
-#define poly_compress KYBER_NAMESPACE(poly_compress)
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
-#define poly_decompress KYBER_NAMESPACE(poly_decompress)
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
-
-#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
-#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
-
-#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
-#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
-
-#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#ifndef KYBER_90S
-#define poly_getnoise_eta1_4x KYBER_NAMESPACE(poly_getnoise_eta2_4x)
-void poly_getnoise_eta1_4x(poly *r0,
-                           poly *r1,
-                           poly *r2,
-                           poly *r3,
-                           const uint8_t seed[32],
-                           uint8_t nonce0,
-                           uint8_t nonce1,
-                           uint8_t nonce2,
-                           uint8_t nonce3);
-
-#if KYBER_K == 2
-#define poly_getnoise_eta1122_4x KYBER_NAMESPACE(poly_getnoise_eta1122_4x)
-void poly_getnoise_eta1122_4x(poly *r0,
-                              poly *r1,
-                              poly *r2,
-                              poly *r3,
-                              const uint8_t seed[32],
-                              uint8_t nonce0,
-                              uint8_t nonce1,
-                              uint8_t nonce2,
-                              uint8_t nonce3);
-#endif
-#endif
-
-
-#define poly_ntt KYBER_NAMESPACE(poly_ntt)
-void poly_ntt(poly *r);
-#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *r);
-#define poly_nttunpack KYBER_NAMESPACE(poly_nttunpack)
-void poly_nttunpack(poly *r);
-#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
-#define poly_tomont KYBER_NAMESPACE(poly_tomont)
-void poly_tomont(poly *r);
-
-#define poly_reduce KYBER_NAMESPACE(poly_reduce)
-void poly_reduce(poly *r);
-
-#define poly_add KYBER_NAMESPACE(poly_add)
-void poly_add(poly *r, const poly *a, const poly *b);
-#define poly_sub KYBER_NAMESPACE(poly_sub)
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.c
deleted file mode 100644
index a0174b7b3f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.c
+++ /dev/null
@@ -1,307 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "params.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "consts.h"
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-static void poly_compress10(uint8_t r[320], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i v8 = _mm256_slli_epi16(v,3);
-  const __m256i off = _mm256_set1_epi16(15);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 12);
-  const __m256i mask = _mm256_set1_epi16(1023);
-  const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12);
-  const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9,
-                                           -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_mullo_epi16(f0,v8);
-    f2 = _mm256_add_epi16(f0,off);
-    f0 = _mm256_slli_epi16(f0,3);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f2 = _mm256_sub_epi16(f1,f2);
-    f1 = _mm256_andnot_si256(f1,f2);
-    f1 = _mm256_srli_epi16(f1,15);
-    f0 = _mm256_sub_epi16(f0,f1);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f0 = _mm256_madd_epi16(f0,shift2);
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f0 = _mm256_srli_epi64(f0,12);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blend_epi16(t0,t1,0xE0);
-    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
-    memcpy(&r[20*i+16],&t1,4);
-  }
-}
-
-static void poly_decompress10(poly * restrict r, const uint8_t a[320+12])
-{
-  unsigned int i;
-  __m256i f;
-  const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4*KYBER_Q);
-  const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7,
-                                            6, 5, 5, 4, 4, 3, 3, 2,
-                                            9, 8, 8, 7, 7, 6, 6, 5,
-                                            4, 3, 3, 2, 2, 1, 1, 0);
-  const __m256i sllvdidx = _mm256_set1_epi64x(4);
-  const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[20*i]);
-    f = _mm256_permute4x64_epi64(f,0x94);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_sllv_epi32(f,sllvdidx);
-    f = _mm256_srli_epi16(f,1);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-static void poly_compress11(uint8_t r[352+2], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i v8 = _mm256_slli_epi16(v,3);
-  const __m256i off = _mm256_set1_epi16(36);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 13);
-  const __m256i mask = _mm256_set1_epi16(2047);
-  const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(10);
-  const __m256i srlvqidx = _mm256_set_epi64x(30,10,30,10);
-  const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0,-1,-1,-1,-1,10, 9, 8, 7, 6, 5,
-                                           -1,-1,-1,-1,-1,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_mullo_epi16(f0,v8);
-    f2 = _mm256_add_epi16(f0,off);
-    f0 = _mm256_slli_epi16(f0,3);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f2 = _mm256_sub_epi16(f1,f2);
-    f1 = _mm256_andnot_si256(f1,f2);
-    f1 = _mm256_srli_epi16(f1,15);
-    f0 = _mm256_sub_epi16(f0,f1);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f0 = _mm256_madd_epi16(f0,shift2);
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f1 = _mm256_bsrli_epi128(f0,8);
-    f0 = _mm256_srlv_epi64(f0,srlvqidx);
-    f1 = _mm256_slli_epi64(f1,34);
-    f0 = _mm256_add_epi64(f0,f1);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
-    _mm_storeu_si128((__m128i *)&r[22*i+ 0],t0);
-    _mm_storel_epi64((__m128i *)&r[22*i+16],t1);
-  }
-}
-
-static void poly_decompress11(poly * restrict r, const uint8_t a[352+10])
-{
-  unsigned int i;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(13,12,12,11,10, 9, 9, 8,
-                                            8, 7, 6, 5, 5, 4, 4, 3,
-                                           10, 9, 9, 8, 7, 6, 6, 5,
-                                            5, 4, 3, 2, 2, 1, 1, 0);
-  const __m256i srlvdidx = _mm256_set_epi32(0,0,1,0,0,0,1,0);
-  const __m256i srlvqidx = _mm256_set_epi64x(2,0,2,0);
-  const __m256i shift = _mm256_set_epi16(4,32,1,8,32,1,4,32,4,32,1,8,32,1,4,32);
-  const __m256i mask = _mm256_set1_epi16(32752);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[22*i]);
-    f = _mm256_permute4x64_epi64(f,0x94);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_srlv_epi32(f,srlvdidx);
-    f = _mm256_srlv_epi64(f,srlvqidx);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_srli_epi16(f,1);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#endif
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a)
-{
-  unsigned int i;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-    poly_compress10(&r[320*i],&a->vec[i]);
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-    poly_compress11(&r[352*i],&a->vec[i]);
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r: pointer to output vector of polynomials
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12])
-{
-  unsigned int i;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-    poly_decompress10(&r->vec[i],&a[320*i]);
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-    poly_decompress11(&r->vec[i],&a[352*i]);
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECBYTES)
-*              - polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials
-*                                  (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt_tomont
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*              and multiply by Montgomery factor 2^16
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt_tomont(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_basemul_acc_montgomery
-*
-* Description: Multiply elements in a and b in NTT domain, accumulate into r,
-*              and multiply by 2^-16.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  poly tmp;
-
-  poly_basemul_montgomery(r,&a->vec[0],&b->vec[0]);
-  for(i=1;i<KYBER_K;i++) {
-    poly_basemul_montgomery(&tmp,&a->vec[i],&b->vec[i]);
-    poly_add(r,r,&tmp);
-  }
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials;
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - polyvec *r: pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r:       pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.h
deleted file mode 100644
index 2ce23c31ff..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/polyvec.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-typedef struct{
-  poly vec[KYBER_K];
-} polyvec;
-
-#define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a);
-#define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12]);
-
-#define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
-#define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
-
-#define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
-void polyvec_ntt(polyvec *r);
-#define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
-void polyvec_invntt_tomont(polyvec *r);
-
-#define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
-
-#define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
-void polyvec_reduce(polyvec *r);
-
-#define polyvec_add KYBER_NAMESPACE(polyvec_add)
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/reduce.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/reduce.h
deleted file mode 100644
index 5368185b5f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/reduce.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include "params.h"
-#include <immintrin.h>
-
-#define reduce_avx KYBER_NAMESPACE(reduce_avx)
-void reduce_avx(__m256i *r, const __m256i *qdata);
-#define tomont_avx KYBER_NAMESPACE(tomont_avx)
-void tomont_avx(__m256i *r, const __m256i *qdata);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.c
deleted file mode 100644
index 9060a44cb9..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.c
+++ /dev/null
@@ -1,398 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "params.h"
-#include "consts.h"
-#include "rejsample.h"
-
-//#define BMI
-
-#ifndef BMI
-static const uint8_t idx[256][8] = {
-  {-1, -1, -1, -1, -1, -1, -1, -1},
-  { 0, -1, -1, -1, -1, -1, -1, -1},
-  { 2, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  2, -1, -1, -1, -1, -1, -1},
-  { 4, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  4, -1, -1, -1, -1, -1, -1},
-  { 2,  4, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  4, -1, -1, -1, -1, -1},
-  { 6, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  6, -1, -1, -1, -1, -1, -1},
-  { 2,  6, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  6, -1, -1, -1, -1, -1},
-  { 4,  6, -1, -1, -1, -1, -1, -1},
-  { 0,  4,  6, -1, -1, -1, -1, -1},
-  { 2,  4,  6, -1, -1, -1, -1, -1},
-  { 0,  2,  4,  6, -1, -1, -1, -1},
-  { 8, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  8, -1, -1, -1, -1, -1, -1},
-  { 2,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  8, -1, -1, -1, -1, -1},
-  { 4,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  4,  8, -1, -1, -1, -1, -1},
-  { 2,  4,  8, -1, -1, -1, -1, -1},
-  { 0,  2,  4,  8, -1, -1, -1, -1},
-  { 6,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  6,  8, -1, -1, -1, -1, -1},
-  { 2,  6,  8, -1, -1, -1, -1, -1},
-  { 0,  2,  6,  8, -1, -1, -1, -1},
-  { 4,  6,  8, -1, -1, -1, -1, -1},
-  { 0,  4,  6,  8, -1, -1, -1, -1},
-  { 2,  4,  6,  8, -1, -1, -1, -1},
-  { 0,  2,  4,  6,  8, -1, -1, -1},
-  {10, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 10, -1, -1, -1, -1, -1, -1},
-  { 2, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 10, -1, -1, -1, -1, -1},
-  { 4, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 10, -1, -1, -1, -1, -1},
-  { 2,  4, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 10, -1, -1, -1, -1},
-  { 6, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 10, -1, -1, -1, -1, -1},
-  { 2,  6, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 10, -1, -1, -1, -1},
-  { 4,  6, 10, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 10, -1, -1, -1, -1},
-  { 2,  4,  6, 10, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 10, -1, -1, -1},
-  { 8, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 10, -1, -1, -1, -1, -1},
-  { 2,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 10, -1, -1, -1, -1},
-  { 4,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 10, -1, -1, -1, -1},
-  { 2,  4,  8, 10, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 10, -1, -1, -1},
-  { 6,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 10, -1, -1, -1, -1},
-  { 2,  6,  8, 10, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 10, -1, -1, -1},
-  { 4,  6,  8, 10, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 10, -1, -1, -1},
-  { 2,  4,  6,  8, 10, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 10, -1, -1},
-  {12, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 12, -1, -1, -1, -1, -1, -1},
-  { 2, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 12, -1, -1, -1, -1, -1},
-  { 4, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 12, -1, -1, -1, -1, -1},
-  { 2,  4, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 12, -1, -1, -1, -1},
-  { 6, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 12, -1, -1, -1, -1, -1},
-  { 2,  6, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 12, -1, -1, -1, -1},
-  { 4,  6, 12, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 12, -1, -1, -1, -1},
-  { 2,  4,  6, 12, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 12, -1, -1, -1},
-  { 8, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 12, -1, -1, -1, -1, -1},
-  { 2,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 12, -1, -1, -1, -1},
-  { 4,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 12, -1, -1, -1, -1},
-  { 2,  4,  8, 12, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 12, -1, -1, -1},
-  { 6,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 12, -1, -1, -1, -1},
-  { 2,  6,  8, 12, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 12, -1, -1, -1},
-  { 4,  6,  8, 12, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 12, -1, -1, -1},
-  { 2,  4,  6,  8, 12, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 12, -1, -1},
-  {10, 12, -1, -1, -1, -1, -1, -1},
-  { 0, 10, 12, -1, -1, -1, -1, -1},
-  { 2, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  2, 10, 12, -1, -1, -1, -1},
-  { 4, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  4, 10, 12, -1, -1, -1, -1},
-  { 2,  4, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  4, 10, 12, -1, -1, -1},
-  { 6, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  6, 10, 12, -1, -1, -1, -1},
-  { 2,  6, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  6, 10, 12, -1, -1, -1},
-  { 4,  6, 10, 12, -1, -1, -1, -1},
-  { 0,  4,  6, 10, 12, -1, -1, -1},
-  { 2,  4,  6, 10, 12, -1, -1, -1},
-  { 0,  2,  4,  6, 10, 12, -1, -1},
-  { 8, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  8, 10, 12, -1, -1, -1, -1},
-  { 2,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  8, 10, 12, -1, -1, -1},
-  { 4,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  4,  8, 10, 12, -1, -1, -1},
-  { 2,  4,  8, 10, 12, -1, -1, -1},
-  { 0,  2,  4,  8, 10, 12, -1, -1},
-  { 6,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  6,  8, 10, 12, -1, -1, -1},
-  { 2,  6,  8, 10, 12, -1, -1, -1},
-  { 0,  2,  6,  8, 10, 12, -1, -1},
-  { 4,  6,  8, 10, 12, -1, -1, -1},
-  { 0,  4,  6,  8, 10, 12, -1, -1},
-  { 2,  4,  6,  8, 10, 12, -1, -1},
-  { 0,  2,  4,  6,  8, 10, 12, -1},
-  {14, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 14, -1, -1, -1, -1, -1, -1},
-  { 2, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 14, -1, -1, -1, -1, -1},
-  { 4, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 14, -1, -1, -1, -1, -1},
-  { 2,  4, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 14, -1, -1, -1, -1},
-  { 6, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 14, -1, -1, -1, -1, -1},
-  { 2,  6, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 14, -1, -1, -1, -1},
-  { 4,  6, 14, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 14, -1, -1, -1, -1},
-  { 2,  4,  6, 14, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 14, -1, -1, -1},
-  { 8, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 14, -1, -1, -1, -1, -1},
-  { 2,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 14, -1, -1, -1, -1},
-  { 4,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 14, -1, -1, -1, -1},
-  { 2,  4,  8, 14, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 14, -1, -1, -1},
-  { 6,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 14, -1, -1, -1, -1},
-  { 2,  6,  8, 14, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 14, -1, -1, -1},
-  { 4,  6,  8, 14, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 14, -1, -1, -1},
-  { 2,  4,  6,  8, 14, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 14, -1, -1},
-  {10, 14, -1, -1, -1, -1, -1, -1},
-  { 0, 10, 14, -1, -1, -1, -1, -1},
-  { 2, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  2, 10, 14, -1, -1, -1, -1},
-  { 4, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  4, 10, 14, -1, -1, -1, -1},
-  { 2,  4, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  4, 10, 14, -1, -1, -1},
-  { 6, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  6, 10, 14, -1, -1, -1, -1},
-  { 2,  6, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  6, 10, 14, -1, -1, -1},
-  { 4,  6, 10, 14, -1, -1, -1, -1},
-  { 0,  4,  6, 10, 14, -1, -1, -1},
-  { 2,  4,  6, 10, 14, -1, -1, -1},
-  { 0,  2,  4,  6, 10, 14, -1, -1},
-  { 8, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  8, 10, 14, -1, -1, -1, -1},
-  { 2,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  8, 10, 14, -1, -1, -1},
-  { 4,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  4,  8, 10, 14, -1, -1, -1},
-  { 2,  4,  8, 10, 14, -1, -1, -1},
-  { 0,  2,  4,  8, 10, 14, -1, -1},
-  { 6,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  6,  8, 10, 14, -1, -1, -1},
-  { 2,  6,  8, 10, 14, -1, -1, -1},
-  { 0,  2,  6,  8, 10, 14, -1, -1},
-  { 4,  6,  8, 10, 14, -1, -1, -1},
-  { 0,  4,  6,  8, 10, 14, -1, -1},
-  { 2,  4,  6,  8, 10, 14, -1, -1},
-  { 0,  2,  4,  6,  8, 10, 14, -1},
-  {12, 14, -1, -1, -1, -1, -1, -1},
-  { 0, 12, 14, -1, -1, -1, -1, -1},
-  { 2, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  2, 12, 14, -1, -1, -1, -1},
-  { 4, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  4, 12, 14, -1, -1, -1, -1},
-  { 2,  4, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  4, 12, 14, -1, -1, -1},
-  { 6, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  6, 12, 14, -1, -1, -1, -1},
-  { 2,  6, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  6, 12, 14, -1, -1, -1},
-  { 4,  6, 12, 14, -1, -1, -1, -1},
-  { 0,  4,  6, 12, 14, -1, -1, -1},
-  { 2,  4,  6, 12, 14, -1, -1, -1},
-  { 0,  2,  4,  6, 12, 14, -1, -1},
-  { 8, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  8, 12, 14, -1, -1, -1, -1},
-  { 2,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  8, 12, 14, -1, -1, -1},
-  { 4,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  4,  8, 12, 14, -1, -1, -1},
-  { 2,  4,  8, 12, 14, -1, -1, -1},
-  { 0,  2,  4,  8, 12, 14, -1, -1},
-  { 6,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  6,  8, 12, 14, -1, -1, -1},
-  { 2,  6,  8, 12, 14, -1, -1, -1},
-  { 0,  2,  6,  8, 12, 14, -1, -1},
-  { 4,  6,  8, 12, 14, -1, -1, -1},
-  { 0,  4,  6,  8, 12, 14, -1, -1},
-  { 2,  4,  6,  8, 12, 14, -1, -1},
-  { 0,  2,  4,  6,  8, 12, 14, -1},
-  {10, 12, 14, -1, -1, -1, -1, -1},
-  { 0, 10, 12, 14, -1, -1, -1, -1},
-  { 2, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  2, 10, 12, 14, -1, -1, -1},
-  { 4, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  4, 10, 12, 14, -1, -1, -1},
-  { 2,  4, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  4, 10, 12, 14, -1, -1},
-  { 6, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  6, 10, 12, 14, -1, -1, -1},
-  { 2,  6, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  6, 10, 12, 14, -1, -1},
-  { 4,  6, 10, 12, 14, -1, -1, -1},
-  { 0,  4,  6, 10, 12, 14, -1, -1},
-  { 2,  4,  6, 10, 12, 14, -1, -1},
-  { 0,  2,  4,  6, 10, 12, 14, -1},
-  { 8, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  8, 10, 12, 14, -1, -1, -1},
-  { 2,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  8, 10, 12, 14, -1, -1},
-  { 4,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  4,  8, 10, 12, 14, -1, -1},
-  { 2,  4,  8, 10, 12, 14, -1, -1},
-  { 0,  2,  4,  8, 10, 12, 14, -1},
-  { 6,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  6,  8, 10, 12, 14, -1, -1},
-  { 2,  6,  8, 10, 12, 14, -1, -1},
-  { 0,  2,  6,  8, 10, 12, 14, -1},
-  { 4,  6,  8, 10, 12, 14, -1, -1},
-  { 0,  4,  6,  8, 10, 12, 14, -1},
-  { 2,  4,  6,  8, 10, 12, 14, -1},
-  { 0,  2,  4,  6,  8, 10, 12, 14}
-};
-#endif
-
-#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
-#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
-
-unsigned int rej_uniform_avx(int16_t * restrict r, const uint8_t *buf)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-  uint32_t good;
-#ifdef BMI
-  uint64_t idx0, idx1, idx2, idx3;
-#endif
-  const __m256i bound  = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i ones   = _mm256_set1_epi8(1);
-  const __m256i mask  = _mm256_set1_epi16(0xFFF);
-  const __m256i idx8  = _mm256_set_epi8(15,14,14,13,12,11,11,10,
-                                         9, 8, 8, 7, 6, 5, 5, 4,
-                                        11,10,10, 9, 8, 7, 7, 6,
-                                         5, 4, 4, 3, 2, 1, 1, 0);
-  __m256i f0, f1, g0, g1, g2, g3;
-  __m128i f, t, pilo, pihi;
-
-  ctr = pos = 0;
-  while(ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 56) {
-    f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
-    f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]);
-    f0 = _mm256_permute4x64_epi64(f0, 0x94);
-    f1 = _mm256_permute4x64_epi64(f1, 0x94);
-    f0 = _mm256_shuffle_epi8(f0, idx8);
-    f1 = _mm256_shuffle_epi8(f1, idx8);
-    g0 = _mm256_srli_epi16(f0, 4);
-    g1 = _mm256_srli_epi16(f1, 4);
-    f0 = _mm256_blend_epi16(f0, g0, 0xAA);
-    f1 = _mm256_blend_epi16(f1, g1, 0xAA);
-    f0 = _mm256_and_si256(f0, mask);
-    f1 = _mm256_and_si256(f1, mask);
-    pos += 48;
-
-    g0 = _mm256_cmpgt_epi16(bound, f0);
-    g1 = _mm256_cmpgt_epi16(bound, f1);
-
-    g0 = _mm256_packs_epi16(g0, g1);
-    good = _mm256_movemask_epi8(g0);
-
-#ifdef BMI
-    idx0 = _pdep_u64(good >>  0, 0x0101010101010101);
-    idx1 = _pdep_u64(good >>  8, 0x0101010101010101);
-    idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
-    idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
-    idx0 = (idx0 << 8) - idx0;
-    idx0  = _pext_u64(0x0E0C0A0806040200, idx0);
-    idx1 = (idx1 << 8) - idx1;
-    idx1  = _pext_u64(0x0E0C0A0806040200, idx1);
-    idx2 = (idx2 << 8) - idx2;
-    idx2  = _pext_u64(0x0E0C0A0806040200, idx2);
-    idx3 = (idx3 << 8) - idx3;
-    idx3  = _pext_u64(0x0E0C0A0806040200, idx3);
-
-    g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
-    g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
-    g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
-    g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);
-#else
-    g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >>  0) & 0xFF]));
-    g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >>  8) & 0xFF]));
-    g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1);
-    g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1);
-#endif
-
-    g2 = _mm256_add_epi8(g0, ones);
-    g3 = _mm256_add_epi8(g1, ones);
-    g0 = _mm256_unpacklo_epi8(g0, g2);
-    g1 = _mm256_unpacklo_epi8(g1, g3);
-
-    f0 = _mm256_shuffle_epi8(f0, g0);
-    f1 = _mm256_shuffle_epi8(f1, g1);
-
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
-    ctr += _mm_popcnt_u32((good >>  0) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
-    ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
-    ctr += _mm_popcnt_u32((good >>  8) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
-    ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
-  }
-
-  while(ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 16) {
-    f = _mm_loadu_si128((__m128i *)&buf[pos]);
-    f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
-    t = _mm_srli_epi16(f, 4);
-    f = _mm_blend_epi16(f, t, 0xAA);
-    f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
-    pos += 12;
-
-    t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
-    good = _mm_movemask_epi8(t);
-
-#ifdef BMI
-    good &= 0x5555;
-    idx0 = _pdep_u64(good, 0x1111111111111111);
-    idx0 = (idx0 << 8) - idx0;
-    idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
-    pilo = _mm_cvtsi64_si128(idx0);
-#else
-    good = _pext_u32(good, 0x5555);
-    pilo = _mm_loadl_epi64((__m128i *)&idx[good]);
-#endif
-
-    pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
-    pilo = _mm_unpacklo_epi8(pilo, pihi);
-    f = _mm_shuffle_epi8(f, pilo);
-    _mm_storeu_si128((__m128i *)&r[ctr], f);
-    ctr += _mm_popcnt_u32(good);
-  }
-
-  while(ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) {
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4));
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(val1 < KYBER_Q && ctr < KYBER_N)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.h
deleted file mode 100644
index 3be5e2192e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/rejsample.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef REJSAMPLE_H
-#define REJSAMPLE_H
-
-#include <stdint.h>
-#include "params.h"
-#include "symmetric.h"
-
-#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
-#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES)
-
-#define rej_uniform_avx KYBER_NAMESPACE(rej_uniform_avx)
-unsigned int rej_uniform_avx(int16_t *r, const uint8_t *buf);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.S
deleted file mode 100644
index 18325ebec0..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.S
+++ /dev/null
@@ -1,255 +0,0 @@
-#include "consts.h"
-.include "fq.inc"
-.include "shuffle.inc"
-
-/*
-nttpack_avx:
-#load
-vmovdqa		(%rdi),%ymm4
-vmovdqa		32(%rdi),%ymm5
-vmovdqa		64(%rdi),%ymm6
-vmovdqa		96(%rdi),%ymm7
-vmovdqa		128(%rdi),%ymm8
-vmovdqa		160(%rdi),%ymm9
-vmovdqa		192(%rdi),%ymm10
-vmovdqa		224(%rdi),%ymm11
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-shuffle1	10,11,8,11
-
-shuffle2	3,4,10,4
-shuffle2	6,8,3,8
-shuffle2	5,7,6,7
-shuffle2	9,11,5,11
-
-shuffle4	10,3,9,3
-shuffle4	6,5,10,5
-shuffle4	4,8,6,8
-shuffle4	7,11,4,11
-
-shuffle8	9,10,7,10
-shuffle8	6,4,9,4
-shuffle8	3,5,6,5
-shuffle8	8,11,3,11
-
-#store
-vmovdqa		%ymm7,(%rdi)
-vmovdqa		%ymm9,32(%rdi)
-vmovdqa		%ymm6,64(%rdi)
-vmovdqa		%ymm3,96(%rdi)
-vmovdqa		%ymm10,128(%rdi)
-vmovdqa		%ymm4,160(%rdi)
-vmovdqa		%ymm5,192(%rdi)
-vmovdqa		%ymm11,224(%rdi)
-
-ret
-*/
-
-.text
-nttunpack128_avx:
-#load
-vmovdqa		(%rdi),%ymm4
-vmovdqa		32(%rdi),%ymm5
-vmovdqa		64(%rdi),%ymm6
-vmovdqa		96(%rdi),%ymm7
-vmovdqa		128(%rdi),%ymm8
-vmovdqa		160(%rdi),%ymm9
-vmovdqa		192(%rdi),%ymm10
-vmovdqa		224(%rdi),%ymm11
-
-shuffle8	4,8,3,8
-shuffle8	5,9,4,9
-shuffle8	6,10,5,10
-shuffle8	7,11,6,11
-
-shuffle4	3,5,7,5
-shuffle4	8,10,3,10
-shuffle4	4,6,8,6
-shuffle4	9,11,4,11
-
-shuffle2	7,8,9,8
-shuffle2	5,6,7,6
-shuffle2	3,4,5,4
-shuffle2	10,11,3,11
-
-shuffle1	9,5,10,5
-shuffle1	8,4,9,4
-shuffle1	7,3,8,3
-shuffle1	6,11,7,11
-
-#store
-vmovdqa		%ymm10,(%rdi)
-vmovdqa		%ymm5,32(%rdi)
-vmovdqa		%ymm9,64(%rdi)
-vmovdqa		%ymm4,96(%rdi)
-vmovdqa		%ymm8,128(%rdi)
-vmovdqa		%ymm3,160(%rdi)
-vmovdqa		%ymm7,192(%rdi)
-vmovdqa		%ymm11,224(%rdi)
-
-ret
-
-.global cdecl(nttunpack_avx)
-cdecl(nttunpack_avx):
-call		nttunpack128_avx
-add		$256,%rdi
-call		nttunpack128_avx
-ret
-
-ntttobytes128_avx:
-#load
-vmovdqa		(%rsi),%ymm5
-vmovdqa		32(%rsi),%ymm6
-vmovdqa		64(%rsi),%ymm7
-vmovdqa		96(%rsi),%ymm8
-vmovdqa		128(%rsi),%ymm9
-vmovdqa		160(%rsi),%ymm10
-vmovdqa		192(%rsi),%ymm11
-vmovdqa		224(%rsi),%ymm12
-
-#csubq
-csubq		5,13
-csubq		6,13
-csubq		7,13
-csubq		8,13
-csubq		9,13
-csubq		10,13
-csubq		11,13
-csubq		12,13
-
-#bitpack
-vpsllw		$12,%ymm6,%ymm4
-vpor		%ymm4,%ymm5,%ymm4
-
-vpsrlw		$4,%ymm6,%ymm5
-vpsllw		$8,%ymm7,%ymm6
-vpor		%ymm5,%ymm6,%ymm5
-
-vpsrlw		$8,%ymm7,%ymm6
-vpsllw		$4,%ymm8,%ymm7
-vpor		%ymm6,%ymm7,%ymm6
-
-vpsllw		$12,%ymm10,%ymm7
-vpor		%ymm7,%ymm9,%ymm7
-
-vpsrlw		$4,%ymm10,%ymm8
-vpsllw		$8,%ymm11,%ymm9
-vpor		%ymm8,%ymm9,%ymm8
-
-vpsrlw		$8,%ymm11,%ymm9
-vpsllw		$4,%ymm12,%ymm10
-vpor		%ymm9,%ymm10,%ymm9
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-
-shuffle2	3,4,8,4
-shuffle2	6,5,3,5
-shuffle2	7,9,6,9
-
-shuffle4	8,3,7,3
-shuffle4	6,4,8,4
-shuffle4	5,9,6,9
-
-shuffle8	7,8,5,8
-shuffle8	6,3,7,3
-shuffle8	4,9,6,9
-
-#store
-vmovdqu		%ymm5,(%rdi)
-vmovdqu		%ymm7,32(%rdi)
-vmovdqu		%ymm6,64(%rdi)
-vmovdqu		%ymm8,96(%rdi)
-vmovdqu		%ymm3,128(%rdi)
-vmovdqu		%ymm9,160(%rdi)
-
-ret
-
-.global cdecl(ntttobytes_avx)
-cdecl(ntttobytes_avx):
-#consts
-vmovdqa		_16XQ*2(%rdx),%ymm0
-call		ntttobytes128_avx
-add		$256,%rsi
-add		$192,%rdi
-call		ntttobytes128_avx
-ret
-
-nttfrombytes128_avx:
-#load
-vmovdqu		(%rsi),%ymm4
-vmovdqu		32(%rsi),%ymm5
-vmovdqu		64(%rsi),%ymm6
-vmovdqu		96(%rsi),%ymm7
-vmovdqu		128(%rsi),%ymm8
-vmovdqu		160(%rsi),%ymm9
-
-shuffle8	4,7,3,7
-shuffle8	5,8,4,8
-shuffle8	6,9,5,9
-
-shuffle4	3,8,6,8
-shuffle4	7,5,3,5
-shuffle4	4,9,7,9
-
-shuffle2	6,5,4,5
-shuffle2	8,7,6,7
-shuffle2	3,9,8,9
-
-shuffle1	4,7,10,7
-shuffle1	5,8,4,8
-shuffle1	6,9,5,9
-
-#bitunpack
-vpsrlw		$12,%ymm10,%ymm11
-vpsllw		$4,%ymm7,%ymm12
-vpor		%ymm11,%ymm12,%ymm11
-vpand		%ymm0,%ymm10,%ymm10
-vpand		%ymm0,%ymm11,%ymm11
-
-vpsrlw		$8,%ymm7,%ymm12
-vpsllw		$8,%ymm4,%ymm13
-vpor		%ymm12,%ymm13,%ymm12
-vpand		%ymm0,%ymm12,%ymm12
-
-vpsrlw		$4,%ymm4,%ymm13
-vpand		%ymm0,%ymm13,%ymm13
-
-vpsrlw		$12,%ymm8,%ymm14
-vpsllw		$4,%ymm5,%ymm15
-vpor		%ymm14,%ymm15,%ymm14
-vpand		%ymm0,%ymm8,%ymm8
-vpand		%ymm0,%ymm14,%ymm14
-
-vpsrlw		$8,%ymm5,%ymm15
-vpsllw		$8,%ymm9,%ymm1
-vpor		%ymm15,%ymm1,%ymm15
-vpand		%ymm0,%ymm15,%ymm15
-
-vpsrlw		$4,%ymm9,%ymm1
-vpand		%ymm0,%ymm1,%ymm1
-
-#store
-vmovdqa		%ymm10,(%rdi)
-vmovdqa		%ymm11,32(%rdi)
-vmovdqa		%ymm12,64(%rdi)
-vmovdqa		%ymm13,96(%rdi)
-vmovdqa		%ymm8,128(%rdi)
-vmovdqa		%ymm14,160(%rdi)
-vmovdqa		%ymm15,192(%rdi)
-vmovdqa		%ymm1,224(%rdi)
-
-ret
-
-.global cdecl(nttfrombytes_avx)
-cdecl(nttfrombytes_avx):
-#consts
-vmovdqa		_16XMASK*2(%rdx),%ymm0
-call		nttfrombytes128_avx
-add		$256,%rdi
-add		$192,%rsi
-call		nttfrombytes128_avx
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.inc b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.inc
deleted file mode 100644
index 73e9ffe03c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/shuffle.inc
+++ /dev/null
@@ -1,25 +0,0 @@
-.macro shuffle8 r0,r1,r2,r3
-vperm2i128	$0x20,%ymm\r1,%ymm\r0,%ymm\r2
-vperm2i128	$0x31,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle4 r0,r1,r2,r3
-vpunpcklqdq	%ymm\r1,%ymm\r0,%ymm\r2
-vpunpckhqdq	%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle2 r0,r1,r2,r3
-#vpsllq		$32,%ymm\r1,%ymm\r2
-vmovsldup	%ymm\r1,%ymm\r2
-vpblendd	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrlq		$32,%ymm\r0,%ymm\r0
-#vmovshdup	%ymm\r0,%ymm\r0
-vpblendd	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle1 r0,r1,r2,r3
-vpslld		$16,%ymm\r1,%ymm\r2
-vpblendw	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrld		$16,%ymm\r0,%ymm\r0
-vpblendw	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric-shake.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric-shake.c
deleted file mode 100644
index 20f451882e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric-shake.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
-*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
-*              - uint8_t i: additional byte of input
-*              - uint8_t j: additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128incctx *state,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y)
-{
-  uint8_t extseed[KYBER_SYMBYTES+2];
-
-  memcpy(extseed, seed, KYBER_SYMBYTES);
-  extseed[KYBER_SYMBYTES+0] = x;
-  extseed[KYBER_SYMBYTES+1] = y;
-
-  shake128_absorb_once(state, extseed, sizeof(extseed));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t extkey[KYBER_SYMBYTES+1];
-
-  memcpy(extkey, key, KYBER_SYMBYTES);
-  extkey[KYBER_SYMBYTES] = nonce;
-
-  shake256(out, outlen, extkey, sizeof(extkey));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
-{
-  shake256incctx s;
-
-  shake256_inc_init(&s);
-  shake256_inc_absorb(&s, key, KYBER_SYMBYTES);
-  shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
-  shake256_inc_finalize(&s);
-  shake256_inc_squeeze(out, KYBER_SSBYTES, &s);
-  shake256_inc_ctx_release(&s);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric.h
deleted file mode 100644
index e4941f7a86..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/symmetric.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#include "fips202.h"
-#include "fips202x4.h"
-
-typedef shake128incctx xof_state;
-
-#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(shake128incctx *s,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y);
-
-#define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
-
-#define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
-
-#define XOF_BLOCKBYTES SHAKE128_RATE
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.c
deleted file mode 100644
index 06243b837f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <stdlib.h>
-#include <stdint.h>
-#include <immintrin.h>
-#include "verify.h"
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const uint8_t *a: pointer to first byte array
-*              const uint8_t *b: pointer to second byte array
-*              size_t len: length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-int verify(const uint8_t *a, const uint8_t *b, size_t len)
-{
-  size_t i;
-  uint64_t r;
-  __m256i f, g, h;
-
-  h = _mm256_setzero_si256();
-  for(i=0;i<len/32;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[32*i]);
-    g = _mm256_loadu_si256((__m256i *)&b[32*i]);
-    f = _mm256_xor_si256(f,g);
-    h = _mm256_or_si256(h,f);
-  }
-  r = 1 - _mm256_testz_si256(h,h);
-
-  a += 32*i;
-  b += 32*i;
-  len -= 32*i;
-  for(i=0;i<len;i++)
-    r |= a[i] ^ b[i];
-
-  r = (-r) >> 63;
-  return r;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   uint8_t *r: pointer to output byte array
-*              const uint8_t *x: pointer to input byte array
-*              size_t len: Amount of bytes to be copied
-*              uint8_t b: Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(uint8_t * restrict r, const uint8_t *x, size_t len, uint8_t b)
-{
-  size_t i;
-  __m256i xvec, rvec, bvec;
-
-#if defined(__GNUC__) || defined(__clang__)
-  // Prevent the compiler from
-  //    1) inferring that b is 0/1-valued, and
-  //    2) handling the two cases with a branch.
-  // This is not necessary when verify.c and kem.c are separate translation
-  // units, but we expect that downstream consumers will copy this code and/or
-  // change how it is built.
-  __asm__("" : "+r"(b) : /* no inputs */);
-#endif
-
-  bvec = _mm256_set1_epi64x(-(uint64_t)b);
-  for(i=0;i<len/32;i++) {
-    rvec = _mm256_loadu_si256((__m256i *)&r[32*i]);
-    xvec = _mm256_loadu_si256((__m256i *)&x[32*i]);
-    rvec = _mm256_blendv_epi8(rvec,xvec,bvec);
-    _mm256_storeu_si256((__m256i *)&r[32*i],rvec);
-  }
-
-  r += 32*i;
-  x += 32*i;
-  len -= 32*i;
-  for(i=0;i<len;i++)
-    r[i] ^= -b & (x[i] ^ r[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.h
deleted file mode 100644
index 09f0ad5046..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_avx2/verify.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#define verify KYBER_NAMESPACE(verify)
-int verify(const uint8_t *a, const uint8_t *b, size_t len);
-
-#define cmov KYBER_NAMESPACE(cmov)
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
-
-#define cmov_int16 KYBER_NAMESPACE(cmov_int16)
-void cmov_int16(int16_t *r, int16_t v, uint16_t b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/api.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/api.h
deleted file mode 100644
index 70d40f3f3e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/api.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stdint.h>
-
-#define pqcrystals_kyber512_SECRETKEYBYTES 1632
-#define pqcrystals_kyber512_PUBLICKEYBYTES 800
-#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
-#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber512_ENCCOINBYTES 32
-#define pqcrystals_kyber512_BYTES 32
-
-#define pqcrystals_kyber512_ref_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
-#define pqcrystals_kyber512_ref_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
-#define pqcrystals_kyber512_ref_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
-#define pqcrystals_kyber512_ref_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
-#define pqcrystals_kyber512_ref_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
-#define pqcrystals_kyber512_ref_BYTES pqcrystals_kyber512_BYTES
-
-int pqcrystals_kyber512_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber512_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber512_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber512_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber512_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber768_SECRETKEYBYTES 2400
-#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
-#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
-#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber768_ENCCOINBYTES 32
-#define pqcrystals_kyber768_BYTES 32
-
-#define pqcrystals_kyber768_ref_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
-#define pqcrystals_kyber768_ref_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
-#define pqcrystals_kyber768_ref_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
-#define pqcrystals_kyber768_ref_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
-#define pqcrystals_kyber768_ref_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
-#define pqcrystals_kyber768_ref_BYTES pqcrystals_kyber768_BYTES
-
-int pqcrystals_kyber768_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber768_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber768_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber768_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber768_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
-#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
-#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
-#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber1024_ENCCOINBYTES 32
-#define pqcrystals_kyber1024_BYTES 32
-
-#define pqcrystals_kyber1024_ref_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
-#define pqcrystals_kyber1024_ref_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
-#define pqcrystals_kyber1024_ref_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
-#define pqcrystals_kyber1024_ref_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
-#define pqcrystals_kyber1024_ref_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
-#define pqcrystals_kyber1024_ref_BYTES pqcrystals_kyber1024_BYTES
-
-int pqcrystals_kyber1024_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber1024_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber1024_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber1024_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber1024_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c
deleted file mode 100644
index 1500ffea56..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "cbd.h"
-
-/*************************************************
-* Name:        load32_littleendian
-*
-* Description: load 4 bytes into a 32-bit integer
-*              in little-endian order
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x
-**************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-/*************************************************
-* Name:        load24_littleendian
-*
-* Description: load 3 bytes into a 32-bit integer
-*              in little-endian order.
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
-**************************************************/
-#if KYBER_ETA1 == 3
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif
-
-
-/*************************************************
-* Name:        cbd2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *buf: pointer to input byte array
-**************************************************/
-static void cbd2(poly *r, const uint8_t buf[2*KYBER_N/4])
-{
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/8;i++) {
-    t  = load32_littleendian(buf+4*i);
-    d  = t & 0x55555555;
-    d += (t>>1) & 0x55555555;
-
-    for(j=0;j<8;j++) {
-      a = (d >> (4*j+0)) & 0x3;
-      b = (d >> (4*j+2)) & 0x3;
-      r->coeffs[8*i+j] = a - b;
-    }
-  }
-}
-
-/*************************************************
-* Name:        cbd3
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=3.
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *buf: pointer to input byte array
-**************************************************/
-#if KYBER_ETA1 == 3
-static void cbd3(poly *r, const uint8_t buf[3*KYBER_N/4])
-{
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/4;i++) {
-    t  = load24_littleendian(buf+3*i);
-    d  = t & 0x00249249;
-    d += (t>>1) & 0x00249249;
-    d += (t>>2) & 0x00249249;
-
-    for(j=0;j<4;j++) {
-      a = (d >> (6*j+0)) & 0x7;
-      b = (d >> (6*j+3)) & 0x7;
-      r->coeffs[4*i+j] = a - b;
-    }
-  }
-}
-#endif
-
-void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4])
-{
-#if KYBER_ETA1 == 2
-  cbd2(r, buf);
-#elif KYBER_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
-
-void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4])
-{
-#if KYBER_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.h
deleted file mode 100644
index 7b677d745d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-#define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
-void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4]);
-
-#define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
-void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c
deleted file mode 100644
index 726cfa985d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c
+++ /dev/null
@@ -1,334 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "indcpa.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "symmetric.h"
-#include "randombytes.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Serialize the public key as concatenation of the
-*              serialized vector of polynomials pk
-*              and the public seed used to generate the matrix A.
-*
-* Arguments:   uint8_t *r: pointer to the output serialized public key
-*              polyvec *pk: pointer to the input public-key polyvec
-*              const uint8_t *seed: pointer to the input public seed
-**************************************************/
-static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
-                    polyvec *pk,
-                    const uint8_t seed[KYBER_SYMBYTES])
-{
-  polyvec_tobytes(r, pk);
-  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: De-serialize public key from a byte array;
-*              approximate inverse of pack_pk
-*
-* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
-*              - uint8_t *seed: pointer to output seed to generate matrix A
-*              - const uint8_t *packedpk: pointer to input serialized public key
-**************************************************/
-static void unpack_pk(polyvec *pk,
-                      uint8_t seed[KYBER_SYMBYTES],
-                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
-{
-  polyvec_frombytes(pk, packedpk);
-  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Serialize the secret key
-*
-* Arguments:   - uint8_t *r: pointer to output serialized secret key
-*              - polyvec *sk: pointer to input vector of polynomials (secret key)
-**************************************************/
-static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
-{
-  polyvec_tobytes(r, sk);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: De-serialize the secret key; inverse of pack_sk
-*
-* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
-*              - const uint8_t *packedsk: pointer to input serialized secret key
-**************************************************/
-static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec_frombytes(sk, packedsk);
-}
-
-/*************************************************
-* Name:        pack_ciphertext
-*
-* Description: Serialize the ciphertext as concatenation of the
-*              compressed and serialized vector of polynomials b
-*              and the compressed and serialized polynomial v
-*
-* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
-*              poly *pk: pointer to the input vector of polynomials b
-*              poly *v: pointer to the input polynomial v
-**************************************************/
-static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
-{
-  polyvec_compress(r, b);
-  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        unpack_ciphertext
-*
-* Description: De-serialize and decompress ciphertext from a byte array;
-*              approximate inverse of pack_ciphertext
-*
-* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
-*              - poly *v: pointer to the output polynomial v
-*              - const uint8_t *c: pointer to the input serialized ciphertext
-**************************************************/
-static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
-{
-  polyvec_decompress(b, c);
-  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-}
-
-/*************************************************
-* Name:        rej_uniform
-*
-* Description: Run rejection sampling on uniform random bytes to generate
-*              uniform random integers mod q
-*
-* Arguments:   - int16_t *r: pointer to output buffer
-*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
-*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
-*              - unsigned int buflen: length of input buffer in bytes
-*
-* Returns number of sampled 16-bit integers (at most len)
-**************************************************/
-static unsigned int rej_uniform(int16_t *r,
-                                unsigned int len,
-                                const uint8_t *buf,
-                                unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos + 3 <= buflen) {
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(ctr < len && val1 < KYBER_Q)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
-
-#define gen_a(A,B)  gen_matrix(A,B,0)
-#define gen_at(A,B) gen_matrix(A,B,1)
-
-/*************************************************
-* Name:        gen_matrix
-*
-* Description: Deterministically generate matrix A (or the transpose of A)
-*              from a seed. Entries of the matrix are polynomials that look
-*              uniformly random. Performs rejection sampling on output of
-*              a XOF
-*
-* Arguments:   - polyvec *a: pointer to ouptput matrix A
-*              - const uint8_t *seed: pointer to input seed
-*              - int transposed: boolean deciding whether A or A^T is generated
-**************************************************/
-#if(XOF_BLOCKBYTES % 3)
-#error "Implementation of gen_matrix assumes that XOF_BLOCKBYTES is a multiple of 3"
-#endif
-
-#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
-// Not static for benchmarking
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed)
-{
-  unsigned int ctr, i, j;
-  unsigned int buflen;
-  uint8_t buf[GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES];
-  xof_state state;
-  xof_init(&state, seed);
-
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_K;j++) {
-      if(transposed)
-        xof_absorb(&state, seed, i, j);
-      else
-        xof_absorb(&state, seed, j, i);
-
-      xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
-      buflen = GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES;
-      ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen);
-
-      while(ctr < KYBER_N) {
-        xof_squeezeblocks(buf, 1, &state);
-        buflen = XOF_BLOCKBYTES;
-        ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen);
-      }
-    }
-  }
-  xof_release(&state);
-}
-
-/*************************************************
-* Name:        indcpa_keypair_derand
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                             (of length KYBER_SYMBYTES bytes)
-**************************************************/
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  const uint8_t *publicseed = buf;
-  const uint8_t *noiseseed = buf+KYBER_SYMBYTES;
-  uint8_t nonce = 0;
-  polyvec a[KYBER_K], e, pkpv, skpv;
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-  buf[KYBER_SYMBYTES] = KYBER_K;
-  hash_g(buf, buf, KYBER_SYMBYTES+1);
-
-  gen_a(a, publicseed);
-
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
-
-  polyvec_ntt(&skpv);
-  polyvec_ntt(&e);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++) {
-    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
-    poly_tomont(&pkpv.vec[i]);
-  }
-
-  polyvec_add(&pkpv, &pkpv, &e);
-  polyvec_reduce(&pkpv);
-
-  pack_sk(sk, &skpv);
-  pack_pk(pk, &pkpv, publicseed);
-}
-
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *c: pointer to output ciphertext
-*                            (of length KYBER_INDCPA_BYTES bytes)
-*              - const uint8_t *m: pointer to input message
-*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
-*              - const uint8_t *coins: pointer to input random coins used as seed
-*                                      (of length KYBER_SYMBYTES) to deterministically
-*                                      generate all randomness
-**************************************************/
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t seed[KYBER_SYMBYTES];
-  uint8_t nonce = 0;
-  polyvec sp, pkpv, ep, at[KYBER_K], b;
-  poly v, k, epp;
-
-  unpack_pk(&pkpv, seed, pk);
-  poly_frommsg(&k, m);
-  gen_at(at, seed);
-
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(sp.vec+i, coins, nonce++);
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta2(ep.vec+i, coins, nonce++);
-  poly_getnoise_eta2(&epp, coins, nonce++);
-
-  polyvec_ntt(&sp);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++)
-    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
-
-  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
-
-  polyvec_invntt_tomont(&b);
-  poly_invntt_tomont(&v);
-
-  polyvec_add(&b, &b, &ep);
-  poly_add(&v, &v, &epp);
-  poly_add(&v, &v, &k);
-  polyvec_reduce(&b);
-  poly_reduce(&v);
-
-  pack_ciphertext(c, &b, &v);
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *m: pointer to output decrypted message
-*                            (of length KYBER_INDCPA_MSGBYTES)
-*              - const uint8_t *c: pointer to input ciphertext
-*                                  (of length KYBER_INDCPA_BYTES)
-*              - const uint8_t *sk: pointer to input secret key
-*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec b, skpv;
-  poly v, mp;
-
-  unpack_ciphertext(&b, &v, c);
-  unpack_sk(&skpv, sk);
-
-  polyvec_ntt(&b);
-  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
-  poly_invntt_tomont(&mp);
-
-  poly_sub(&mp, &v, &mp);
-  poly_reduce(&mp);
-
-  poly_tomsg(m, &mp);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.h
deleted file mode 100644
index 6dd5088c85..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define gen_matrix KYBER_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
-
-#define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c
deleted file mode 100644
index 63abc1029c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "kem.h"
-#include "indcpa.h"
-#include "verify.h"
-#include "symmetric.h"
-#include "randombytes.h"
-/*************************************************
-* Name:        crypto_kem_keypair_derand
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*              - uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair_derand(uint8_t *pk,
-                              uint8_t *sk,
-                              const uint8_t *coins)
-{
-  indcpa_keypair_derand(pk, sk, coins);
-  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  /* Value z for pseudo-random output on reject */
-  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(uint8_t *pk,
-                       uint8_t *sk)
-{
-  uint8_t coins[2*KYBER_SYMBYTES];
-  randombytes(coins, 2*KYBER_SYMBYTES);
-  crypto_kem_keypair_derand(pk, sk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc_derand
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc_derand(uint8_t *ct,
-                          uint8_t *ss,
-                          const uint8_t *pk,
-                          const uint8_t *coins)
-{
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
-
-  memcpy(ss,kr,KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(uint8_t *ct,
-                   uint8_t *ss,
-                   const uint8_t *pk)
-{
-  uint8_t coins[KYBER_SYMBYTES];
-  randombytes(coins, KYBER_SYMBYTES);
-  crypto_kem_enc_derand(ct, ss, pk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *ct: pointer to input cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - const uint8_t *sk: pointer to input private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(uint8_t *ss,
-                   const uint8_t *ct,
-                   const uint8_t *sk)
-{
-  int fail;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
-  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
-
-  indcpa_dec(buf, ct, sk);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
-
-  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
-
-  /* Compute rejection key */
-  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
-
-  /* Copy true key to return buffer if fail is false */
-  cmov(ss,kr,KYBER_SYMBYTES,!fail);
-
-  return 0;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.h
deleted file mode 100644
index 234f11966b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/kem.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KEM_H
-#define KEM_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#if   (KYBER_K == 2)
-#define CRYPTO_ALGNAME "Kyber512"
-#elif (KYBER_K == 3)
-#define CRYPTO_ALGNAME "Kyber768"
-#elif (KYBER_K == 4)
-#define CRYPTO_ALGNAME "Kyber1024"
-#endif
-
-#define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
-int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-
-#define crypto_kem_keypair KYBER_NAMESPACE(keypair)
-int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-#define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
-int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-
-#define crypto_kem_enc KYBER_NAMESPACE(enc)
-int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-
-#define crypto_kem_dec KYBER_NAMESPACE(dec)
-int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c
deleted file mode 100644
index 2f2eb10b2f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
-
-#define KYBER_ROOT_OF_UNITY 17
-
-static const uint8_t tree[128] = {
-  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
-  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
-  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
-  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
-  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
-  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
-  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
-  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
-};
-
-void init_ntt() {
-  unsigned int i;
-  int16_t tmp[128];
-
-  tmp[0] = MONT;
-  for(i=1;i<128;i++)
-    tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);
-
-  for(i=0;i<128;i++) {
-    zetas[i] = tmp[tree[i]];
-    if(zetas[i] > KYBER_Q/2)
-      zetas[i] -= KYBER_Q;
-    if(zetas[i] < -KYBER_Q/2)
-      zetas[i] += KYBER_Q;
-  }
-}
-*/
-
-const int16_t zetas[128] = {
-  -1044,  -758,  -359, -1517,  1493,  1422,   287,   202,
-   -171,   622,  1577,   182,   962, -1202, -1474,  1468,
-    573, -1325,   264,   383,  -829,  1458, -1602,  -130,
-   -681,  1017,   732,   608, -1542,   411,  -205, -1571,
-   1223,   652,  -552,  1015, -1293,  1491,  -282, -1544,
-    516,    -8,  -320,  -666, -1618, -1162,   126,  1469,
-   -853,   -90,  -271,   830,   107, -1421,  -247,  -951,
-   -398,   961, -1508,  -725,   448, -1065,   677, -1275,
-  -1103,   430,   555,   843, -1251,   871,  1550,   105,
-    422,   587,   177,  -235,  -291,  -460,  1574,  1653,
-   -246,   778,  1159,  -147,  -777,  1483,  -602,  1119,
-  -1590,   644,  -872,   349,   418,   329,  -156,   -75,
-    817,  1097,   603,   610,  1322, -1285, -1465,   384,
-  -1215,  -136,  1218, -1335,  -874,   220, -1187, -1659,
-  -1185, -1530, -1278,   794, -1510,  -854,  -870,   478,
-   -108,  -308,   996,   991,   958, -1460,  1522,  1628
-};
-
-/*************************************************
-* Name:        fqmul
-*
-* Description: Multiplication followed by Montgomery reduction
-*
-* Arguments:   - int16_t a: first factor
-*              - int16_t b: second factor
-*
-* Returns 16-bit integer congruent to a*b*R^{-1} mod q
-**************************************************/
-static int16_t fqmul(int16_t a, int16_t b) {
-  return montgomery_reduce((int32_t)a*b);
-}
-
-/*************************************************
-* Name:        ntt
-*
-* Description: Inplace number-theoretic transform (NTT) in Rq.
-*              input is in standard order, output is in bitreversed order
-*
-* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
-**************************************************/
-void ntt(int16_t r[256]) {
-  unsigned int len, start, j, k;
-  int16_t t, zeta;
-
-  k = 1;
-  for(len = 128; len >= 2; len >>= 1) {
-    for(start = 0; start < 256; start = j + len) {
-      zeta = zetas[k++];
-      for(j = start; j < start + len; j++) {
-        t = fqmul(zeta, r[j + len]);
-        r[j + len] = r[j] - t;
-        r[j] = r[j] + t;
-      }
-    }
-  }
-}
-
-/*************************************************
-* Name:        invntt_tomont
-*
-* Description: Inplace inverse number-theoretic transform in Rq and
-*              multiplication by Montgomery factor 2^16.
-*              Input is in bitreversed order, output is in standard order
-*
-* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
-**************************************************/
-void invntt(int16_t r[256]) {
-  unsigned int start, len, j, k;
-  int16_t t, zeta;
-  const int16_t f = 1441; // mont^2/128
-
-  k = 127;
-  for(len = 2; len <= 128; len <<= 1) {
-    for(start = 0; start < 256; start = j + len) {
-      zeta = zetas[k--];
-      for(j = start; j < start + len; j++) {
-        t = r[j];
-        r[j] = barrett_reduce(t + r[j + len]);
-        r[j + len] = r[j + len] - t;
-        r[j + len] = fqmul(zeta, r[j + len]);
-      }
-    }
-  }
-
-  for(j = 0; j < 256; j++)
-    r[j] = fqmul(r[j], f);
-}
-
-/*************************************************
-* Name:        basemul
-*
-* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
-*              used for multiplication of elements in Rq in NTT domain
-*
-* Arguments:   - int16_t r[2]: pointer to the output polynomial
-*              - const int16_t a[2]: pointer to the first factor
-*              - const int16_t b[2]: pointer to the second factor
-*              - int16_t zeta: integer defining the reduction polynomial
-**************************************************/
-void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
-{
-  r[0]  = fqmul(a[1], b[1]);
-  r[0]  = fqmul(r[0], zeta);
-  r[0] += fqmul(a[0], b[0]);
-  r[1]  = fqmul(a[0], b[1]);
-  r[1] += fqmul(a[1], b[0]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.h
deleted file mode 100644
index 227ea74f08..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define zetas KYBER_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define ntt KYBER_NAMESPACE(ntt)
-void ntt(int16_t poly[256]);
-
-#define invntt KYBER_NAMESPACE(invntt)
-void invntt(int16_t poly[256]);
-
-#define basemul KYBER_NAMESPACE(basemul)
-void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/params.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/params.h
deleted file mode 100644
index fb4190b311..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/params.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#ifndef KYBER_K
-#define KYBER_K 3	/* Change this for different security strengths */
-#endif
-
-
-/* Don't change parameters below this line */
-#if   (KYBER_K == 2)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_512_ref_##s
-#elif (KYBER_K == 3)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_768_ref_##s
-#elif (KYBER_K == 4)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_1024_ref_##s
-#else
-#error "KYBER_K must be in {2,3,4}"
-#endif
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES		384
-#define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
-
-#if KYBER_K == 2
-#define KYBER_ETA1 3
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 3
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 4
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    160
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
-#endif
-
-#define KYBER_ETA2 2
-
-#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-/* 32 bytes of additional space to save H(pk) */
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
-#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c
deleted file mode 100644
index cbd3abfb54..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "reduce.h"
-#include "cbd.h"
-#include "symmetric.h"
-#include "verify.h"
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Compression and subsequent serialization of a polynomial
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
-{
-  unsigned int i,j;
-  int16_t u;
-  uint32_t d0;
-  uint8_t t[8];
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      // map to positive standard representatives
-      u  = a->coeffs[8*i+j];
-      u += (u >> 15) & KYBER_Q;
-/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
-      d0 = u << 4;
-      d0 += 1665;
-      d0 *= 80635;
-      d0 >>= 28;
-      t[j] = d0 & 0xf;
-    }
-
-    r[0] = t[0] | (t[1] << 4);
-    r[1] = t[2] | (t[3] << 4);
-    r[2] = t[4] | (t[5] << 4);
-    r[3] = t[6] | (t[7] << 4);
-    r += 4;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      // map to positive standard representatives
-      u  = a->coeffs[8*i+j];
-      u += (u >> 15) & KYBER_Q;
-/*    t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
-      d0 = u << 5;
-      d0 += 1664;
-      d0 *= 40318;
-      d0 >>= 27;
-      t[j] = d0 & 0x1f;
-    }
-
-    r[0] = (t[0] >> 0) | (t[1] << 5);
-    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
-    r[2] = (t[3] >> 1) | (t[4] << 4);
-    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
-    r[4] = (t[6] >> 2) | (t[7] << 3);
-    r += 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_decompress
-*
-* Description: De-serialization and subsequent decompression of a polynomial;
-*              approximate inverse of poly_compress
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
-**************************************************/
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES])
-{
-  unsigned int i;
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-  for(i=0;i<KYBER_N/2;i++) {
-    r->coeffs[2*i+0] = (((uint16_t)(a[0] & 15)*KYBER_Q) + 8) >> 4;
-    r->coeffs[2*i+1] = (((uint16_t)(a[0] >> 4)*KYBER_Q) + 8) >> 4;
-    a += 1;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  unsigned int j;
-  uint8_t t[8];
-  for(i=0;i<KYBER_N/8;i++) {
-    t[0] = (a[0] >> 0);
-    t[1] = (a[0] >> 5) | (a[1] << 3);
-    t[2] = (a[1] >> 2);
-    t[3] = (a[1] >> 7) | (a[2] << 1);
-    t[4] = (a[2] >> 4) | (a[3] << 4);
-    t[5] = (a[3] >> 1);
-    t[6] = (a[3] >> 6) | (a[4] << 2);
-    t[7] = (a[4] >> 3);
-    a += 5;
-
-    for(j=0;j<8;j++)
-      r->coeffs[8*i+j] = ((uint32_t)(t[j] & 31)*KYBER_Q + 16) >> 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYBYTES bytes)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
-{
-  unsigned int i;
-  uint16_t t0, t1;
-
-  for(i=0;i<KYBER_N/2;i++) {
-    // map to positive standard representatives
-    t0  = a->coeffs[2*i];
-    t0 += ((int16_t)t0 >> 15) & KYBER_Q;
-    t1 = a->coeffs[2*i+1];
-    t1 += ((int16_t)t1 >> 15) & KYBER_Q;
-    r[3*i+0] = (t0 >> 0);
-    r[3*i+1] = (t0 >> 8) | (t1 << 4);
-    r[3*i+2] = (t1 >> 4);
-  }
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N/2;i++) {
-    r->coeffs[2*i]   = ((a[3*i+0] >> 0) | ((uint16_t)a[3*i+1] << 8)) & 0xFFF;
-    r->coeffs[2*i+1] = ((a[3*i+1] >> 4) | ((uint16_t)a[3*i+2] << 4)) & 0xFFF;
-  }
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
-{
-  unsigned int i,j;
-
-#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
-#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
-#endif
-
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      r->coeffs[8*i+j] = 0;
-      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1);
-    }
-  }
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message
-*
-* Arguments:   - uint8_t *msg: pointer to output message
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned int i,j;
-  uint32_t t;
-
-  for(i=0;i<KYBER_N/8;i++) {
-    msg[i] = 0;
-    for(j=0;j<8;j++) {
-      t  = a->coeffs[8*i+j];
-      // t += ((int16_t)t >> 15) & KYBER_Q;
-      // t  = (((t << 1) + KYBER_Q/2)/KYBER_Q) & 1;
-      t <<= 1;
-      t += 1665;
-      t *= 80635;
-      t >>= 28;
-      t &= 1;
-      msg[i] |= t << j;
-    }
-  }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t buf[KYBER_ETA1*KYBER_N/4];
-  prf(buf, sizeof(buf), seed, nonce);
-  poly_cbd_eta1(r, buf);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t buf[KYBER_ETA2*KYBER_N/4];
-  prf(buf, sizeof(buf), seed, nonce);
-  poly_cbd_eta2(r, buf);
-}
-
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place;
-*              inputs assumed to be in normal order, output in bitreversed order
-*
-* Arguments:   - uint16_t *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r)
-{
-  ntt(r->coeffs);
-  poly_reduce(r);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
-*              of a polynomial in place;
-*              inputs assumed to be in bitreversed order, output in normal order
-*
-* Arguments:   - uint16_t *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *r)
-{
-  invntt(r->coeffs);
-}
-
-/*************************************************
-* Name:        poly_basemul_montgomery
-*
-* Description: Multiplication of two polynomials in NTT domain
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N/4;i++) {
-    basemul(&r->coeffs[4*i], &a->coeffs[4*i], &b->coeffs[4*i], zetas[64+i]);
-    basemul(&r->coeffs[4*i+2], &a->coeffs[4*i+2], &b->coeffs[4*i+2], -zetas[64+i]);
-  }
-}
-
-/*************************************************
-* Name:        poly_tomont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from normal domain to Montgomery domain
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_tomont(poly *r)
-{
-  unsigned int i;
-  const int16_t f = (1ULL << 32) % KYBER_Q;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i]*f);
-}
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = barrett_reduce(r->coeffs[i]);
-}
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials; no modular reduction is performed
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials; no modular reduction is performed
-*
-* Arguments: - poly *r:       pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.h
deleted file mode 100644
index 9a99c7cdad..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/poly.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "params.h"
-
-/*
- * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
- * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
- */
-typedef struct{
-  int16_t coeffs[KYBER_N];
-} poly;
-
-#define poly_compress KYBER_NAMESPACE(poly_compress)
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
-#define poly_decompress KYBER_NAMESPACE(poly_decompress)
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
-
-#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
-#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
-
-#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
-#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
-
-#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_ntt KYBER_NAMESPACE(poly_ntt)
-void poly_ntt(poly *r);
-#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *r);
-#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
-#define poly_tomont KYBER_NAMESPACE(poly_tomont)
-void poly_tomont(poly *r);
-
-#define poly_reduce KYBER_NAMESPACE(poly_reduce)
-void poly_reduce(poly *r);
-
-#define poly_add KYBER_NAMESPACE(poly_add)
-void poly_add(poly *r, const poly *a, const poly *b);
-#define poly_sub KYBER_NAMESPACE(poly_sub)
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c
deleted file mode 100644
index 669f6a5f1d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c
+++ /dev/null
@@ -1,246 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
-{
-  unsigned int i,j,k;
-  uint64_t d0;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/8;j++) {
-      for(k=0;k<8;k++) {
-        t[k]  = a->vec[i].coeffs[8*j+k];
-        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
-        d0 = t[k];
-        d0 <<= 11;
-        d0 += 1664;
-        d0 *= 645084;
-        d0 >>= 31;
-        t[k] = d0 & 0x7ff;
-      }
-
-      r[ 0] = (t[0] >>  0);
-      r[ 1] = (t[0] >>  8) | (t[1] << 3);
-      r[ 2] = (t[1] >>  5) | (t[2] << 6);
-      r[ 3] = (t[2] >>  2);
-      r[ 4] = (t[2] >> 10) | (t[3] << 1);
-      r[ 5] = (t[3] >>  7) | (t[4] << 4);
-      r[ 6] = (t[4] >>  4) | (t[5] << 7);
-      r[ 7] = (t[5] >>  1);
-      r[ 8] = (t[5] >>  9) | (t[6] << 2);
-      r[ 9] = (t[6] >>  6) | (t[7] << 5);
-      r[10] = (t[7] >>  3);
-      r += 11;
-    }
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/4;j++) {
-      for(k=0;k<4;k++) {
-        t[k]  = a->vec[i].coeffs[4*j+k];
-        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
-        d0 = t[k];
-        d0 <<= 10;
-        d0 += 1665;
-        d0 *= 1290167;
-        d0 >>= 32;
-        t[k] = d0 & 0x3ff;
-      }
-
-      r[0] = (t[0] >> 0);
-      r[1] = (t[0] >> 8) | (t[1] << 2);
-      r[2] = (t[1] >> 6) | (t[2] << 4);
-      r[3] = (t[2] >> 4) | (t[3] << 6);
-      r[4] = (t[3] >> 2);
-      r += 5;
-    }
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r:       pointer to output vector of polynomials
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES])
-{
-  unsigned int i,j,k;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/8;j++) {
-      t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
-      t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
-      t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
-      t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
-      t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
-      t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
-      t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
-      t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
-      a += 11;
-
-      for(k=0;k<8;k++)
-        r->vec[i].coeffs[8*j+k] = ((uint32_t)(t[k] & 0x7FF)*KYBER_Q + 1024) >> 11;
-    }
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/4;j++) {
-      t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
-      t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
-      t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
-      t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
-      a += 5;
-
-      for(k=0;k<4;k++)
-        r->vec[i].coeffs[4*j+k] = ((uint32_t)(t[k] & 0x3FF)*KYBER_Q + 512) >> 10;
-    }
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - uint8_t *r:       pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials
-*                                  (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt_tomont
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*              and multiply by Montgomery factor 2^16
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt_tomont(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_basemul_acc_montgomery
-*
-* Description: Multiply elements of a and b in NTT domain, accumulate into r,
-*              and multiply by 2^-16.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  poly t;
-
-  poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
-  for(i=1;i<KYBER_K;i++) {
-    poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
-    poly_add(r, r, &t);
-  }
-
-  poly_reduce(r);
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials;
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - polyvec *r: pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r: pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.h
deleted file mode 100644
index 57b605494e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-typedef struct{
-  poly vec[KYBER_K];
-} polyvec;
-
-#define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
-#define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
-
-#define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
-#define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
-
-#define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
-void polyvec_ntt(polyvec *r);
-#define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
-void polyvec_invntt_tomont(polyvec *r);
-
-#define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
-
-#define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
-void polyvec_reduce(polyvec *r);
-
-#define polyvec_add KYBER_NAMESPACE(polyvec_add)
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c
deleted file mode 100644
index 9d8e7edf83..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "reduce.h"
-
-/*************************************************
-* Name:        montgomery_reduce
-*
-* Description: Montgomery reduction; given a 32-bit integer a, computes
-*              16-bit integer congruent to a * R^-1 mod q, where R=2^16
-*
-* Arguments:   - int32_t a: input integer to be reduced;
-*                           has to be in {-q2^15,...,q2^15-1}
-*
-* Returns:     integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
-**************************************************/
-int16_t montgomery_reduce(int32_t a)
-{
-  int16_t t;
-
-  t = (int16_t)a*QINV;
-  t = (a - (int32_t)t*KYBER_Q) >> 16;
-  return t;
-}
-
-/*************************************************
-* Name:        barrett_reduce
-*
-* Description: Barrett reduction; given a 16-bit integer a, computes
-*              centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2}
-*
-* Arguments:   - int16_t a: input integer to be reduced
-*
-* Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
-**************************************************/
-int16_t barrett_reduce(int16_t a) {
-  int16_t t;
-  const int16_t v = ((1<<26) + KYBER_Q/2)/KYBER_Q;
-
-  t  = ((int32_t)v*a + (1<<25)) >> 26;
-  t *= KYBER_Q;
-  return a - t;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.h
deleted file mode 100644
index c1bc1e4c7b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define MONT -1044 // 2^16 mod q
-#define QINV -3327 // q^-1 mod 2^16
-
-#define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce)
-int16_t montgomery_reduce(int32_t a);
-
-#define barrett_reduce KYBER_NAMESPACE(barrett_reduce)
-int16_t barrett_reduce(int16_t a);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c
deleted file mode 100644
index 20f451882e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
-*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
-*              - uint8_t i: additional byte of input
-*              - uint8_t j: additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128incctx *state,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y)
-{
-  uint8_t extseed[KYBER_SYMBYTES+2];
-
-  memcpy(extseed, seed, KYBER_SYMBYTES);
-  extseed[KYBER_SYMBYTES+0] = x;
-  extseed[KYBER_SYMBYTES+1] = y;
-
-  shake128_absorb_once(state, extseed, sizeof(extseed));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t extkey[KYBER_SYMBYTES+1];
-
-  memcpy(extkey, key, KYBER_SYMBYTES);
-  extkey[KYBER_SYMBYTES] = nonce;
-
-  shake256(out, outlen, extkey, sizeof(extkey));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
-{
-  shake256incctx s;
-
-  shake256_inc_init(&s);
-  shake256_inc_absorb(&s, key, KYBER_SYMBYTES);
-  shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
-  shake256_inc_finalize(&s);
-  shake256_inc_squeeze(out, KYBER_SSBYTES, &s);
-  shake256_inc_ctx_release(&s);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric.h
deleted file mode 100644
index 2acc66f98d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#include "fips202.h"
-
-typedef shake128incctx xof_state;
-
-#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(shake128incctx *s,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y);
-
-#define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
-
-#define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
-
-#define XOF_BLOCKBYTES SHAKE128_RATE
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_init(STATE, SEED) shake128_inc_init(STATE)
-#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define xof_release(STATE) shake128_inc_ctx_release(STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c
deleted file mode 100644
index 914ccd448f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include "verify.h"
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const uint8_t *a: pointer to first byte array
-*              const uint8_t *b: pointer to second byte array
-*              size_t len:       length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-int verify(const uint8_t *a, const uint8_t *b, size_t len)
-{
-  size_t i;
-  uint8_t r = 0;
-
-  for(i=0;i<len;i++)
-    r |= a[i] ^ b[i];
-
-  return (-(uint64_t)r) >> 63;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   uint8_t *r:       pointer to output byte array
-*              const uint8_t *x: pointer to input byte array
-*              size_t len:       Amount of bytes to be copied
-*              uint8_t b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
-{
-  size_t i;
-
-#if defined(__GNUC__) || defined(__clang__)
-  // Prevent the compiler from
-  //    1) inferring that b is 0/1-valued, and
-  //    2) handling the two cases with a branch.
-  // This is not necessary when verify.c and kem.c are separate translation
-  // units, but we expect that downstream consumers will copy this code and/or
-  // change how it is built.
-  __asm__("" : "+r"(b) : /* no inputs */);
-#endif
-
-  b = -b;
-  for(i=0;i<len;i++)
-    r[i] ^= b & (r[i] ^ x[i]);
-}
-
-
-/*************************************************
-* Name:        cmov_int16
-*
-* Description: Copy input v to *r if b is 1, don't modify *r if b is 0. 
-*              Requires b to be in {0,1};
-*              Runs in constant time.
-*
-* Arguments:   int16_t *r:       pointer to output int16_t
-*              int16_t v:        input int16_t 
-*              uint8_t b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov_int16(int16_t *r, int16_t v, uint16_t b)
-{
-  b = -b;
-  *r ^= b & ((*r) ^ v);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.h
deleted file mode 100644
index 09f0ad5046..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_ref/verify.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#define verify KYBER_NAMESPACE(verify)
-int verify(const uint8_t *a, const uint8_t *b, size_t len);
-
-#define cmov KYBER_NAMESPACE(cmov)
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
-
-#define cmov_int16 KYBER_NAMESPACE(cmov_int16)
-void cmov_int16(int16_t *r, int16_t v, uint16_t b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/align.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/align.h
deleted file mode 100644
index 3463866f37..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/align.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef ALIGN_H
-#define ALIGN_H
-
-#include <stdint.h>
-#include <immintrin.h>
-
-#define ALIGNED_UINT8(N)        \
-    union {                     \
-        uint8_t coeffs[N];      \
-        __m256i vec[(N+31)/32]; \
-    }
-
-#define ALIGNED_INT16(N)        \
-    union {                     \
-        int16_t coeffs[N];      \
-        __m256i vec[(N+15)/16]; \
-    }
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/api.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/api.h
deleted file mode 100644
index a154e80f1d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/api.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stdint.h>
-
-#define pqcrystals_kyber512_SECRETKEYBYTES 1632
-#define pqcrystals_kyber512_PUBLICKEYBYTES 800
-#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
-#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber512_ENCCOINBYTES 32
-#define pqcrystals_kyber512_BYTES 32
-
-#define pqcrystals_kyber512_avx2_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
-#define pqcrystals_kyber512_avx2_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
-#define pqcrystals_kyber512_avx2_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
-#define pqcrystals_kyber512_avx2_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
-#define pqcrystals_kyber512_avx2_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
-#define pqcrystals_kyber512_avx2_BYTES pqcrystals_kyber512_BYTES
-
-int pqcrystals_kyber512_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber512_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber512_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber768_SECRETKEYBYTES 2400
-#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
-#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
-#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber768_ENCCOINBYTES 32
-#define pqcrystals_kyber768_BYTES 32
-
-#define pqcrystals_kyber768_avx2_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
-#define pqcrystals_kyber768_avx2_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
-#define pqcrystals_kyber768_avx2_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
-#define pqcrystals_kyber768_avx2_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
-#define pqcrystals_kyber768_avx2_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
-#define pqcrystals_kyber768_avx2_BYTES pqcrystals_kyber768_BYTES
-
-int pqcrystals_kyber768_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber768_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber768_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
-#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
-#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
-#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber1024_ENCCOINBYTES 32
-#define pqcrystals_kyber1024_BYTES 32
-
-#define pqcrystals_kyber1024_avx2_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
-#define pqcrystals_kyber1024_avx2_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
-#define pqcrystals_kyber1024_avx2_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
-#define pqcrystals_kyber1024_avx2_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
-#define pqcrystals_kyber1024_avx2_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
-#define pqcrystals_kyber1024_avx2_BYTES pqcrystals_kyber1024_BYTES
-
-int pqcrystals_kyber1024_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber1024_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber1024_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/basemul.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/basemul.S
deleted file mode 100644
index 36990639b2..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/basemul.S
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "consts.h"
-
-.macro schoolbook off
-vmovdqa		_16XQINV*2(%rcx),%ymm0
-vmovdqa		(64*\off+ 0)*2(%rsi),%ymm1		# a0
-vmovdqa		(64*\off+16)*2(%rsi),%ymm2		# b0
-vmovdqa		(64*\off+32)*2(%rsi),%ymm3		# a1
-vmovdqa		(64*\off+48)*2(%rsi),%ymm4		# b1
-
-vpmullw		%ymm0,%ymm1,%ymm9			# a0.lo
-vpmullw		%ymm0,%ymm2,%ymm10			# b0.lo
-vpmullw		%ymm0,%ymm3,%ymm11			# a1.lo
-vpmullw		%ymm0,%ymm4,%ymm12			# b1.lo
-
-vmovdqa		(64*\off+ 0)*2(%rdx),%ymm5		# c0
-vmovdqa		(64*\off+16)*2(%rdx),%ymm6		# d0
-
-vpmulhw		%ymm5,%ymm1,%ymm13			# a0c0.hi
-vpmulhw		%ymm6,%ymm1,%ymm1			# a0d0.hi
-vpmulhw		%ymm5,%ymm2,%ymm14			# b0c0.hi
-vpmulhw		%ymm6,%ymm2,%ymm2			# b0d0.hi
-
-vmovdqa		(64*\off+32)*2(%rdx),%ymm7		# c1
-vmovdqa		(64*\off+48)*2(%rdx),%ymm8		# d1
-
-vpmulhw		%ymm7,%ymm3,%ymm15			# a1c1.hi
-vpmulhw		%ymm8,%ymm3,%ymm3			# a1d1.hi
-vpmulhw		%ymm7,%ymm4,%ymm0			# b1c1.hi
-vpmulhw		%ymm8,%ymm4,%ymm4			# b1d1.hi
-
-vmovdqa		%ymm13,(%rsp)
-
-vpmullw		%ymm5,%ymm9,%ymm13			# a0c0.lo
-vpmullw		%ymm6,%ymm9,%ymm9			# a0d0.lo
-vpmullw		%ymm5,%ymm10,%ymm5			# b0c0.lo
-vpmullw		%ymm6,%ymm10,%ymm10			# b0d0.lo
-
-vpmullw		%ymm7,%ymm11,%ymm6			# a1c1.lo
-vpmullw		%ymm8,%ymm11,%ymm11			# a1d1.lo
-vpmullw		%ymm7,%ymm12,%ymm7			# b1c1.lo
-vpmullw		%ymm8,%ymm12,%ymm12			# b1d1.lo
-
-vmovdqa		_16XQ*2(%rcx),%ymm8
-vpmulhw		%ymm8,%ymm13,%ymm13
-vpmulhw		%ymm8,%ymm9,%ymm9
-vpmulhw		%ymm8,%ymm5,%ymm5
-vpmulhw		%ymm8,%ymm10,%ymm10
-vpmulhw		%ymm8,%ymm6,%ymm6
-vpmulhw		%ymm8,%ymm11,%ymm11
-vpmulhw		%ymm8,%ymm7,%ymm7
-vpmulhw		%ymm8,%ymm12,%ymm12
-
-vpsubw		(%rsp),%ymm13,%ymm13			# -a0c0
-vpsubw		%ymm9,%ymm1,%ymm9			# a0d0
-vpsubw		%ymm5,%ymm14,%ymm5			# b0c0
-vpsubw		%ymm10,%ymm2,%ymm10			# b0d0
-
-vpsubw		%ymm6,%ymm15,%ymm6			# a1c1
-vpsubw		%ymm11,%ymm3,%ymm11			# a1d1
-vpsubw		%ymm7,%ymm0,%ymm7			# b1c1
-vpsubw		%ymm12,%ymm4,%ymm12			# b1d1
-
-vmovdqa		(%r9),%ymm0
-vmovdqa		32(%r9),%ymm1
-vpmullw		%ymm0,%ymm10,%ymm2
-vpmullw		%ymm0,%ymm12,%ymm3
-vpmulhw		%ymm1,%ymm10,%ymm10
-vpmulhw		%ymm1,%ymm12,%ymm12
-vpmulhw		%ymm8,%ymm2,%ymm2
-vpmulhw		%ymm8,%ymm3,%ymm3
-vpsubw		%ymm2,%ymm10,%ymm10			# rb0d0
-vpsubw		%ymm3,%ymm12,%ymm12			# rb1d1
-
-vpaddw		%ymm5,%ymm9,%ymm9
-vpaddw		%ymm7,%ymm11,%ymm11
-vpsubw		%ymm13,%ymm10,%ymm13
-vpsubw		%ymm12,%ymm6,%ymm6
-
-vmovdqa		%ymm13,(64*\off+ 0)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+16)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+32)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+48)*2(%rdi)
-.endm
-
-.text
-.global cdecl(basemul_avx)
-cdecl(basemul_avx):
-mov		%rsp,%r8
-and		$-32,%rsp
-sub		$32,%rsp
-
-lea		(_ZETAS_EXP+176)*2(%rcx),%r9
-schoolbook	0
-
-add		$32*2,%r9
-schoolbook	1
-
-add		$192*2,%r9
-schoolbook	2
-
-add		$32*2,%r9
-schoolbook	3
-
-mov		%r8,%rsp
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.c
deleted file mode 100644
index dad473c79e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.c
+++ /dev/null
@@ -1,144 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include "params.h"
-#include "cbd.h"
-
-/*************************************************
-* Name:        cbd2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const __m256i *buf: pointer to aligned input byte array
-**************************************************/
-static void cbd2(poly * restrict r, const __m256i buf[2*KYBER_N/128])
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i mask55 = _mm256_set1_epi32(0x55555555);
-  const __m256i mask33 = _mm256_set1_epi32(0x33333333);
-  const __m256i mask03 = _mm256_set1_epi32(0x03030303);
-  const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);
-
-  for(i = 0; i < KYBER_N/64; i++) {
-    f0 = _mm256_load_si256(&buf[i]);
-
-    f1 = _mm256_srli_epi16(f0, 1);
-    f0 = _mm256_and_si256(mask55, f0);
-    f1 = _mm256_and_si256(mask55, f1);
-    f0 = _mm256_add_epi8(f0, f1);
-
-    f1 = _mm256_srli_epi16(f0, 2);
-    f0 = _mm256_and_si256(mask33, f0);
-    f1 = _mm256_and_si256(mask33, f1);
-    f0 = _mm256_add_epi8(f0, mask33);
-    f0 = _mm256_sub_epi8(f0, f1);
-
-    f1 = _mm256_srli_epi16(f0, 4);
-    f0 = _mm256_and_si256(mask0F, f0);
-    f1 = _mm256_and_si256(mask0F, f1);
-    f0 = _mm256_sub_epi8(f0, mask03);
-    f1 = _mm256_sub_epi8(f1, mask03);
-
-    f2 = _mm256_unpacklo_epi8(f0, f1);
-    f3 = _mm256_unpackhi_epi8(f0, f1);
-
-    f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
-    f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1));
-    f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
-    f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1));
-
-    _mm256_store_si256(&r->vec[4*i+0], f0);
-    _mm256_store_si256(&r->vec[4*i+1], f2);
-    _mm256_store_si256(&r->vec[4*i+2], f1);
-    _mm256_store_si256(&r->vec[4*i+3], f3);
-  }
-}
-
-#if KYBER_ETA1 == 3
-/*************************************************
-* Name:        cbd3
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=3
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const __m256i *buf: pointer to aligned input byte array
-**************************************************/
-static void cbd3(poly * restrict r, const uint8_t buf[3*KYBER_N/4+8])
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i mask249 = _mm256_set1_epi32(0x249249);
-  const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
-  const __m256i mask07 = _mm256_set1_epi32(7);
-  const __m256i mask70 = _mm256_set1_epi32(7 << 16);
-  const __m256i mask3 = _mm256_set1_epi16(3);
-  const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4,
-                                           -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0);
-
-  for(i = 0; i < KYBER_N/32; i++) {
-    f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]);
-    f0 = _mm256_permute4x64_epi64(f0,0x94);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-
-    f1 = _mm256_srli_epi32(f0,1);
-    f2 = _mm256_srli_epi32(f0,2);
-    f0 = _mm256_and_si256(mask249,f0);
-    f1 = _mm256_and_si256(mask249,f1);
-    f2 = _mm256_and_si256(mask249,f2);
-    f0 = _mm256_add_epi32(f0,f1);
-    f0 = _mm256_add_epi32(f0,f2);
-
-    f1 = _mm256_srli_epi32(f0,3);
-    f0 = _mm256_add_epi32(f0,mask6DB);
-    f0 = _mm256_sub_epi32(f0,f1);
-
-    f1 = _mm256_slli_epi32(f0,10);
-    f2 = _mm256_srli_epi32(f0,12);
-    f3 = _mm256_srli_epi32(f0, 2);
-    f0 = _mm256_and_si256(f0,mask07);
-    f1 = _mm256_and_si256(f1,mask70);
-    f2 = _mm256_and_si256(f2,mask07);
-    f3 = _mm256_and_si256(f3,mask70);
-    f0 = _mm256_add_epi16(f0,f1);
-    f1 = _mm256_add_epi16(f2,f3);
-    f0 = _mm256_sub_epi16(f0,mask3);
-    f1 = _mm256_sub_epi16(f1,mask3);
-
-    f2 = _mm256_unpacklo_epi32(f0,f1);
-    f3 = _mm256_unpackhi_epi32(f0,f1);
-
-    f0 = _mm256_permute2x128_si256(f2,f3,0x20);
-    f1 = _mm256_permute2x128_si256(f2,f3,0x31);
-
-    _mm256_store_si256(&r->vec[2*i+0], f0);
-    _mm256_store_si256(&r->vec[2*i+1], f1);
-  }
-}
-#endif
-
-/* buf 32 bytes longer for cbd3 */
-void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1])
-{
-#if KYBER_ETA1 == 2
-  cbd2(r, buf);
-#elif KYBER_ETA1 == 3
-  cbd3(r, (uint8_t *)buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
-
-void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128])
-{
-#if KYBER_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.h
deleted file mode 100644
index 05788e06b4..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/cbd.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include <immintrin.h>
-#include "params.h"
-#include "poly.h"
-
-#define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
-void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1]);
-
-#define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
-void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.c
deleted file mode 100644
index 84e596893d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.c
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "align.h"
-#include "params.h"
-#include "consts.h"
-
-#define Q KYBER_Q
-#define MONT -1044 // 2^16 mod q
-#define QINV -3327 // q^-1 mod 2^16
-#define V 20159 // floor(2^26/q + 0.5)
-#define FHI 1441 // mont^2/128
-#define FLO -10079 // qinv*FHI
-#define MONTSQHI 1353 // mont^2
-#define MONTSQLO 20553 // qinv*MONTSQHI
-#define MASK 4095
-#define SHIFT 32
-
-const qdata_t qdata = {{
-#define _16XQ 0
-  Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,
-
-#define _16XQINV 16
-  QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
-  QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
-
-#define _16XV 32
-  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
-
-#define _16XFLO 48
-  FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
-  FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
-
-#define _16XFHI 64
-  FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
-  FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
-
-#define _16XMONTSQLO 80
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-
-#define _16XMONTSQHI 96
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-
-#define _16XMASK 112
-  MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
-  MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
-
-#define _REVIDXB 128
-  3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
-  3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
-
-#define _REVIDXD 144
-  7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,
-
-#define _ZETAS_EXP 160
-   31498,  31498,  31498,  31498,   -758,   -758,   -758,   -758,
-    5237,   5237,   5237,   5237,   1397,   1397,   1397,   1397,
-   14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
-   14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
-    -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
-    -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
-   13525,  13525,  13525,  13525,  13525,  13525,  13525,  13525,
-  -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
-    1493,   1493,   1493,   1493,   1493,   1493,   1493,   1493,
-    1422,   1422,   1422,   1422,   1422,   1422,   1422,   1422,
-  -20907, -20907, -20907, -20907,  27758,  27758,  27758,  27758,
-   -3799,  -3799,  -3799,  -3799, -15690, -15690, -15690, -15690,
-    -171,   -171,   -171,   -171,    622,    622,    622,    622,
-    1577,   1577,   1577,   1577,    182,    182,    182,    182,
-   -5827,  -5827,  17363,  17363, -26360, -26360, -29057, -29057,
-    5571,   5571,  -1102,  -1102,  21438,  21438, -26242, -26242,
-     573,    573,  -1325,  -1325,    264,    264,    383,    383,
-    -829,   -829,   1458,   1458,  -1602,  -1602,   -130,   -130,
-   -5689,  -6516,   1496,  30967, -23565,  20179,  20710,  25080,
-  -12796,  26616,  16064, -12442,   9134,   -650, -25986,  27837,
-    1223,    652,   -552,   1015,  -1293,   1491,   -282,  -1544,
-     516,     -8,   -320,   -666,  -1618,  -1162,    126,   1469,
-    -335, -11477, -32227,  20494, -27738,    945, -14883,   6182,
-   32010,  10631,  29175, -28762, -18486,  17560, -14430,  -5276,
-   -1103,    555,  -1251,   1550,    422,    177,   -291,   1574,
-    -246,   1159,   -777,   -602,  -1590,   -872,    418,   -156,
-   11182,  13387, -14233, -21655,  13131,  -4587,  23092,   5493,
-  -32502,  30317, -18741,  12639,  20100,  18525,  19529, -12619,
-     430,    843,    871,    105,    587,   -235,   -460,   1653,
-     778,   -147,   1483,   1119,    644,    349,    329,    -75,
-     787,    787,    787,    787,    787,    787,    787,    787,
-     787,    787,    787,    787,    787,    787,    787,    787,
-   -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
-   -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
-   28191,  28191,  28191,  28191,  28191,  28191,  28191,  28191,
-  -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
-     287,    287,    287,    287,    287,    287,    287,    287,
-     202,    202,    202,    202,    202,    202,    202,    202,
-   10690,  10690,  10690,  10690,   1358,   1358,   1358,   1358,
-  -11202, -11202, -11202, -11202,  31164,  31164,  31164,  31164,
-     962,    962,    962,    962,  -1202,  -1202,  -1202,  -1202,
-   -1474,  -1474,  -1474,  -1474,   1468,   1468,   1468,   1468,
-  -28073, -28073,  24313,  24313, -10532, -10532,   8800,   8800,
-   18426,  18426,   8859,   8859,  26675,  26675, -16163, -16163,
-    -681,   -681,   1017,   1017,    732,    732,    608,    608,
-   -1542,  -1542,    411,    411,   -205,   -205,  -1571,  -1571,
-   19883, -28250, -15887,  -8898, -28309,   9075, -30199,  18249,
-   13426,  14017, -29156, -12757,  16832,   4311, -24155, -17915,
-    -853,    -90,   -271,    830,    107,  -1421,   -247,   -951,
-    -398,    961,  -1508,   -725,    448,  -1065,    677,  -1275,
-  -31183,  25435,  -7382,  24391, -20927,  10946,  24214,  16989,
-   10335,  -7934, -22502,  10906,  31636,  28644,  23998, -17422,
-     817,    603,   1322,  -1465,  -1215,   1218,   -874,  -1187,
-   -1185,  -1278,  -1510,   -870,   -108,    996,    958,   1522,
-   20297,   2146,  15355, -32384,  -6280, -14903, -11044,  14469,
-  -21498, -20198,  23210, -17442, -23860, -20257,   7756,  23132,
-    1097,    610,  -1285,    384,   -136,  -1335,    220,  -1659,
-   -1530,    794,   -854,    478,   -308,    991,  -1460,   1628,
-
-#define _16XSHIFT 624
-  SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
-  SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
-}};
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.h
deleted file mode 100644
index f95899cd8e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/consts.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef CONSTS_H
-#define CONSTS_H
-
-#include "params.h"
-
-#define _16XQ            0
-#define _16XQINV        16
-#define _16XV           32
-#define _16XFLO         48
-#define _16XFHI         64
-#define _16XMONTSQLO    80
-#define _16XMONTSQHI    96
-#define _16XMASK       112
-#define _REVIDXB       128
-#define _REVIDXD       144
-#define _ZETAS_EXP     160
-#define	_16XSHIFT      624
-
-/* The C ABI on MacOS exports all symbols with a leading
- * underscore. This means that any symbols we refer to from
- * C files (functions) can't be found, and all symbols we
- * refer to from ASM also can't be found.
- *
- * This define helps us get around this
- */
-#ifdef __ASSEMBLER__
-#if defined(__WIN32__) || defined(__APPLE__)
-#define decorate(s) _##s
-#define cdecl2(s) decorate(s)
-#define cdecl(s) cdecl2(KYBER_NAMESPACE(##s))
-#else
-#define cdecl(s) KYBER_NAMESPACE(##s)
-#endif
-#endif
-
-#ifndef __ASSEMBLER__
-#include "align.h"
-typedef ALIGNED_INT16(640) qdata_t;
-#define qdata KYBER_NAMESPACE(qdata)
-extern const qdata_t qdata;
-#endif
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.S
deleted file mode 100644
index 3bb1ebd3d8..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.S
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "consts.h"
-.include "fq.inc"
-
-.text
-reduce128_avx:
-#load
-vmovdqa		(%rdi),%ymm2
-vmovdqa		32(%rdi),%ymm3
-vmovdqa		64(%rdi),%ymm4
-vmovdqa		96(%rdi),%ymm5
-vmovdqa		128(%rdi),%ymm6
-vmovdqa		160(%rdi),%ymm7
-vmovdqa		192(%rdi),%ymm8
-vmovdqa		224(%rdi),%ymm9
-
-red16		2
-red16		3
-red16		4
-red16		5
-red16		6
-red16		7
-red16		8
-red16		9
-
-#store
-vmovdqa		%ymm2,(%rdi)
-vmovdqa		%ymm3,32(%rdi)
-vmovdqa		%ymm4,64(%rdi)
-vmovdqa		%ymm5,96(%rdi)
-vmovdqa		%ymm6,128(%rdi)
-vmovdqa		%ymm7,160(%rdi)
-vmovdqa		%ymm8,192(%rdi)
-vmovdqa		%ymm9,224(%rdi)
-
-ret
-
-.global cdecl(reduce_avx)
-cdecl(reduce_avx):
-#consts
-vmovdqa		_16XQ*2(%rsi),%ymm0
-vmovdqa		_16XV*2(%rsi),%ymm1
-call		reduce128_avx
-add		$256,%rdi
-call		reduce128_avx
-ret
-
-tomont128_avx:
-#load
-vmovdqa		(%rdi),%ymm3
-vmovdqa		32(%rdi),%ymm4
-vmovdqa		64(%rdi),%ymm5
-vmovdqa		96(%rdi),%ymm6
-vmovdqa		128(%rdi),%ymm7
-vmovdqa		160(%rdi),%ymm8
-vmovdqa		192(%rdi),%ymm9
-vmovdqa		224(%rdi),%ymm10
-
-fqmulprecomp	1,2,3,11
-fqmulprecomp	1,2,4,12
-fqmulprecomp	1,2,5,13
-fqmulprecomp	1,2,6,14
-fqmulprecomp	1,2,7,15
-fqmulprecomp	1,2,8,11
-fqmulprecomp	1,2,9,12
-fqmulprecomp	1,2,10,13
-
-#store
-vmovdqa		%ymm3,(%rdi)
-vmovdqa		%ymm4,32(%rdi)
-vmovdqa		%ymm5,64(%rdi)
-vmovdqa		%ymm6,96(%rdi)
-vmovdqa		%ymm7,128(%rdi)
-vmovdqa		%ymm8,160(%rdi)
-vmovdqa		%ymm9,192(%rdi)
-vmovdqa		%ymm10,224(%rdi)
-
-ret
-
-.global cdecl(tomont_avx)
-cdecl(tomont_avx):
-#consts
-vmovdqa		_16XQ*2(%rsi),%ymm0
-vmovdqa		_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa		_16XMONTSQHI*2(%rsi),%ymm2
-call		tomont128_avx
-add		$256,%rdi
-call		tomont128_avx
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.inc b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.inc
deleted file mode 100644
index 4b7afc3118..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/fq.inc
+++ /dev/null
@@ -1,30 +0,0 @@
-.macro red16 r,rs=0,x=12
-vpmulhw         %ymm1,%ymm\r,%ymm\x
-.if \rs
-vpmulhrsw	%ymm\rs,%ymm\x,%ymm\x
-.else
-vpsraw          $10,%ymm\x,%ymm\x
-.endif
-vpmullw         %ymm0,%ymm\x,%ymm\x
-vpsubw          %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro csubq r,x=12
-vpsubw		%ymm0,%ymm\r,%ymm\r
-vpsraw		$15,%ymm\r,%ymm\x
-vpand		%ymm0,%ymm\x,%ymm\x
-vpaddw		%ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro caddq r,x=12
-vpsraw		$15,%ymm\r,%ymm\x
-vpand		%ymm0,%ymm\x,%ymm\x
-vpaddw		%ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro fqmulprecomp al,ah,b,x=12
-vpmullw		%ymm\al,%ymm\b,%ymm\x
-vpmulhw		%ymm\ah,%ymm\b,%ymm\b
-vpmulhw		%ymm0,%ymm\x,%ymm\x
-vpsubw		%ymm\x,%ymm\b,%ymm\b
-.endm
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.c
deleted file mode 100644
index c4b2b3a89f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.c
+++ /dev/null
@@ -1,568 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "align.h"
-#include "params.h"
-#include "indcpa.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "cbd.h"
-#include "rejsample.h"
-#include "symmetric.h"
-#include "randombytes.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Serialize the public key as concatenation of the
-*              serialized vector of polynomials pk and the
-*              public seed used to generate the matrix A.
-*              The polynomial coefficients in pk are assumed to
-*              lie in the invertal [0,q], i.e. pk must be reduced
-*              by polyvec_reduce().
-*
-* Arguments:   uint8_t *r: pointer to the output serialized public key
-*              polyvec *pk: pointer to the input public-key polyvec
-*              const uint8_t *seed: pointer to the input public seed
-**************************************************/
-static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
-                    polyvec *pk,
-                    const uint8_t seed[KYBER_SYMBYTES])
-{
-  polyvec_tobytes(r, pk);
-  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: De-serialize public key from a byte array;
-*              approximate inverse of pack_pk
-*
-* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
-*              - uint8_t *seed: pointer to output seed to generate matrix A
-*              - const uint8_t *packedpk: pointer to input serialized public key
-**************************************************/
-static void unpack_pk(polyvec *pk,
-                      uint8_t seed[KYBER_SYMBYTES],
-                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
-{
-  polyvec_frombytes(pk, packedpk);
-  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Serialize the secret key.
-*              The polynomial coefficients in sk are assumed to
-*              lie in the invertal [0,q], i.e. sk must be reduced
-*              by polyvec_reduce().
-*
-* Arguments:   - uint8_t *r: pointer to output serialized secret key
-*              - polyvec *sk: pointer to input vector of polynomials (secret key)
-**************************************************/
-static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
-{
-  polyvec_tobytes(r, sk);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: De-serialize the secret key; inverse of pack_sk
-*
-* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
-*              - const uint8_t *packedsk: pointer to input serialized secret key
-**************************************************/
-static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec_frombytes(sk, packedsk);
-}
-
-/*************************************************
-* Name:        pack_ciphertext
-*
-* Description: Serialize the ciphertext as concatenation of the
-*              compressed and serialized vector of polynomials b
-*              and the compressed and serialized polynomial v.
-*              The polynomial coefficients in b and v are assumed to
-*              lie in the invertal [0,q], i.e. b and v must be reduced
-*              by polyvec_reduce() and poly_reduce(), respectively.
-*
-* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
-*              poly *pk: pointer to the input vector of polynomials b
-*              poly *v: pointer to the input polynomial v
-**************************************************/
-static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
-{
-  polyvec_compress(r, b);
-  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        unpack_ciphertext
-*
-* Description: De-serialize and decompress ciphertext from a byte array;
-*              approximate inverse of pack_ciphertext
-*
-* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
-*              - poly *v: pointer to the output polynomial v
-*              - const uint8_t *c: pointer to the input serialized ciphertext
-**************************************************/
-static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
-{
-  polyvec_decompress(b, c);
-  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-}
-
-/*************************************************
-* Name:        rej_uniform
-*
-* Description: Run rejection sampling on uniform random bytes to generate
-*              uniform random integers mod q
-*
-* Arguments:   - int16_t *r: pointer to output array
-*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
-*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
-*              - unsigned int buflen: length of input buffer in bytes
-*
-* Returns number of sampled 16-bit integers (at most len)
-**************************************************/
-static unsigned int rej_uniform(int16_t *r,
-                                unsigned int len,
-                                const uint8_t *buf,
-                                unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos <= buflen - 3) {  // buflen is always at least 3
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(ctr < len && val1 < KYBER_Q)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
-
-#define gen_a(A,B)  gen_matrix(A,B,0)
-#define gen_at(A,B) gen_matrix(A,B,1)
-
-/*************************************************
-* Name:        gen_matrix
-*
-* Description: Deterministically generate matrix A (or the transpose of A)
-*              from a seed. Entries of the matrix are polynomials that look
-*              uniformly random. Performs rejection sampling on output of
-*              a XOF
-*
-* Arguments:   - polyvec *a: pointer to ouptput matrix A
-*              - const uint8_t *seed: pointer to input seed
-*              - int transposed: boolean deciding whether A or A^T is generated
-**************************************************/
-#if KYBER_K == 2
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 0;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 1;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 1;
-  }
-  else {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 0;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 1;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 1;
-  }
-
-  shake128x4_inc_init(&state);
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[1].vec[0].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[1].vec[1].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-
-  poly_nttunpack(&a[0].vec[0]);
-  poly_nttunpack(&a[0].vec[1]);
-  poly_nttunpack(&a[1].vec[0]);
-  poly_nttunpack(&a[1].vec[1]);
-  shake128x4_inc_ctx_release(&state);
-}
-#elif KYBER_K == 3
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-  shake128incctx state1x;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 0;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 2;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 0;
-  }
-  else {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 0;
-    buf[2].coeffs[32] = 2;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 0;
-    buf[3].coeffs[33] = 1;
-  }
-
-  shake128x4_inc_init(&state);
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[0].vec[2].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[1].vec[0].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-
-  poly_nttunpack(&a[0].vec[0]);
-  poly_nttunpack(&a[0].vec[1]);
-  poly_nttunpack(&a[0].vec[2]);
-  poly_nttunpack(&a[1].vec[0]);
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 1;
-    buf[0].coeffs[33] = 1;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 2;
-    buf[2].coeffs[32] = 2;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 2;
-    buf[3].coeffs[33] = 1;
-  }
-  else {
-    buf[0].coeffs[32] = 1;
-    buf[0].coeffs[33] = 1;
-    buf[1].coeffs[32] = 2;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 2;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 2;
-  }
-
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[1].vec[1].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[1].vec[2].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[2].vec[0].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[2].vec[1].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[1].vec[1].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-  shake128x4_inc_ctx_release(&state);
-
-  poly_nttunpack(&a[1].vec[1]);
-  poly_nttunpack(&a[1].vec[2]);
-  poly_nttunpack(&a[2].vec[0]);
-  poly_nttunpack(&a[2].vec[1]);
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  buf[0].coeffs[32] = 2;
-  buf[0].coeffs[33] = 2;
-
-  shake128_inc_init(&state1x);
-  shake128_absorb_once(&state1x, buf[0].coeffs, 34);
-  shake128_squeezeblocks(buf[0].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state1x);
-  ctr0 = rej_uniform_avx(a[2].vec[2].coeffs, buf[0].coeffs);
-  while(ctr0 < KYBER_N) {
-    shake128_squeezeblocks(buf[0].coeffs, 1, &state1x);
-    ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-  }
-  shake128_inc_ctx_release(&state1x);
-
-  poly_nttunpack(&a[2].vec[2]);
-}
-#elif KYBER_K == 4
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int i, ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-  shake128x4_inc_init(&state);
-
-  for(i=0;i<4;i++) {
-    f = _mm256_loadu_si256((__m256i *)seed);
-    _mm256_store_si256(buf[0].vec, f);
-    _mm256_store_si256(buf[1].vec, f);
-    _mm256_store_si256(buf[2].vec, f);
-    _mm256_store_si256(buf[3].vec, f);
-
-    if(transposed) {
-      buf[0].coeffs[32] = i;
-      buf[0].coeffs[33] = 0;
-      buf[1].coeffs[32] = i;
-      buf[1].coeffs[33] = 1;
-      buf[2].coeffs[32] = i;
-      buf[2].coeffs[33] = 2;
-      buf[3].coeffs[32] = i;
-      buf[3].coeffs[33] = 3;
-    }
-    else {
-      buf[0].coeffs[32] = 0;
-      buf[0].coeffs[33] = i;
-      buf[1].coeffs[32] = 1;
-      buf[1].coeffs[33] = i;
-      buf[2].coeffs[32] = 2;
-      buf[2].coeffs[33] = i;
-      buf[3].coeffs[32] = 3;
-      buf[3].coeffs[33] = i;
-    }
-
-    shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-    ctr0 = rej_uniform_avx(a[i].vec[0].coeffs, buf[0].coeffs);
-    ctr1 = rej_uniform_avx(a[i].vec[1].coeffs, buf[1].coeffs);
-    ctr2 = rej_uniform_avx(a[i].vec[2].coeffs, buf[2].coeffs);
-    ctr3 = rej_uniform_avx(a[i].vec[3].coeffs, buf[3].coeffs);
-
-    while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-      shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-      ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-      ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-      ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-      ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-    }
-
-    poly_nttunpack(&a[i].vec[0]);
-    poly_nttunpack(&a[i].vec[1]);
-    poly_nttunpack(&a[i].vec[2]);
-    poly_nttunpack(&a[i].vec[3]);
-  }
-  shake128x4_inc_ctx_release(&state);
-}
-#endif
-
-/*************************************************
-* Name:        indcpa_keypair_derand
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                             (of length KYBER_SYMBYTES bytes)
-**************************************************/
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  const uint8_t *publicseed = buf;
-  const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
-  polyvec a[KYBER_K], e, pkpv, skpv;
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-  buf[KYBER_SYMBYTES] = KYBER_K;
-  hash_g(buf, buf, KYBER_SYMBYTES+1);
-
-  gen_a(a, publicseed);
-
-#if KYBER_K == 2
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3);
-#elif KYBER_K == 3
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, skpv.vec+2, e.vec+0, noiseseed, 0, 1, 2, 3);
-  poly_getnoise_eta1_4x(e.vec+1, e.vec+2, pkpv.vec+0, pkpv.vec+1, noiseseed, 4, 5, 6, 7);
-#elif KYBER_K == 4
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, skpv.vec+2, skpv.vec+3, noiseseed,  0, 1, 2, 3);
-  poly_getnoise_eta1_4x(e.vec+0, e.vec+1, e.vec+2, e.vec+3, noiseseed, 4, 5, 6, 7);
-#endif
-
-  polyvec_ntt(&skpv);
-  polyvec_reduce(&skpv);
-  polyvec_ntt(&e);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++) {
-    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
-    poly_tomont(&pkpv.vec[i]);
-  }
-
-  polyvec_add(&pkpv, &pkpv, &e);
-  polyvec_reduce(&pkpv);
-
-  pack_sk(sk, &skpv);
-  pack_pk(pk, &pkpv, publicseed);
-}
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *c: pointer to output ciphertext
-*                            (of length KYBER_INDCPA_BYTES bytes)
-*              - const uint8_t *m: pointer to input message
-*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
-*              - const uint8_t *coins: pointer to input random coins used as seed
-*                                      (of length KYBER_SYMBYTES) to deterministically
-*                                      generate all randomness
-**************************************************/
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t seed[KYBER_SYMBYTES];
-  polyvec sp, pkpv, ep, at[KYBER_K], b;
-  poly v, k, epp;
-
-  unpack_pk(&pkpv, seed, pk);
-  poly_frommsg(&k, m);
-  gen_at(at, seed);
-
-#if KYBER_K == 2
-  poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3);
-  poly_getnoise_eta2(&epp, coins, 4);
-#elif KYBER_K == 3
-  poly_getnoise_eta1_4x(sp.vec+0, sp.vec+1, sp.vec+2, ep.vec+0, coins, 0, 1, 2 ,3);
-  poly_getnoise_eta1_4x(ep.vec+1, ep.vec+2, &epp, b.vec+0, coins,  4, 5, 6, 7);
-#elif KYBER_K == 4
-  poly_getnoise_eta1_4x(sp.vec+0, sp.vec+1, sp.vec+2, sp.vec+3, coins, 0, 1, 2, 3);
-  poly_getnoise_eta1_4x(ep.vec+0, ep.vec+1, ep.vec+2, ep.vec+3, coins, 4, 5, 6, 7);
-  poly_getnoise_eta2(&epp, coins, 8);
-#endif
-
-  polyvec_ntt(&sp);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++)
-    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
-  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
-
-  polyvec_invntt_tomont(&b);
-  poly_invntt_tomont(&v);
-
-  polyvec_add(&b, &b, &ep);
-  poly_add(&v, &v, &epp);
-  poly_add(&v, &v, &k);
-  polyvec_reduce(&b);
-  poly_reduce(&v);
-
-  pack_ciphertext(c, &b, &v);
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *m: pointer to output decrypted message
-*                            (of length KYBER_INDCPA_MSGBYTES)
-*              - const uint8_t *c: pointer to input ciphertext
-*                                  (of length KYBER_INDCPA_BYTES)
-*              - const uint8_t *sk: pointer to input secret key
-*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec b, skpv;
-  poly v, mp;
-
-  unpack_ciphertext(&b, &v, c);
-  unpack_sk(&skpv, sk);
-
-  polyvec_ntt(&b);
-  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
-  poly_invntt_tomont(&mp);
-
-  poly_sub(&mp, &v, &mp);
-  poly_reduce(&mp);
-
-  poly_tomsg(m, &mp);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.h
deleted file mode 100644
index 6dd5088c85..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/indcpa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define gen_matrix KYBER_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
-
-#define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/invntt.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/invntt.S
deleted file mode 100644
index 76d4189996..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/invntt.S
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "consts.h"
-.include "shuffle.inc"
-.include "fq.inc"
-
-.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
-vpsubw		%ymm\rl0,%ymm\rh0,%ymm12
-vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rl0
-vpsubw		%ymm\rl1,%ymm\rh1,%ymm13
-
-vpmullw		%ymm\zl0,%ymm12,%ymm\rh0
-vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl1
-vpsubw		%ymm\rl2,%ymm\rh2,%ymm14
-
-vpmullw		%ymm\zl0,%ymm13,%ymm\rh1
-vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl2
-vpsubw		%ymm\rl3,%ymm\rh3,%ymm15
-
-vpmullw		%ymm\zl1,%ymm14,%ymm\rh2
-vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl3
-vpmullw		%ymm\zl1,%ymm15,%ymm\rh3
-
-vpmulhw		%ymm\zh0,%ymm12,%ymm12
-vpmulhw		%ymm\zh0,%ymm13,%ymm13
-
-vpmulhw		%ymm\zh1,%ymm14,%ymm14
-vpmulhw		%ymm\zh1,%ymm15,%ymm15
-
-vpmulhw		%ymm0,%ymm\rh0,%ymm\rh0
-
-vpmulhw		%ymm0,%ymm\rh1,%ymm\rh1
-
-vpmulhw		%ymm0,%ymm\rh2,%ymm\rh2
-vpmulhw		%ymm0,%ymm\rh3,%ymm\rh3
-
-#
-
-#
-
-vpsubw		%ymm\rh0,%ymm12,%ymm\rh0
-
-vpsubw		%ymm\rh1,%ymm13,%ymm\rh1
-
-vpsubw		%ymm\rh2,%ymm14,%ymm\rh2
-vpsubw		%ymm\rh3,%ymm15,%ymm\rh3
-.endm
-
-.macro intt_levels0t5 off
-/* level 0 */
-vmovdqa		_16XFLO*2(%rsi),%ymm2
-vmovdqa		_16XFHI*2(%rsi),%ymm3
-
-vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
-vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
-vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
-vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7
-
-fqmulprecomp	2,3,4
-fqmulprecomp	2,3,6
-fqmulprecomp	2,3,5
-fqmulprecomp	2,3,7
-
-vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
-vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
-vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
-vmovdqa         (128*\off+112)*2(%rdi),%ymm11
-
-fqmulprecomp	2,3,8
-fqmulprecomp	2,3,10
-fqmulprecomp	2,3,9
-fqmulprecomp	2,3,11
-
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
-vmovdqa		_REVIDXB*2(%rsi),%ymm12
-vpshufb		%ymm12,%ymm15,%ymm15
-vpshufb		%ymm12,%ymm1,%ymm1
-vpshufb		%ymm12,%ymm2,%ymm2
-vpshufb		%ymm12,%ymm3,%ymm3
-
-butterfly	4,5,8,9,6,7,10,11,15,1,2,3
-
-/* level 1 */
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
-vmovdqa		_REVIDXB*2(%rsi),%ymm1
-vpshufb		%ymm1,%ymm2,%ymm2
-vpshufb		%ymm1,%ymm3,%ymm3
-
-butterfly	4,5,6,7,8,9,10,11,2,2,3,3
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-shuffle1	10,11,8,11
-
-/* level 2 */
-vmovdqa		_REVIDXD*2(%rsi),%ymm12
-vpermd		(_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
-vpermd		(_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
-
-butterfly	3,4,6,8,5,7,9,11,2,2,10,10
-
-vmovdqa		_16XV*2(%rsi),%ymm1
-red16		3
-
-shuffle2	3,4,10,4
-shuffle2	6,8,3,8
-shuffle2	5,7,6,7
-shuffle2	9,11,5,11
-
-/* level 3 */
-vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
-vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
-
-butterfly	10,3,6,5,4,8,7,11,2,2,9,9
-
-shuffle4	10,3,9,3
-shuffle4	6,5,10,5
-shuffle4	4,8,6,8
-shuffle4	7,11,4,11
-
-/* level 4 */
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
-
-butterfly	9,10,6,4,3,5,8,11,2,2,7,7
-
-red16		9
-
-shuffle8	9,10,7,10
-shuffle8	6,4,9,4
-shuffle8	3,5,6,5
-shuffle8	8,11,3,11
-
-/* level 5 */
-vmovdqa		(_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
-vmovdqa		(_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
-
-butterfly	7,9,6,3,10,4,5,11,2,2,8,8
-
-vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
-vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
-vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
-vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
-vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
-vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
-vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
-vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
-.endm
-
-.macro intt_level6 off
-/* level 6 */
-vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
-vmovdqa         (64*\off+128)*2(%rdi),%ymm8
-vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
-vmovdqa         (64*\off+144)*2(%rdi),%ymm9
-vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm2
-
-vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
-vmovdqa         (64*\off+160)*2(%rdi),%ymm10
-vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
-vmovdqa         (64*\off+176)*2(%rdi),%ymm11
-vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm3
-
-butterfly	4,5,6,7,8,9,10,11
-
-.if \off == 0
-red16		4
-.endif
-
-vmovdqa		%ymm4,(64*\off+  0)*2(%rdi)
-vmovdqa		%ymm5,(64*\off+ 16)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+ 32)*2(%rdi)
-vmovdqa		%ymm7,(64*\off+ 48)*2(%rdi)
-vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
-vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
-.endm
-
-.text
-.global cdecl(invntt_avx)
-cdecl(invntt_avx):
-vmovdqa         _16XQ*2(%rsi),%ymm0
-
-intt_levels0t5	0
-intt_levels0t5	1
-
-intt_level6	0
-intt_level6	1
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.c
deleted file mode 100644
index 63abc1029c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "kem.h"
-#include "indcpa.h"
-#include "verify.h"
-#include "symmetric.h"
-#include "randombytes.h"
-/*************************************************
-* Name:        crypto_kem_keypair_derand
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*              - uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair_derand(uint8_t *pk,
-                              uint8_t *sk,
-                              const uint8_t *coins)
-{
-  indcpa_keypair_derand(pk, sk, coins);
-  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  /* Value z for pseudo-random output on reject */
-  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(uint8_t *pk,
-                       uint8_t *sk)
-{
-  uint8_t coins[2*KYBER_SYMBYTES];
-  randombytes(coins, 2*KYBER_SYMBYTES);
-  crypto_kem_keypair_derand(pk, sk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc_derand
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc_derand(uint8_t *ct,
-                          uint8_t *ss,
-                          const uint8_t *pk,
-                          const uint8_t *coins)
-{
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
-
-  memcpy(ss,kr,KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(uint8_t *ct,
-                   uint8_t *ss,
-                   const uint8_t *pk)
-{
-  uint8_t coins[KYBER_SYMBYTES];
-  randombytes(coins, KYBER_SYMBYTES);
-  crypto_kem_enc_derand(ct, ss, pk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *ct: pointer to input cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - const uint8_t *sk: pointer to input private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(uint8_t *ss,
-                   const uint8_t *ct,
-                   const uint8_t *sk)
-{
-  int fail;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
-  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
-
-  indcpa_dec(buf, ct, sk);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
-
-  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
-
-  /* Compute rejection key */
-  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
-
-  /* Copy true key to return buffer if fail is false */
-  cmov(ss,kr,KYBER_SYMBYTES,!fail);
-
-  return 0;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.h
deleted file mode 100644
index 234f11966b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/kem.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KEM_H
-#define KEM_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#if   (KYBER_K == 2)
-#define CRYPTO_ALGNAME "Kyber512"
-#elif (KYBER_K == 3)
-#define CRYPTO_ALGNAME "Kyber768"
-#elif (KYBER_K == 4)
-#define CRYPTO_ALGNAME "Kyber1024"
-#endif
-
-#define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
-int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-
-#define crypto_kem_keypair KYBER_NAMESPACE(keypair)
-int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-#define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
-int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-
-#define crypto_kem_enc KYBER_NAMESPACE(enc)
-int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-
-#define crypto_kem_dec KYBER_NAMESPACE(dec)
-int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.S
deleted file mode 100644
index 0ce7b41297..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.S
+++ /dev/null
@@ -1,189 +0,0 @@
-#include "consts.h"
-.include "shuffle.inc"
-
-.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
-vpmullw		%ymm\zl0,%ymm\rh0,%ymm12
-vpmullw		%ymm\zl0,%ymm\rh1,%ymm13
-
-vpmullw		%ymm\zl1,%ymm\rh2,%ymm14
-vpmullw		%ymm\zl1,%ymm\rh3,%ymm15
-
-vpmulhw		%ymm\zh0,%ymm\rh0,%ymm\rh0
-vpmulhw		%ymm\zh0,%ymm\rh1,%ymm\rh1
-
-vpmulhw		%ymm\zh1,%ymm\rh2,%ymm\rh2
-vpmulhw		%ymm\zh1,%ymm\rh3,%ymm\rh3
-.endm
-
-.macro reduce
-vpmulhw		%ymm0,%ymm12,%ymm12
-vpmulhw		%ymm0,%ymm13,%ymm13
-
-vpmulhw		%ymm0,%ymm14,%ymm14
-vpmulhw		%ymm0,%ymm15,%ymm15
-.endm
-
-.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
-vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rln
-vpsubw		%ymm\rh0,%ymm\rl0,%ymm\rh0
-vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl0
-
-vpsubw		%ymm\rh1,%ymm\rl1,%ymm\rh1
-vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl1
-vpsubw		%ymm\rh2,%ymm\rl2,%ymm\rh2
-
-vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl2
-vpsubw		%ymm\rh3,%ymm\rl3,%ymm\rh3
-
-vpsubw		%ymm12,%ymm\rln,%ymm\rln
-vpaddw		%ymm12,%ymm\rh0,%ymm\rh0
-vpsubw		%ymm13,%ymm\rl0,%ymm\rl0
-
-vpaddw		%ymm13,%ymm\rh1,%ymm\rh1
-vpsubw		%ymm14,%ymm\rl1,%ymm\rl1
-vpaddw		%ymm14,%ymm\rh2,%ymm\rh2
-
-vpsubw		%ymm15,%ymm\rl2,%ymm\rl2
-vpaddw		%ymm15,%ymm\rh3,%ymm\rh3
-.endm
-
-.macro level0 off
-vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm15
-vmovdqa		(64*\off+128)*2(%rdi),%ymm8
-vmovdqa		(64*\off+144)*2(%rdi),%ymm9
-vmovdqa		(64*\off+160)*2(%rdi),%ymm10
-vmovdqa		(64*\off+176)*2(%rdi),%ymm11
-vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm2
-
-mul		8,9,10,11
-
-vmovdqa		(64*\off+  0)*2(%rdi),%ymm4
-vmovdqa		(64*\off+ 16)*2(%rdi),%ymm5
-vmovdqa		(64*\off+ 32)*2(%rdi),%ymm6
-vmovdqa		(64*\off+ 48)*2(%rdi),%ymm7
-
-reduce
-update		3,4,5,6,7,8,9,10,11
-
-vmovdqa		%ymm3,(64*\off+  0)*2(%rdi)
-vmovdqa		%ymm4,(64*\off+ 16)*2(%rdi)
-vmovdqa		%ymm5,(64*\off+ 32)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+ 48)*2(%rdi)
-vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
-vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
-.endm
-
-.macro levels1t6 off
-/* level 1 */
-vmovdqa		(_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
-vmovdqa		(128*\off+ 64)*2(%rdi),%ymm8
-vmovdqa		(128*\off+ 80)*2(%rdi),%ymm9
-vmovdqa		(128*\off+ 96)*2(%rdi),%ymm10
-vmovdqa		(128*\off+112)*2(%rdi),%ymm11
-vmovdqa		(_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
-
-mul		8,9,10,11
-
-vmovdqa		(128*\off+  0)*2(%rdi),%ymm4
-vmovdqa	 	(128*\off+ 16)*2(%rdi),%ymm5
-vmovdqa		(128*\off+ 32)*2(%rdi),%ymm6
-vmovdqa		(128*\off+ 48)*2(%rdi),%ymm7
-
-reduce
-update		3,4,5,6,7,8,9,10,11
-
-/* level 2 */
-shuffle8	5,10,7,10
-shuffle8	6,11,5,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
-
-mul		7,10,5,11
-
-shuffle8	3,8,6,8
-shuffle8	4,9,3,9
-
-reduce
-update		4,6,8,3,9,7,10,5,11
-
-/* level 3 */
-shuffle4	8,5,9,5
-shuffle4	3,11,8,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
-
-mul		9,5,8,11
-
-shuffle4	4,7,3,7
-shuffle4	6,10,4,10
-
-reduce
-update		6,3,7,4,10,9,5,8,11
-
-/* level 4 */
-shuffle2	7,8,10,8
-shuffle2	4,11,7,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
-
-mul		10,8,7,11
-
-shuffle2	6,9,4,9
-shuffle2	3,5,6,5
-
-reduce
-update		3,4,9,6,5,10,8,7,11
-
-/* level 5 */
-shuffle1	9,7,5,7
-shuffle1	6,11,9,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
-
-mul		5,7,9,11
-
-shuffle1	3,10,6,10
-shuffle1	4,8,3,8
-
-reduce
-update		4,6,10,3,8,5,7,9,11
-
-/* level 6 */
-vmovdqa		(_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
-vmovdqa		(_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
-vmovdqa		(_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
-
-mul		10,3,9,11,14,15,8,2
-
-reduce
-update		8,4,6,5,7,10,3,9,11
-
-vmovdqa		%ymm8,(128*\off+  0)*2(%rdi)
-vmovdqa		%ymm4,(128*\off+ 16)*2(%rdi)
-vmovdqa		%ymm10,(128*\off+ 32)*2(%rdi)
-vmovdqa		%ymm3,(128*\off+ 48)*2(%rdi)
-vmovdqa		%ymm6,(128*\off+ 64)*2(%rdi)
-vmovdqa		%ymm5,(128*\off+ 80)*2(%rdi)
-vmovdqa		%ymm9,(128*\off+ 96)*2(%rdi)
-vmovdqa		%ymm11,(128*\off+112)*2(%rdi)
-.endm
-
-.text
-.global cdecl(ntt_avx)
-cdecl(ntt_avx):
-vmovdqa		_16XQ*2(%rsi),%ymm0
-
-level0		0
-level0		1
-
-levels1t6	0
-levels1t6	1
-
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.h
deleted file mode 100644
index a4f48e343b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/ntt.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include <immintrin.h>
-
-#define ntt_avx KYBER_NAMESPACE(ntt_avx)
-void ntt_avx(__m256i *r, const __m256i *qdata);
-#define invntt_avx KYBER_NAMESPACE(invntt_avx)
-void invntt_avx(__m256i *r, const __m256i *qdata);
-
-#define nttpack_avx KYBER_NAMESPACE(nttpack_avx)
-void nttpack_avx(__m256i *r, const __m256i *qdata);
-#define nttunpack_avx KYBER_NAMESPACE(nttunpack_avx)
-void nttunpack_avx(__m256i *r, const __m256i *qdata);
-
-#define basemul_avx KYBER_NAMESPACE(basemul_avx)
-void basemul_avx(__m256i *r,
-                 const __m256i *a,
-                 const __m256i *b,
-                 const __m256i *qdata);
-
-#define ntttobytes_avx KYBER_NAMESPACE(ntttobytes_avx)
-void ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *qdata);
-#define nttfrombytes_avx KYBER_NAMESPACE(nttfrombytes_avx)
-void nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *qdata);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/params.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/params.h
deleted file mode 100644
index ecfabce4a5..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/params.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#ifndef KYBER_K
-#define KYBER_K 3	/* Change this for different security strengths */
-#endif
-
-//#define KYBER_90S	/* Uncomment this if you want the 90S variant */
-
-/* Don't change parameters below this line */
-#if   (KYBER_K == 2)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber512_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_512_avx2_##s
-#endif
-#elif (KYBER_K == 3)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber768_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_768_avx2_##s
-#endif
-#elif (KYBER_K == 4)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber1024_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_1024_avx2_##s
-#endif
-#else
-#error "KYBER_K must be in {2,3,4}"
-#endif
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES		384
-#define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
-
-#if KYBER_K == 2
-#define KYBER_ETA1 3
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 3
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 4
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    160
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
-#endif
-
-#define KYBER_ETA2 2
-
-#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-/* 32 bytes of additional space to save H(pk) */
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
-#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.c
deleted file mode 100644
index 681fd6d23e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.c
+++ /dev/null
@@ -1,519 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "align.h"
-#include "fips202x4.h"
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "consts.h"
-#include "reduce.h"
-#include "cbd.h"
-#include "symmetric.h"
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Compression and subsequent serialization of a polynomial.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce().
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-void poly_compress(uint8_t r[128], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 9);
-  const __m256i mask = _mm256_set1_epi16(15);
-  const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
-  const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0);
-
-  for(i=0;i<KYBER_N/64;i++) {
-    f0 = _mm256_load_si256(&a->vec[4*i+0]);
-    f1 = _mm256_load_si256(&a->vec[4*i+1]);
-    f2 = _mm256_load_si256(&a->vec[4*i+2]);
-    f3 = _mm256_load_si256(&a->vec[4*i+3]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f2 = _mm256_mulhi_epi16(f2,v);
-    f3 = _mm256_mulhi_epi16(f3,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f2 = _mm256_mulhrs_epi16(f2,shift1);
-    f3 = _mm256_mulhrs_epi16(f3,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f2 = _mm256_and_si256(f2,mask);
-    f3 = _mm256_and_si256(f3,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f2 = _mm256_packus_epi16(f2,f3);
-    f0 = _mm256_maddubs_epi16(f0,shift2);
-    f2 = _mm256_maddubs_epi16(f2,shift2);
-    f0 = _mm256_packus_epi16(f0,f2);
-    f0 = _mm256_permutevar8x32_epi32(f0,permdidx);
-    _mm256_storeu_si256((__m256i *)&r[32*i],f0);
-  }
-}
-
-void poly_decompress(poly * restrict r, const uint8_t a[128])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4,
-                                           3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0);
-  const __m256i mask = _mm256_set1_epi32(0x00F0000F);
-  const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_loadl_epi64((__m128i *)&a[8*i]);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-void poly_compress(uint8_t r[160], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 10);
-  const __m256i mask = _mm256_set1_epi16(31);
-  const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
-  const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12);
-  const __m256i shufbidx = _mm256_set_epi8( 8,-1,-1,-1,-1,-1, 4, 3, 2, 1, 0,-1,12,11,10, 9,
-                                           -1,12,11,10, 9, 8,-1,-1,-1,-1,-1 ,4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/32;i++) {
-    f0 = _mm256_load_si256(&a->vec[2*i+0]);
-    f1 = _mm256_load_si256(&a->vec[2*i+1]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f0 = _mm256_maddubs_epi16(f0,shift2);	// a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
-    f0 = _mm256_madd_epi16(f0,shift3);		// a0 a1 b0 b1 a2 a3 b2 b3
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f0 = _mm256_srlv_epi64(f0,sllvdidx);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
-    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
-    memcpy(&r[20*i+16],&t1,4);
-  }
-}
-
-void poly_decompress(poly * restrict r, const uint8_t a[160])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  int16_t ti;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(9,9,9,8,8,8,8,7,7,6,6,6,6,5,5,5,
-                                           4,4,4,3,3,3,3,2,2,1,1,1,1,0,0,0);
-  const __m256i mask = _mm256_set_epi16(248,1984,62,496,3968,124,992,31,
-                                        248,1984,62,496,3968,124,992,31);
-  const __m256i shift = _mm256_set_epi16(128,16,512,64,8,256,32,1024,
-                                         128,16,512,64,8,256,32,1024);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_loadl_epi64((__m128i *)&a[10*i+0]);
-    memcpy(&ti,&a[10*i+8],2);
-    t = _mm_insert_epi16(t,ti,4);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#endif
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial in NTT representation.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce(). The coefficients are orderd as output by
-*              poly_ntt(); the serialized output coefficients are in bitreversed
-*              order.
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYBYTES bytes)
-*              - poly *a: pointer to input polynomial
-**************************************************/
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
-{
-  ntttobytes_avx(r, a->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
-{
-  nttfrombytes_avx(r->vec, a, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly * restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
-{
-#if (KYBER_INDCPA_MSGBYTES != 32)
-#error "KYBER_INDCPA_MSGBYTES must be equal to 32!"
-#endif
-  __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
-  const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3));
-  const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0));
-  const __m256i hqs = _mm256_set1_epi16((KYBER_Q+1)/2);
-
-#define FROMMSG64(i)						\
-  g3 = _mm256_shuffle_epi32(f,0x55*i);				\
-  g3 = _mm256_sllv_epi32(g3,shift);				\
-  g3 = _mm256_shuffle_epi8(g3,idx);				\
-  g0 = _mm256_slli_epi16(g3,12);				\
-  g1 = _mm256_slli_epi16(g3,8);					\
-  g2 = _mm256_slli_epi16(g3,4);					\
-  g0 = _mm256_srai_epi16(g0,15);				\
-  g1 = _mm256_srai_epi16(g1,15);				\
-  g2 = _mm256_srai_epi16(g2,15);				\
-  g3 = _mm256_srai_epi16(g3,15);				\
-  g0 = _mm256_and_si256(g0,hqs);  /* 19 18 17 16  3  2  1  0 */	\
-  g1 = _mm256_and_si256(g1,hqs);  /* 23 22 21 20  7  6  5  4 */	\
-  g2 = _mm256_and_si256(g2,hqs);  /* 27 26 25 24 11 10  9  8 */	\
-  g3 = _mm256_and_si256(g3,hqs);  /* 31 30 29 28 15 14 13 12 */	\
-  h0 = _mm256_unpacklo_epi64(g0,g1);				\
-  h2 = _mm256_unpackhi_epi64(g0,g1);				\
-  h1 = _mm256_unpacklo_epi64(g2,g3);				\
-  h3 = _mm256_unpackhi_epi64(g2,g3);				\
-  g0 = _mm256_permute2x128_si256(h0,h1,0x20);			\
-  g2 = _mm256_permute2x128_si256(h0,h1,0x31);			\
-  g1 = _mm256_permute2x128_si256(h2,h3,0x20);			\
-  g3 = _mm256_permute2x128_si256(h2,h3,0x31);			\
-  _mm256_store_si256(&r->vec[0+2*i+0],g0);	\
-  _mm256_store_si256(&r->vec[0+2*i+1],g1);	\
-  _mm256_store_si256(&r->vec[8+2*i+0],g2);	\
-  _mm256_store_si256(&r->vec[8+2*i+1],g3)
-
-  f = _mm256_loadu_si256((__m256i *)msg);
-  FROMMSG64(0);
-  FROMMSG64(1);
-  FROMMSG64(2);
-  FROMMSG64(3);
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce().
-*
-* Arguments:   - uint8_t *msg: pointer to output message
-*              - poly *a: pointer to input polynomial
-**************************************************/
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly * restrict a)
-{
-  unsigned int i;
-  uint32_t small;
-  __m256i f0, f1, g0, g1;
-  const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1)/2);
-  const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1)/4);
-
-  for(i=0;i<KYBER_N/32;i++) {
-    f0 = _mm256_load_si256(&a->vec[2*i+0]);
-    f1 = _mm256_load_si256(&a->vec[2*i+1]);
-    f0 = _mm256_sub_epi16(hq, f0);
-    f1 = _mm256_sub_epi16(hq, f1);
-    g0 = _mm256_srai_epi16(f0, 15);
-    g1 = _mm256_srai_epi16(f1, 15);
-    f0 = _mm256_xor_si256(f0, g0);
-    f1 = _mm256_xor_si256(f1, g1);
-    f0 = _mm256_sub_epi16(f0, hhq);
-    f1 = _mm256_sub_epi16(f1, hhq);
-    f0 = _mm256_packs_epi16(f0, f1);
-    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
-    small = _mm256_movemask_epi8(f0);
-    memcpy(&msg[4*i], &small, 4);
-  }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  ALIGNED_UINT8(KYBER_ETA1*KYBER_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1
-  prf(buf.coeffs, KYBER_ETA1*KYBER_N/4, seed, nonce);
-  poly_cbd_eta1(r, buf.vec);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  ALIGNED_UINT8(KYBER_ETA2*KYBER_N/4) buf;
-  prf(buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce);
-  poly_cbd_eta2(r, buf.vec);
-}
-
-#ifndef KYBER_90S
-#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
-void poly_getnoise_eta1_4x(poly *r0,
-                           poly *r1,
-                           poly *r2,
-                           poly *r3,
-                           const uint8_t seed[32],
-                           uint8_t nonce0,
-                           uint8_t nonce1,
-                           uint8_t nonce2,
-                           uint8_t nonce3)
-{
-  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
-  __m256i f;
-  shake256x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  buf[0].coeffs[32] = nonce0;
-  buf[1].coeffs[32] = nonce1;
-  buf[2].coeffs[32] = nonce2;
-  buf[3].coeffs[32] = nonce3;
-
-  shake256x4_inc_init(&state);
-  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
-  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
-  shake256x4_inc_ctx_release(&state);
-
-  poly_cbd_eta1(r0, buf[0].vec);
-  poly_cbd_eta1(r1, buf[1].vec);
-  poly_cbd_eta1(r2, buf[2].vec);
-  poly_cbd_eta1(r3, buf[3].vec);
-}
-
-#if KYBER_K == 2
-void poly_getnoise_eta1122_4x(poly *r0,
-                              poly *r1,
-                              poly *r2,
-                              poly *r3,
-                              const uint8_t seed[32],
-                              uint8_t nonce0,
-                              uint8_t nonce1,
-                              uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
-  __m256i f;
-  shake256x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  buf[0].coeffs[32] = nonce0;
-  buf[1].coeffs[32] = nonce1;
-  buf[2].coeffs[32] = nonce2;
-  buf[3].coeffs[32] = nonce3;
-
-  shake256x4_inc_init(&state);
-  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
-  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
-  shake256x4_inc_ctx_release(&state);
-
-  poly_cbd_eta1(r0, buf[0].vec);
-  poly_cbd_eta1(r1, buf[1].vec);
-  poly_cbd_eta2(r2, buf[2].vec);
-  poly_cbd_eta2(r3, buf[3].vec);
-}
-#endif
-#endif
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place.
-*              Input coefficients assumed to be in normal order,
-*              output coefficients are in special order that is natural
-*              for the vectorization. Input coefficients are assumed to be
-*              bounded by q in absolute value, output coefficients are bounded
-*              by 16118 in absolute value.
-*
-* Arguments:   - poly *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r)
-{
-  ntt_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
-*              of a polynomial in place;
-*              Input coefficients assumed to be in special order from vectorized
-*              forward ntt, output in normal order. Input coefficients can be
-*              arbitrary 16-bit integers, output coefficients are bounded by 14870
-*              in absolute value.
-*
-* Arguments:   - poly *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *r)
-{
-  invntt_avx(r->vec, qdata.vec);
-}
-
-void poly_nttunpack(poly *r)
-{
-  nttunpack_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_basemul_montgomery
-*
-* Description: Multiplication of two polynomials in NTT domain.
-*              One of the input polynomials needs to have coefficients
-*              bounded by q, the other polynomial can have arbitrary
-*              coefficients. Output coefficients are bounded by 6656.
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
-{
-  basemul_avx(r->vec, a->vec, b->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_tomont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from normal domain to Montgomery domain
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_tomont(poly *r)
-{
-  tomont_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r)
-{
-  reduce_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials. No modular reduction
-*              is performed.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  __m256i f0, f1;
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_load_si256(&b->vec[i]);
-    f0 = _mm256_add_epi16(f0, f1);
-    _mm256_store_si256(&r->vec[i], f0);
-  }
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials. No modular reduction
-*              is performed.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  __m256i f0, f1;
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_load_si256(&b->vec[i]);
-    f0 = _mm256_sub_epi16(f0, f1);
-    _mm256_store_si256(&r->vec[i], f0);
-  }
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.h
deleted file mode 100644
index 6a9cf71c70..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/poly.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "align.h"
-#include "params.h"
-
-typedef ALIGNED_INT16(KYBER_N) poly;
-
-#define poly_compress KYBER_NAMESPACE(poly_compress)
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
-#define poly_decompress KYBER_NAMESPACE(poly_decompress)
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
-
-#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
-#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
-
-#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
-#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
-
-#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#ifndef KYBER_90S
-#define poly_getnoise_eta1_4x KYBER_NAMESPACE(poly_getnoise_eta2_4x)
-void poly_getnoise_eta1_4x(poly *r0,
-                           poly *r1,
-                           poly *r2,
-                           poly *r3,
-                           const uint8_t seed[32],
-                           uint8_t nonce0,
-                           uint8_t nonce1,
-                           uint8_t nonce2,
-                           uint8_t nonce3);
-
-#if KYBER_K == 2
-#define poly_getnoise_eta1122_4x KYBER_NAMESPACE(poly_getnoise_eta1122_4x)
-void poly_getnoise_eta1122_4x(poly *r0,
-                              poly *r1,
-                              poly *r2,
-                              poly *r3,
-                              const uint8_t seed[32],
-                              uint8_t nonce0,
-                              uint8_t nonce1,
-                              uint8_t nonce2,
-                              uint8_t nonce3);
-#endif
-#endif
-
-
-#define poly_ntt KYBER_NAMESPACE(poly_ntt)
-void poly_ntt(poly *r);
-#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *r);
-#define poly_nttunpack KYBER_NAMESPACE(poly_nttunpack)
-void poly_nttunpack(poly *r);
-#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
-#define poly_tomont KYBER_NAMESPACE(poly_tomont)
-void poly_tomont(poly *r);
-
-#define poly_reduce KYBER_NAMESPACE(poly_reduce)
-void poly_reduce(poly *r);
-
-#define poly_add KYBER_NAMESPACE(poly_add)
-void poly_add(poly *r, const poly *a, const poly *b);
-#define poly_sub KYBER_NAMESPACE(poly_sub)
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.c
deleted file mode 100644
index a0174b7b3f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.c
+++ /dev/null
@@ -1,307 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "params.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "consts.h"
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-static void poly_compress10(uint8_t r[320], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i v8 = _mm256_slli_epi16(v,3);
-  const __m256i off = _mm256_set1_epi16(15);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 12);
-  const __m256i mask = _mm256_set1_epi16(1023);
-  const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12);
-  const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9,
-                                           -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_mullo_epi16(f0,v8);
-    f2 = _mm256_add_epi16(f0,off);
-    f0 = _mm256_slli_epi16(f0,3);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f2 = _mm256_sub_epi16(f1,f2);
-    f1 = _mm256_andnot_si256(f1,f2);
-    f1 = _mm256_srli_epi16(f1,15);
-    f0 = _mm256_sub_epi16(f0,f1);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f0 = _mm256_madd_epi16(f0,shift2);
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f0 = _mm256_srli_epi64(f0,12);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blend_epi16(t0,t1,0xE0);
-    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
-    memcpy(&r[20*i+16],&t1,4);
-  }
-}
-
-static void poly_decompress10(poly * restrict r, const uint8_t a[320+12])
-{
-  unsigned int i;
-  __m256i f;
-  const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4*KYBER_Q);
-  const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7,
-                                            6, 5, 5, 4, 4, 3, 3, 2,
-                                            9, 8, 8, 7, 7, 6, 6, 5,
-                                            4, 3, 3, 2, 2, 1, 1, 0);
-  const __m256i sllvdidx = _mm256_set1_epi64x(4);
-  const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[20*i]);
-    f = _mm256_permute4x64_epi64(f,0x94);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_sllv_epi32(f,sllvdidx);
-    f = _mm256_srli_epi16(f,1);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-static void poly_compress11(uint8_t r[352+2], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i v8 = _mm256_slli_epi16(v,3);
-  const __m256i off = _mm256_set1_epi16(36);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 13);
-  const __m256i mask = _mm256_set1_epi16(2047);
-  const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(10);
-  const __m256i srlvqidx = _mm256_set_epi64x(30,10,30,10);
-  const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0,-1,-1,-1,-1,10, 9, 8, 7, 6, 5,
-                                           -1,-1,-1,-1,-1,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_mullo_epi16(f0,v8);
-    f2 = _mm256_add_epi16(f0,off);
-    f0 = _mm256_slli_epi16(f0,3);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f2 = _mm256_sub_epi16(f1,f2);
-    f1 = _mm256_andnot_si256(f1,f2);
-    f1 = _mm256_srli_epi16(f1,15);
-    f0 = _mm256_sub_epi16(f0,f1);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f0 = _mm256_madd_epi16(f0,shift2);
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f1 = _mm256_bsrli_epi128(f0,8);
-    f0 = _mm256_srlv_epi64(f0,srlvqidx);
-    f1 = _mm256_slli_epi64(f1,34);
-    f0 = _mm256_add_epi64(f0,f1);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
-    _mm_storeu_si128((__m128i *)&r[22*i+ 0],t0);
-    _mm_storel_epi64((__m128i *)&r[22*i+16],t1);
-  }
-}
-
-static void poly_decompress11(poly * restrict r, const uint8_t a[352+10])
-{
-  unsigned int i;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(13,12,12,11,10, 9, 9, 8,
-                                            8, 7, 6, 5, 5, 4, 4, 3,
-                                           10, 9, 9, 8, 7, 6, 6, 5,
-                                            5, 4, 3, 2, 2, 1, 1, 0);
-  const __m256i srlvdidx = _mm256_set_epi32(0,0,1,0,0,0,1,0);
-  const __m256i srlvqidx = _mm256_set_epi64x(2,0,2,0);
-  const __m256i shift = _mm256_set_epi16(4,32,1,8,32,1,4,32,4,32,1,8,32,1,4,32);
-  const __m256i mask = _mm256_set1_epi16(32752);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[22*i]);
-    f = _mm256_permute4x64_epi64(f,0x94);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_srlv_epi32(f,srlvdidx);
-    f = _mm256_srlv_epi64(f,srlvqidx);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_srli_epi16(f,1);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#endif
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a)
-{
-  unsigned int i;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-    poly_compress10(&r[320*i],&a->vec[i]);
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-    poly_compress11(&r[352*i],&a->vec[i]);
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r: pointer to output vector of polynomials
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12])
-{
-  unsigned int i;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-    poly_decompress10(&r->vec[i],&a[320*i]);
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-    poly_decompress11(&r->vec[i],&a[352*i]);
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECBYTES)
-*              - polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials
-*                                  (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt_tomont
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*              and multiply by Montgomery factor 2^16
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt_tomont(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_basemul_acc_montgomery
-*
-* Description: Multiply elements in a and b in NTT domain, accumulate into r,
-*              and multiply by 2^-16.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  poly tmp;
-
-  poly_basemul_montgomery(r,&a->vec[0],&b->vec[0]);
-  for(i=1;i<KYBER_K;i++) {
-    poly_basemul_montgomery(&tmp,&a->vec[i],&b->vec[i]);
-    poly_add(r,r,&tmp);
-  }
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials;
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - polyvec *r: pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r:       pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.h
deleted file mode 100644
index 2ce23c31ff..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/polyvec.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-typedef struct{
-  poly vec[KYBER_K];
-} polyvec;
-
-#define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a);
-#define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12]);
-
-#define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
-#define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
-
-#define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
-void polyvec_ntt(polyvec *r);
-#define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
-void polyvec_invntt_tomont(polyvec *r);
-
-#define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
-
-#define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
-void polyvec_reduce(polyvec *r);
-
-#define polyvec_add KYBER_NAMESPACE(polyvec_add)
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/reduce.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/reduce.h
deleted file mode 100644
index 5368185b5f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/reduce.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include "params.h"
-#include <immintrin.h>
-
-#define reduce_avx KYBER_NAMESPACE(reduce_avx)
-void reduce_avx(__m256i *r, const __m256i *qdata);
-#define tomont_avx KYBER_NAMESPACE(tomont_avx)
-void tomont_avx(__m256i *r, const __m256i *qdata);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.c
deleted file mode 100644
index 9060a44cb9..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.c
+++ /dev/null
@@ -1,398 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "params.h"
-#include "consts.h"
-#include "rejsample.h"
-
-//#define BMI
-
-#ifndef BMI
-static const uint8_t idx[256][8] = {
-  {-1, -1, -1, -1, -1, -1, -1, -1},
-  { 0, -1, -1, -1, -1, -1, -1, -1},
-  { 2, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  2, -1, -1, -1, -1, -1, -1},
-  { 4, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  4, -1, -1, -1, -1, -1, -1},
-  { 2,  4, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  4, -1, -1, -1, -1, -1},
-  { 6, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  6, -1, -1, -1, -1, -1, -1},
-  { 2,  6, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  6, -1, -1, -1, -1, -1},
-  { 4,  6, -1, -1, -1, -1, -1, -1},
-  { 0,  4,  6, -1, -1, -1, -1, -1},
-  { 2,  4,  6, -1, -1, -1, -1, -1},
-  { 0,  2,  4,  6, -1, -1, -1, -1},
-  { 8, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  8, -1, -1, -1, -1, -1, -1},
-  { 2,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  8, -1, -1, -1, -1, -1},
-  { 4,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  4,  8, -1, -1, -1, -1, -1},
-  { 2,  4,  8, -1, -1, -1, -1, -1},
-  { 0,  2,  4,  8, -1, -1, -1, -1},
-  { 6,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  6,  8, -1, -1, -1, -1, -1},
-  { 2,  6,  8, -1, -1, -1, -1, -1},
-  { 0,  2,  6,  8, -1, -1, -1, -1},
-  { 4,  6,  8, -1, -1, -1, -1, -1},
-  { 0,  4,  6,  8, -1, -1, -1, -1},
-  { 2,  4,  6,  8, -1, -1, -1, -1},
-  { 0,  2,  4,  6,  8, -1, -1, -1},
-  {10, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 10, -1, -1, -1, -1, -1, -1},
-  { 2, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 10, -1, -1, -1, -1, -1},
-  { 4, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 10, -1, -1, -1, -1, -1},
-  { 2,  4, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 10, -1, -1, -1, -1},
-  { 6, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 10, -1, -1, -1, -1, -1},
-  { 2,  6, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 10, -1, -1, -1, -1},
-  { 4,  6, 10, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 10, -1, -1, -1, -1},
-  { 2,  4,  6, 10, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 10, -1, -1, -1},
-  { 8, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 10, -1, -1, -1, -1, -1},
-  { 2,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 10, -1, -1, -1, -1},
-  { 4,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 10, -1, -1, -1, -1},
-  { 2,  4,  8, 10, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 10, -1, -1, -1},
-  { 6,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 10, -1, -1, -1, -1},
-  { 2,  6,  8, 10, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 10, -1, -1, -1},
-  { 4,  6,  8, 10, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 10, -1, -1, -1},
-  { 2,  4,  6,  8, 10, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 10, -1, -1},
-  {12, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 12, -1, -1, -1, -1, -1, -1},
-  { 2, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 12, -1, -1, -1, -1, -1},
-  { 4, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 12, -1, -1, -1, -1, -1},
-  { 2,  4, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 12, -1, -1, -1, -1},
-  { 6, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 12, -1, -1, -1, -1, -1},
-  { 2,  6, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 12, -1, -1, -1, -1},
-  { 4,  6, 12, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 12, -1, -1, -1, -1},
-  { 2,  4,  6, 12, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 12, -1, -1, -1},
-  { 8, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 12, -1, -1, -1, -1, -1},
-  { 2,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 12, -1, -1, -1, -1},
-  { 4,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 12, -1, -1, -1, -1},
-  { 2,  4,  8, 12, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 12, -1, -1, -1},
-  { 6,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 12, -1, -1, -1, -1},
-  { 2,  6,  8, 12, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 12, -1, -1, -1},
-  { 4,  6,  8, 12, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 12, -1, -1, -1},
-  { 2,  4,  6,  8, 12, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 12, -1, -1},
-  {10, 12, -1, -1, -1, -1, -1, -1},
-  { 0, 10, 12, -1, -1, -1, -1, -1},
-  { 2, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  2, 10, 12, -1, -1, -1, -1},
-  { 4, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  4, 10, 12, -1, -1, -1, -1},
-  { 2,  4, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  4, 10, 12, -1, -1, -1},
-  { 6, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  6, 10, 12, -1, -1, -1, -1},
-  { 2,  6, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  6, 10, 12, -1, -1, -1},
-  { 4,  6, 10, 12, -1, -1, -1, -1},
-  { 0,  4,  6, 10, 12, -1, -1, -1},
-  { 2,  4,  6, 10, 12, -1, -1, -1},
-  { 0,  2,  4,  6, 10, 12, -1, -1},
-  { 8, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  8, 10, 12, -1, -1, -1, -1},
-  { 2,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  8, 10, 12, -1, -1, -1},
-  { 4,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  4,  8, 10, 12, -1, -1, -1},
-  { 2,  4,  8, 10, 12, -1, -1, -1},
-  { 0,  2,  4,  8, 10, 12, -1, -1},
-  { 6,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  6,  8, 10, 12, -1, -1, -1},
-  { 2,  6,  8, 10, 12, -1, -1, -1},
-  { 0,  2,  6,  8, 10, 12, -1, -1},
-  { 4,  6,  8, 10, 12, -1, -1, -1},
-  { 0,  4,  6,  8, 10, 12, -1, -1},
-  { 2,  4,  6,  8, 10, 12, -1, -1},
-  { 0,  2,  4,  6,  8, 10, 12, -1},
-  {14, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 14, -1, -1, -1, -1, -1, -1},
-  { 2, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 14, -1, -1, -1, -1, -1},
-  { 4, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 14, -1, -1, -1, -1, -1},
-  { 2,  4, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 14, -1, -1, -1, -1},
-  { 6, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 14, -1, -1, -1, -1, -1},
-  { 2,  6, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 14, -1, -1, -1, -1},
-  { 4,  6, 14, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 14, -1, -1, -1, -1},
-  { 2,  4,  6, 14, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 14, -1, -1, -1},
-  { 8, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 14, -1, -1, -1, -1, -1},
-  { 2,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 14, -1, -1, -1, -1},
-  { 4,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 14, -1, -1, -1, -1},
-  { 2,  4,  8, 14, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 14, -1, -1, -1},
-  { 6,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 14, -1, -1, -1, -1},
-  { 2,  6,  8, 14, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 14, -1, -1, -1},
-  { 4,  6,  8, 14, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 14, -1, -1, -1},
-  { 2,  4,  6,  8, 14, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 14, -1, -1},
-  {10, 14, -1, -1, -1, -1, -1, -1},
-  { 0, 10, 14, -1, -1, -1, -1, -1},
-  { 2, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  2, 10, 14, -1, -1, -1, -1},
-  { 4, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  4, 10, 14, -1, -1, -1, -1},
-  { 2,  4, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  4, 10, 14, -1, -1, -1},
-  { 6, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  6, 10, 14, -1, -1, -1, -1},
-  { 2,  6, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  6, 10, 14, -1, -1, -1},
-  { 4,  6, 10, 14, -1, -1, -1, -1},
-  { 0,  4,  6, 10, 14, -1, -1, -1},
-  { 2,  4,  6, 10, 14, -1, -1, -1},
-  { 0,  2,  4,  6, 10, 14, -1, -1},
-  { 8, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  8, 10, 14, -1, -1, -1, -1},
-  { 2,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  8, 10, 14, -1, -1, -1},
-  { 4,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  4,  8, 10, 14, -1, -1, -1},
-  { 2,  4,  8, 10, 14, -1, -1, -1},
-  { 0,  2,  4,  8, 10, 14, -1, -1},
-  { 6,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  6,  8, 10, 14, -1, -1, -1},
-  { 2,  6,  8, 10, 14, -1, -1, -1},
-  { 0,  2,  6,  8, 10, 14, -1, -1},
-  { 4,  6,  8, 10, 14, -1, -1, -1},
-  { 0,  4,  6,  8, 10, 14, -1, -1},
-  { 2,  4,  6,  8, 10, 14, -1, -1},
-  { 0,  2,  4,  6,  8, 10, 14, -1},
-  {12, 14, -1, -1, -1, -1, -1, -1},
-  { 0, 12, 14, -1, -1, -1, -1, -1},
-  { 2, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  2, 12, 14, -1, -1, -1, -1},
-  { 4, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  4, 12, 14, -1, -1, -1, -1},
-  { 2,  4, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  4, 12, 14, -1, -1, -1},
-  { 6, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  6, 12, 14, -1, -1, -1, -1},
-  { 2,  6, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  6, 12, 14, -1, -1, -1},
-  { 4,  6, 12, 14, -1, -1, -1, -1},
-  { 0,  4,  6, 12, 14, -1, -1, -1},
-  { 2,  4,  6, 12, 14, -1, -1, -1},
-  { 0,  2,  4,  6, 12, 14, -1, -1},
-  { 8, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  8, 12, 14, -1, -1, -1, -1},
-  { 2,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  8, 12, 14, -1, -1, -1},
-  { 4,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  4,  8, 12, 14, -1, -1, -1},
-  { 2,  4,  8, 12, 14, -1, -1, -1},
-  { 0,  2,  4,  8, 12, 14, -1, -1},
-  { 6,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  6,  8, 12, 14, -1, -1, -1},
-  { 2,  6,  8, 12, 14, -1, -1, -1},
-  { 0,  2,  6,  8, 12, 14, -1, -1},
-  { 4,  6,  8, 12, 14, -1, -1, -1},
-  { 0,  4,  6,  8, 12, 14, -1, -1},
-  { 2,  4,  6,  8, 12, 14, -1, -1},
-  { 0,  2,  4,  6,  8, 12, 14, -1},
-  {10, 12, 14, -1, -1, -1, -1, -1},
-  { 0, 10, 12, 14, -1, -1, -1, -1},
-  { 2, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  2, 10, 12, 14, -1, -1, -1},
-  { 4, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  4, 10, 12, 14, -1, -1, -1},
-  { 2,  4, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  4, 10, 12, 14, -1, -1},
-  { 6, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  6, 10, 12, 14, -1, -1, -1},
-  { 2,  6, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  6, 10, 12, 14, -1, -1},
-  { 4,  6, 10, 12, 14, -1, -1, -1},
-  { 0,  4,  6, 10, 12, 14, -1, -1},
-  { 2,  4,  6, 10, 12, 14, -1, -1},
-  { 0,  2,  4,  6, 10, 12, 14, -1},
-  { 8, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  8, 10, 12, 14, -1, -1, -1},
-  { 2,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  8, 10, 12, 14, -1, -1},
-  { 4,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  4,  8, 10, 12, 14, -1, -1},
-  { 2,  4,  8, 10, 12, 14, -1, -1},
-  { 0,  2,  4,  8, 10, 12, 14, -1},
-  { 6,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  6,  8, 10, 12, 14, -1, -1},
-  { 2,  6,  8, 10, 12, 14, -1, -1},
-  { 0,  2,  6,  8, 10, 12, 14, -1},
-  { 4,  6,  8, 10, 12, 14, -1, -1},
-  { 0,  4,  6,  8, 10, 12, 14, -1},
-  { 2,  4,  6,  8, 10, 12, 14, -1},
-  { 0,  2,  4,  6,  8, 10, 12, 14}
-};
-#endif
-
-#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
-#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
-
-unsigned int rej_uniform_avx(int16_t * restrict r, const uint8_t *buf)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-  uint32_t good;
-#ifdef BMI
-  uint64_t idx0, idx1, idx2, idx3;
-#endif
-  const __m256i bound  = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i ones   = _mm256_set1_epi8(1);
-  const __m256i mask  = _mm256_set1_epi16(0xFFF);
-  const __m256i idx8  = _mm256_set_epi8(15,14,14,13,12,11,11,10,
-                                         9, 8, 8, 7, 6, 5, 5, 4,
-                                        11,10,10, 9, 8, 7, 7, 6,
-                                         5, 4, 4, 3, 2, 1, 1, 0);
-  __m256i f0, f1, g0, g1, g2, g3;
-  __m128i f, t, pilo, pihi;
-
-  ctr = pos = 0;
-  while(ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 56) {
-    f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
-    f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]);
-    f0 = _mm256_permute4x64_epi64(f0, 0x94);
-    f1 = _mm256_permute4x64_epi64(f1, 0x94);
-    f0 = _mm256_shuffle_epi8(f0, idx8);
-    f1 = _mm256_shuffle_epi8(f1, idx8);
-    g0 = _mm256_srli_epi16(f0, 4);
-    g1 = _mm256_srli_epi16(f1, 4);
-    f0 = _mm256_blend_epi16(f0, g0, 0xAA);
-    f1 = _mm256_blend_epi16(f1, g1, 0xAA);
-    f0 = _mm256_and_si256(f0, mask);
-    f1 = _mm256_and_si256(f1, mask);
-    pos += 48;
-
-    g0 = _mm256_cmpgt_epi16(bound, f0);
-    g1 = _mm256_cmpgt_epi16(bound, f1);
-
-    g0 = _mm256_packs_epi16(g0, g1);
-    good = _mm256_movemask_epi8(g0);
-
-#ifdef BMI
-    idx0 = _pdep_u64(good >>  0, 0x0101010101010101);
-    idx1 = _pdep_u64(good >>  8, 0x0101010101010101);
-    idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
-    idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
-    idx0 = (idx0 << 8) - idx0;
-    idx0  = _pext_u64(0x0E0C0A0806040200, idx0);
-    idx1 = (idx1 << 8) - idx1;
-    idx1  = _pext_u64(0x0E0C0A0806040200, idx1);
-    idx2 = (idx2 << 8) - idx2;
-    idx2  = _pext_u64(0x0E0C0A0806040200, idx2);
-    idx3 = (idx3 << 8) - idx3;
-    idx3  = _pext_u64(0x0E0C0A0806040200, idx3);
-
-    g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
-    g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
-    g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
-    g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);
-#else
-    g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >>  0) & 0xFF]));
-    g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >>  8) & 0xFF]));
-    g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1);
-    g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1);
-#endif
-
-    g2 = _mm256_add_epi8(g0, ones);
-    g3 = _mm256_add_epi8(g1, ones);
-    g0 = _mm256_unpacklo_epi8(g0, g2);
-    g1 = _mm256_unpacklo_epi8(g1, g3);
-
-    f0 = _mm256_shuffle_epi8(f0, g0);
-    f1 = _mm256_shuffle_epi8(f1, g1);
-
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
-    ctr += _mm_popcnt_u32((good >>  0) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
-    ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
-    ctr += _mm_popcnt_u32((good >>  8) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
-    ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
-  }
-
-  while(ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 16) {
-    f = _mm_loadu_si128((__m128i *)&buf[pos]);
-    f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
-    t = _mm_srli_epi16(f, 4);
-    f = _mm_blend_epi16(f, t, 0xAA);
-    f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
-    pos += 12;
-
-    t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
-    good = _mm_movemask_epi8(t);
-
-#ifdef BMI
-    good &= 0x5555;
-    idx0 = _pdep_u64(good, 0x1111111111111111);
-    idx0 = (idx0 << 8) - idx0;
-    idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
-    pilo = _mm_cvtsi64_si128(idx0);
-#else
-    good = _pext_u32(good, 0x5555);
-    pilo = _mm_loadl_epi64((__m128i *)&idx[good]);
-#endif
-
-    pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
-    pilo = _mm_unpacklo_epi8(pilo, pihi);
-    f = _mm_shuffle_epi8(f, pilo);
-    _mm_storeu_si128((__m128i *)&r[ctr], f);
-    ctr += _mm_popcnt_u32(good);
-  }
-
-  while(ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) {
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4));
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(val1 < KYBER_Q && ctr < KYBER_N)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.h
deleted file mode 100644
index 3be5e2192e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/rejsample.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef REJSAMPLE_H
-#define REJSAMPLE_H
-
-#include <stdint.h>
-#include "params.h"
-#include "symmetric.h"
-
-#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
-#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES)
-
-#define rej_uniform_avx KYBER_NAMESPACE(rej_uniform_avx)
-unsigned int rej_uniform_avx(int16_t *r, const uint8_t *buf);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.S
deleted file mode 100644
index 18325ebec0..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.S
+++ /dev/null
@@ -1,255 +0,0 @@
-#include "consts.h"
-.include "fq.inc"
-.include "shuffle.inc"
-
-/*
-nttpack_avx:
-#load
-vmovdqa		(%rdi),%ymm4
-vmovdqa		32(%rdi),%ymm5
-vmovdqa		64(%rdi),%ymm6
-vmovdqa		96(%rdi),%ymm7
-vmovdqa		128(%rdi),%ymm8
-vmovdqa		160(%rdi),%ymm9
-vmovdqa		192(%rdi),%ymm10
-vmovdqa		224(%rdi),%ymm11
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-shuffle1	10,11,8,11
-
-shuffle2	3,4,10,4
-shuffle2	6,8,3,8
-shuffle2	5,7,6,7
-shuffle2	9,11,5,11
-
-shuffle4	10,3,9,3
-shuffle4	6,5,10,5
-shuffle4	4,8,6,8
-shuffle4	7,11,4,11
-
-shuffle8	9,10,7,10
-shuffle8	6,4,9,4
-shuffle8	3,5,6,5
-shuffle8	8,11,3,11
-
-#store
-vmovdqa		%ymm7,(%rdi)
-vmovdqa		%ymm9,32(%rdi)
-vmovdqa		%ymm6,64(%rdi)
-vmovdqa		%ymm3,96(%rdi)
-vmovdqa		%ymm10,128(%rdi)
-vmovdqa		%ymm4,160(%rdi)
-vmovdqa		%ymm5,192(%rdi)
-vmovdqa		%ymm11,224(%rdi)
-
-ret
-*/
-
-.text
-nttunpack128_avx:
-#load
-vmovdqa		(%rdi),%ymm4
-vmovdqa		32(%rdi),%ymm5
-vmovdqa		64(%rdi),%ymm6
-vmovdqa		96(%rdi),%ymm7
-vmovdqa		128(%rdi),%ymm8
-vmovdqa		160(%rdi),%ymm9
-vmovdqa		192(%rdi),%ymm10
-vmovdqa		224(%rdi),%ymm11
-
-shuffle8	4,8,3,8
-shuffle8	5,9,4,9
-shuffle8	6,10,5,10
-shuffle8	7,11,6,11
-
-shuffle4	3,5,7,5
-shuffle4	8,10,3,10
-shuffle4	4,6,8,6
-shuffle4	9,11,4,11
-
-shuffle2	7,8,9,8
-shuffle2	5,6,7,6
-shuffle2	3,4,5,4
-shuffle2	10,11,3,11
-
-shuffle1	9,5,10,5
-shuffle1	8,4,9,4
-shuffle1	7,3,8,3
-shuffle1	6,11,7,11
-
-#store
-vmovdqa		%ymm10,(%rdi)
-vmovdqa		%ymm5,32(%rdi)
-vmovdqa		%ymm9,64(%rdi)
-vmovdqa		%ymm4,96(%rdi)
-vmovdqa		%ymm8,128(%rdi)
-vmovdqa		%ymm3,160(%rdi)
-vmovdqa		%ymm7,192(%rdi)
-vmovdqa		%ymm11,224(%rdi)
-
-ret
-
-.global cdecl(nttunpack_avx)
-cdecl(nttunpack_avx):
-call		nttunpack128_avx
-add		$256,%rdi
-call		nttunpack128_avx
-ret
-
-ntttobytes128_avx:
-#load
-vmovdqa		(%rsi),%ymm5
-vmovdqa		32(%rsi),%ymm6
-vmovdqa		64(%rsi),%ymm7
-vmovdqa		96(%rsi),%ymm8
-vmovdqa		128(%rsi),%ymm9
-vmovdqa		160(%rsi),%ymm10
-vmovdqa		192(%rsi),%ymm11
-vmovdqa		224(%rsi),%ymm12
-
-#csubq
-csubq		5,13
-csubq		6,13
-csubq		7,13
-csubq		8,13
-csubq		9,13
-csubq		10,13
-csubq		11,13
-csubq		12,13
-
-#bitpack
-vpsllw		$12,%ymm6,%ymm4
-vpor		%ymm4,%ymm5,%ymm4
-
-vpsrlw		$4,%ymm6,%ymm5
-vpsllw		$8,%ymm7,%ymm6
-vpor		%ymm5,%ymm6,%ymm5
-
-vpsrlw		$8,%ymm7,%ymm6
-vpsllw		$4,%ymm8,%ymm7
-vpor		%ymm6,%ymm7,%ymm6
-
-vpsllw		$12,%ymm10,%ymm7
-vpor		%ymm7,%ymm9,%ymm7
-
-vpsrlw		$4,%ymm10,%ymm8
-vpsllw		$8,%ymm11,%ymm9
-vpor		%ymm8,%ymm9,%ymm8
-
-vpsrlw		$8,%ymm11,%ymm9
-vpsllw		$4,%ymm12,%ymm10
-vpor		%ymm9,%ymm10,%ymm9
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-
-shuffle2	3,4,8,4
-shuffle2	6,5,3,5
-shuffle2	7,9,6,9
-
-shuffle4	8,3,7,3
-shuffle4	6,4,8,4
-shuffle4	5,9,6,9
-
-shuffle8	7,8,5,8
-shuffle8	6,3,7,3
-shuffle8	4,9,6,9
-
-#store
-vmovdqu		%ymm5,(%rdi)
-vmovdqu		%ymm7,32(%rdi)
-vmovdqu		%ymm6,64(%rdi)
-vmovdqu		%ymm8,96(%rdi)
-vmovdqu		%ymm3,128(%rdi)
-vmovdqu		%ymm9,160(%rdi)
-
-ret
-
-.global cdecl(ntttobytes_avx)
-cdecl(ntttobytes_avx):
-#consts
-vmovdqa		_16XQ*2(%rdx),%ymm0
-call		ntttobytes128_avx
-add		$256,%rsi
-add		$192,%rdi
-call		ntttobytes128_avx
-ret
-
-nttfrombytes128_avx:
-#load
-vmovdqu		(%rsi),%ymm4
-vmovdqu		32(%rsi),%ymm5
-vmovdqu		64(%rsi),%ymm6
-vmovdqu		96(%rsi),%ymm7
-vmovdqu		128(%rsi),%ymm8
-vmovdqu		160(%rsi),%ymm9
-
-shuffle8	4,7,3,7
-shuffle8	5,8,4,8
-shuffle8	6,9,5,9
-
-shuffle4	3,8,6,8
-shuffle4	7,5,3,5
-shuffle4	4,9,7,9
-
-shuffle2	6,5,4,5
-shuffle2	8,7,6,7
-shuffle2	3,9,8,9
-
-shuffle1	4,7,10,7
-shuffle1	5,8,4,8
-shuffle1	6,9,5,9
-
-#bitunpack
-vpsrlw		$12,%ymm10,%ymm11
-vpsllw		$4,%ymm7,%ymm12
-vpor		%ymm11,%ymm12,%ymm11
-vpand		%ymm0,%ymm10,%ymm10
-vpand		%ymm0,%ymm11,%ymm11
-
-vpsrlw		$8,%ymm7,%ymm12
-vpsllw		$8,%ymm4,%ymm13
-vpor		%ymm12,%ymm13,%ymm12
-vpand		%ymm0,%ymm12,%ymm12
-
-vpsrlw		$4,%ymm4,%ymm13
-vpand		%ymm0,%ymm13,%ymm13
-
-vpsrlw		$12,%ymm8,%ymm14
-vpsllw		$4,%ymm5,%ymm15
-vpor		%ymm14,%ymm15,%ymm14
-vpand		%ymm0,%ymm8,%ymm8
-vpand		%ymm0,%ymm14,%ymm14
-
-vpsrlw		$8,%ymm5,%ymm15
-vpsllw		$8,%ymm9,%ymm1
-vpor		%ymm15,%ymm1,%ymm15
-vpand		%ymm0,%ymm15,%ymm15
-
-vpsrlw		$4,%ymm9,%ymm1
-vpand		%ymm0,%ymm1,%ymm1
-
-#store
-vmovdqa		%ymm10,(%rdi)
-vmovdqa		%ymm11,32(%rdi)
-vmovdqa		%ymm12,64(%rdi)
-vmovdqa		%ymm13,96(%rdi)
-vmovdqa		%ymm8,128(%rdi)
-vmovdqa		%ymm14,160(%rdi)
-vmovdqa		%ymm15,192(%rdi)
-vmovdqa		%ymm1,224(%rdi)
-
-ret
-
-.global cdecl(nttfrombytes_avx)
-cdecl(nttfrombytes_avx):
-#consts
-vmovdqa		_16XMASK*2(%rdx),%ymm0
-call		nttfrombytes128_avx
-add		$256,%rdi
-add		$192,%rsi
-call		nttfrombytes128_avx
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.inc b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.inc
deleted file mode 100644
index 73e9ffe03c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/shuffle.inc
+++ /dev/null
@@ -1,25 +0,0 @@
-.macro shuffle8 r0,r1,r2,r3
-vperm2i128	$0x20,%ymm\r1,%ymm\r0,%ymm\r2
-vperm2i128	$0x31,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle4 r0,r1,r2,r3
-vpunpcklqdq	%ymm\r1,%ymm\r0,%ymm\r2
-vpunpckhqdq	%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle2 r0,r1,r2,r3
-#vpsllq		$32,%ymm\r1,%ymm\r2
-vmovsldup	%ymm\r1,%ymm\r2
-vpblendd	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrlq		$32,%ymm\r0,%ymm\r0
-#vmovshdup	%ymm\r0,%ymm\r0
-vpblendd	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle1 r0,r1,r2,r3
-vpslld		$16,%ymm\r1,%ymm\r2
-vpblendw	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrld		$16,%ymm\r0,%ymm\r0
-vpblendw	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric-shake.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric-shake.c
deleted file mode 100644
index 20f451882e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric-shake.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
-*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
-*              - uint8_t i: additional byte of input
-*              - uint8_t j: additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128incctx *state,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y)
-{
-  uint8_t extseed[KYBER_SYMBYTES+2];
-
-  memcpy(extseed, seed, KYBER_SYMBYTES);
-  extseed[KYBER_SYMBYTES+0] = x;
-  extseed[KYBER_SYMBYTES+1] = y;
-
-  shake128_absorb_once(state, extseed, sizeof(extseed));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t extkey[KYBER_SYMBYTES+1];
-
-  memcpy(extkey, key, KYBER_SYMBYTES);
-  extkey[KYBER_SYMBYTES] = nonce;
-
-  shake256(out, outlen, extkey, sizeof(extkey));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
-{
-  shake256incctx s;
-
-  shake256_inc_init(&s);
-  shake256_inc_absorb(&s, key, KYBER_SYMBYTES);
-  shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
-  shake256_inc_finalize(&s);
-  shake256_inc_squeeze(out, KYBER_SSBYTES, &s);
-  shake256_inc_ctx_release(&s);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric.h
deleted file mode 100644
index e4941f7a86..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/symmetric.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#include "fips202.h"
-#include "fips202x4.h"
-
-typedef shake128incctx xof_state;
-
-#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(shake128incctx *s,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y);
-
-#define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
-
-#define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
-
-#define XOF_BLOCKBYTES SHAKE128_RATE
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.c
deleted file mode 100644
index 06243b837f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <stdlib.h>
-#include <stdint.h>
-#include <immintrin.h>
-#include "verify.h"
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const uint8_t *a: pointer to first byte array
-*              const uint8_t *b: pointer to second byte array
-*              size_t len: length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-int verify(const uint8_t *a, const uint8_t *b, size_t len)
-{
-  size_t i;
-  uint64_t r;
-  __m256i f, g, h;
-
-  h = _mm256_setzero_si256();
-  for(i=0;i<len/32;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[32*i]);
-    g = _mm256_loadu_si256((__m256i *)&b[32*i]);
-    f = _mm256_xor_si256(f,g);
-    h = _mm256_or_si256(h,f);
-  }
-  r = 1 - _mm256_testz_si256(h,h);
-
-  a += 32*i;
-  b += 32*i;
-  len -= 32*i;
-  for(i=0;i<len;i++)
-    r |= a[i] ^ b[i];
-
-  r = (-r) >> 63;
-  return r;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   uint8_t *r: pointer to output byte array
-*              const uint8_t *x: pointer to input byte array
-*              size_t len: Amount of bytes to be copied
-*              uint8_t b: Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(uint8_t * restrict r, const uint8_t *x, size_t len, uint8_t b)
-{
-  size_t i;
-  __m256i xvec, rvec, bvec;
-
-#if defined(__GNUC__) || defined(__clang__)
-  // Prevent the compiler from
-  //    1) inferring that b is 0/1-valued, and
-  //    2) handling the two cases with a branch.
-  // This is not necessary when verify.c and kem.c are separate translation
-  // units, but we expect that downstream consumers will copy this code and/or
-  // change how it is built.
-  __asm__("" : "+r"(b) : /* no inputs */);
-#endif
-
-  bvec = _mm256_set1_epi64x(-(uint64_t)b);
-  for(i=0;i<len/32;i++) {
-    rvec = _mm256_loadu_si256((__m256i *)&r[32*i]);
-    xvec = _mm256_loadu_si256((__m256i *)&x[32*i]);
-    rvec = _mm256_blendv_epi8(rvec,xvec,bvec);
-    _mm256_storeu_si256((__m256i *)&r[32*i],rvec);
-  }
-
-  r += 32*i;
-  x += 32*i;
-  len -= 32*i;
-  for(i=0;i<len;i++)
-    r[i] ^= -b & (x[i] ^ r[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.h
deleted file mode 100644
index 09f0ad5046..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_avx2/verify.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#define verify KYBER_NAMESPACE(verify)
-int verify(const uint8_t *a, const uint8_t *b, size_t len);
-
-#define cmov KYBER_NAMESPACE(cmov)
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
-
-#define cmov_int16 KYBER_NAMESPACE(cmov_int16)
-void cmov_int16(int16_t *r, int16_t v, uint16_t b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/LICENSE b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/LICENSE
deleted file mode 100644
index 7922ab8007..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/LICENSE
+++ /dev/null
@@ -1,6 +0,0 @@
-Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
-or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
-
-For Keccak and AES we are using public-domain
-code from sources and by authors listed in
-comments on top of the respective files.
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/api.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/api.h
deleted file mode 100644
index 70d40f3f3e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/api.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stdint.h>
-
-#define pqcrystals_kyber512_SECRETKEYBYTES 1632
-#define pqcrystals_kyber512_PUBLICKEYBYTES 800
-#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
-#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber512_ENCCOINBYTES 32
-#define pqcrystals_kyber512_BYTES 32
-
-#define pqcrystals_kyber512_ref_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
-#define pqcrystals_kyber512_ref_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
-#define pqcrystals_kyber512_ref_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
-#define pqcrystals_kyber512_ref_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
-#define pqcrystals_kyber512_ref_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
-#define pqcrystals_kyber512_ref_BYTES pqcrystals_kyber512_BYTES
-
-int pqcrystals_kyber512_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber512_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber512_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber512_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber512_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber768_SECRETKEYBYTES 2400
-#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
-#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
-#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber768_ENCCOINBYTES 32
-#define pqcrystals_kyber768_BYTES 32
-
-#define pqcrystals_kyber768_ref_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
-#define pqcrystals_kyber768_ref_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
-#define pqcrystals_kyber768_ref_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
-#define pqcrystals_kyber768_ref_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
-#define pqcrystals_kyber768_ref_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
-#define pqcrystals_kyber768_ref_BYTES pqcrystals_kyber768_BYTES
-
-int pqcrystals_kyber768_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber768_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber768_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber768_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber768_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
-#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
-#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
-#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber1024_ENCCOINBYTES 32
-#define pqcrystals_kyber1024_BYTES 32
-
-#define pqcrystals_kyber1024_ref_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
-#define pqcrystals_kyber1024_ref_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
-#define pqcrystals_kyber1024_ref_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
-#define pqcrystals_kyber1024_ref_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
-#define pqcrystals_kyber1024_ref_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
-#define pqcrystals_kyber1024_ref_BYTES pqcrystals_kyber1024_BYTES
-
-int pqcrystals_kyber1024_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber1024_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber1024_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber1024_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber1024_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.c
deleted file mode 100644
index 1500ffea56..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.c
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "cbd.h"
-
-/*************************************************
-* Name:        load32_littleendian
-*
-* Description: load 4 bytes into a 32-bit integer
-*              in little-endian order
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x
-**************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-/*************************************************
-* Name:        load24_littleendian
-*
-* Description: load 3 bytes into a 32-bit integer
-*              in little-endian order.
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
-**************************************************/
-#if KYBER_ETA1 == 3
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif
-
-
-/*************************************************
-* Name:        cbd2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *buf: pointer to input byte array
-**************************************************/
-static void cbd2(poly *r, const uint8_t buf[2*KYBER_N/4])
-{
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/8;i++) {
-    t  = load32_littleendian(buf+4*i);
-    d  = t & 0x55555555;
-    d += (t>>1) & 0x55555555;
-
-    for(j=0;j<8;j++) {
-      a = (d >> (4*j+0)) & 0x3;
-      b = (d >> (4*j+2)) & 0x3;
-      r->coeffs[8*i+j] = a - b;
-    }
-  }
-}
-
-/*************************************************
-* Name:        cbd3
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=3.
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *buf: pointer to input byte array
-**************************************************/
-#if KYBER_ETA1 == 3
-static void cbd3(poly *r, const uint8_t buf[3*KYBER_N/4])
-{
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/4;i++) {
-    t  = load24_littleendian(buf+3*i);
-    d  = t & 0x00249249;
-    d += (t>>1) & 0x00249249;
-    d += (t>>2) & 0x00249249;
-
-    for(j=0;j<4;j++) {
-      a = (d >> (6*j+0)) & 0x7;
-      b = (d >> (6*j+3)) & 0x7;
-      r->coeffs[4*i+j] = a - b;
-    }
-  }
-}
-#endif
-
-void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4])
-{
-#if KYBER_ETA1 == 2
-  cbd2(r, buf);
-#elif KYBER_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
-
-void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4])
-{
-#if KYBER_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.h
deleted file mode 100644
index 7b677d745d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/cbd.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-#define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
-void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4]);
-
-#define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
-void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.c
deleted file mode 100644
index 726cfa985d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.c
+++ /dev/null
@@ -1,334 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "indcpa.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "symmetric.h"
-#include "randombytes.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Serialize the public key as concatenation of the
-*              serialized vector of polynomials pk
-*              and the public seed used to generate the matrix A.
-*
-* Arguments:   uint8_t *r: pointer to the output serialized public key
-*              polyvec *pk: pointer to the input public-key polyvec
-*              const uint8_t *seed: pointer to the input public seed
-**************************************************/
-static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
-                    polyvec *pk,
-                    const uint8_t seed[KYBER_SYMBYTES])
-{
-  polyvec_tobytes(r, pk);
-  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: De-serialize public key from a byte array;
-*              approximate inverse of pack_pk
-*
-* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
-*              - uint8_t *seed: pointer to output seed to generate matrix A
-*              - const uint8_t *packedpk: pointer to input serialized public key
-**************************************************/
-static void unpack_pk(polyvec *pk,
-                      uint8_t seed[KYBER_SYMBYTES],
-                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
-{
-  polyvec_frombytes(pk, packedpk);
-  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Serialize the secret key
-*
-* Arguments:   - uint8_t *r: pointer to output serialized secret key
-*              - polyvec *sk: pointer to input vector of polynomials (secret key)
-**************************************************/
-static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
-{
-  polyvec_tobytes(r, sk);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: De-serialize the secret key; inverse of pack_sk
-*
-* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
-*              - const uint8_t *packedsk: pointer to input serialized secret key
-**************************************************/
-static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec_frombytes(sk, packedsk);
-}
-
-/*************************************************
-* Name:        pack_ciphertext
-*
-* Description: Serialize the ciphertext as concatenation of the
-*              compressed and serialized vector of polynomials b
-*              and the compressed and serialized polynomial v
-*
-* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
-*              poly *pk: pointer to the input vector of polynomials b
-*              poly *v: pointer to the input polynomial v
-**************************************************/
-static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
-{
-  polyvec_compress(r, b);
-  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        unpack_ciphertext
-*
-* Description: De-serialize and decompress ciphertext from a byte array;
-*              approximate inverse of pack_ciphertext
-*
-* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
-*              - poly *v: pointer to the output polynomial v
-*              - const uint8_t *c: pointer to the input serialized ciphertext
-**************************************************/
-static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
-{
-  polyvec_decompress(b, c);
-  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-}
-
-/*************************************************
-* Name:        rej_uniform
-*
-* Description: Run rejection sampling on uniform random bytes to generate
-*              uniform random integers mod q
-*
-* Arguments:   - int16_t *r: pointer to output buffer
-*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
-*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
-*              - unsigned int buflen: length of input buffer in bytes
-*
-* Returns number of sampled 16-bit integers (at most len)
-**************************************************/
-static unsigned int rej_uniform(int16_t *r,
-                                unsigned int len,
-                                const uint8_t *buf,
-                                unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos + 3 <= buflen) {
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(ctr < len && val1 < KYBER_Q)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
-
-#define gen_a(A,B)  gen_matrix(A,B,0)
-#define gen_at(A,B) gen_matrix(A,B,1)
-
-/*************************************************
-* Name:        gen_matrix
-*
-* Description: Deterministically generate matrix A (or the transpose of A)
-*              from a seed. Entries of the matrix are polynomials that look
-*              uniformly random. Performs rejection sampling on output of
-*              a XOF
-*
-* Arguments:   - polyvec *a: pointer to ouptput matrix A
-*              - const uint8_t *seed: pointer to input seed
-*              - int transposed: boolean deciding whether A or A^T is generated
-**************************************************/
-#if(XOF_BLOCKBYTES % 3)
-#error "Implementation of gen_matrix assumes that XOF_BLOCKBYTES is a multiple of 3"
-#endif
-
-#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
-// Not static for benchmarking
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed)
-{
-  unsigned int ctr, i, j;
-  unsigned int buflen;
-  uint8_t buf[GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES];
-  xof_state state;
-  xof_init(&state, seed);
-
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_K;j++) {
-      if(transposed)
-        xof_absorb(&state, seed, i, j);
-      else
-        xof_absorb(&state, seed, j, i);
-
-      xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
-      buflen = GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES;
-      ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen);
-
-      while(ctr < KYBER_N) {
-        xof_squeezeblocks(buf, 1, &state);
-        buflen = XOF_BLOCKBYTES;
-        ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen);
-      }
-    }
-  }
-  xof_release(&state);
-}
-
-/*************************************************
-* Name:        indcpa_keypair_derand
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                             (of length KYBER_SYMBYTES bytes)
-**************************************************/
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  const uint8_t *publicseed = buf;
-  const uint8_t *noiseseed = buf+KYBER_SYMBYTES;
-  uint8_t nonce = 0;
-  polyvec a[KYBER_K], e, pkpv, skpv;
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-  buf[KYBER_SYMBYTES] = KYBER_K;
-  hash_g(buf, buf, KYBER_SYMBYTES+1);
-
-  gen_a(a, publicseed);
-
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
-
-  polyvec_ntt(&skpv);
-  polyvec_ntt(&e);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++) {
-    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
-    poly_tomont(&pkpv.vec[i]);
-  }
-
-  polyvec_add(&pkpv, &pkpv, &e);
-  polyvec_reduce(&pkpv);
-
-  pack_sk(sk, &skpv);
-  pack_pk(pk, &pkpv, publicseed);
-}
-
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *c: pointer to output ciphertext
-*                            (of length KYBER_INDCPA_BYTES bytes)
-*              - const uint8_t *m: pointer to input message
-*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
-*              - const uint8_t *coins: pointer to input random coins used as seed
-*                                      (of length KYBER_SYMBYTES) to deterministically
-*                                      generate all randomness
-**************************************************/
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t seed[KYBER_SYMBYTES];
-  uint8_t nonce = 0;
-  polyvec sp, pkpv, ep, at[KYBER_K], b;
-  poly v, k, epp;
-
-  unpack_pk(&pkpv, seed, pk);
-  poly_frommsg(&k, m);
-  gen_at(at, seed);
-
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(sp.vec+i, coins, nonce++);
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta2(ep.vec+i, coins, nonce++);
-  poly_getnoise_eta2(&epp, coins, nonce++);
-
-  polyvec_ntt(&sp);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++)
-    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
-
-  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
-
-  polyvec_invntt_tomont(&b);
-  poly_invntt_tomont(&v);
-
-  polyvec_add(&b, &b, &ep);
-  poly_add(&v, &v, &epp);
-  poly_add(&v, &v, &k);
-  polyvec_reduce(&b);
-  poly_reduce(&v);
-
-  pack_ciphertext(c, &b, &v);
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *m: pointer to output decrypted message
-*                            (of length KYBER_INDCPA_MSGBYTES)
-*              - const uint8_t *c: pointer to input ciphertext
-*                                  (of length KYBER_INDCPA_BYTES)
-*              - const uint8_t *sk: pointer to input secret key
-*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec b, skpv;
-  poly v, mp;
-
-  unpack_ciphertext(&b, &v, c);
-  unpack_sk(&skpv, sk);
-
-  polyvec_ntt(&b);
-  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
-  poly_invntt_tomont(&mp);
-
-  poly_sub(&mp, &v, &mp);
-  poly_reduce(&mp);
-
-  poly_tomsg(m, &mp);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.h
deleted file mode 100644
index 6dd5088c85..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/indcpa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define gen_matrix KYBER_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
-
-#define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.c
deleted file mode 100644
index 63abc1029c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "kem.h"
-#include "indcpa.h"
-#include "verify.h"
-#include "symmetric.h"
-#include "randombytes.h"
-/*************************************************
-* Name:        crypto_kem_keypair_derand
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*              - uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair_derand(uint8_t *pk,
-                              uint8_t *sk,
-                              const uint8_t *coins)
-{
-  indcpa_keypair_derand(pk, sk, coins);
-  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  /* Value z for pseudo-random output on reject */
-  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(uint8_t *pk,
-                       uint8_t *sk)
-{
-  uint8_t coins[2*KYBER_SYMBYTES];
-  randombytes(coins, 2*KYBER_SYMBYTES);
-  crypto_kem_keypair_derand(pk, sk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc_derand
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc_derand(uint8_t *ct,
-                          uint8_t *ss,
-                          const uint8_t *pk,
-                          const uint8_t *coins)
-{
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
-
-  memcpy(ss,kr,KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(uint8_t *ct,
-                   uint8_t *ss,
-                   const uint8_t *pk)
-{
-  uint8_t coins[KYBER_SYMBYTES];
-  randombytes(coins, KYBER_SYMBYTES);
-  crypto_kem_enc_derand(ct, ss, pk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *ct: pointer to input cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - const uint8_t *sk: pointer to input private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(uint8_t *ss,
-                   const uint8_t *ct,
-                   const uint8_t *sk)
-{
-  int fail;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
-  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
-
-  indcpa_dec(buf, ct, sk);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
-
-  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
-
-  /* Compute rejection key */
-  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
-
-  /* Copy true key to return buffer if fail is false */
-  cmov(ss,kr,KYBER_SYMBYTES,!fail);
-
-  return 0;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.h
deleted file mode 100644
index 234f11966b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/kem.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KEM_H
-#define KEM_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#if   (KYBER_K == 2)
-#define CRYPTO_ALGNAME "Kyber512"
-#elif (KYBER_K == 3)
-#define CRYPTO_ALGNAME "Kyber768"
-#elif (KYBER_K == 4)
-#define CRYPTO_ALGNAME "Kyber1024"
-#endif
-
-#define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
-int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-
-#define crypto_kem_keypair KYBER_NAMESPACE(keypair)
-int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-#define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
-int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-
-#define crypto_kem_enc KYBER_NAMESPACE(enc)
-int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-
-#define crypto_kem_dec KYBER_NAMESPACE(dec)
-int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.c
deleted file mode 100644
index 2f2eb10b2f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.c
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
-
-#define KYBER_ROOT_OF_UNITY 17
-
-static const uint8_t tree[128] = {
-  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
-  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
-  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
-  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
-  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
-  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
-  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
-  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
-};
-
-void init_ntt() {
-  unsigned int i;
-  int16_t tmp[128];
-
-  tmp[0] = MONT;
-  for(i=1;i<128;i++)
-    tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);
-
-  for(i=0;i<128;i++) {
-    zetas[i] = tmp[tree[i]];
-    if(zetas[i] > KYBER_Q/2)
-      zetas[i] -= KYBER_Q;
-    if(zetas[i] < -KYBER_Q/2)
-      zetas[i] += KYBER_Q;
-  }
-}
-*/
-
-const int16_t zetas[128] = {
-  -1044,  -758,  -359, -1517,  1493,  1422,   287,   202,
-   -171,   622,  1577,   182,   962, -1202, -1474,  1468,
-    573, -1325,   264,   383,  -829,  1458, -1602,  -130,
-   -681,  1017,   732,   608, -1542,   411,  -205, -1571,
-   1223,   652,  -552,  1015, -1293,  1491,  -282, -1544,
-    516,    -8,  -320,  -666, -1618, -1162,   126,  1469,
-   -853,   -90,  -271,   830,   107, -1421,  -247,  -951,
-   -398,   961, -1508,  -725,   448, -1065,   677, -1275,
-  -1103,   430,   555,   843, -1251,   871,  1550,   105,
-    422,   587,   177,  -235,  -291,  -460,  1574,  1653,
-   -246,   778,  1159,  -147,  -777,  1483,  -602,  1119,
-  -1590,   644,  -872,   349,   418,   329,  -156,   -75,
-    817,  1097,   603,   610,  1322, -1285, -1465,   384,
-  -1215,  -136,  1218, -1335,  -874,   220, -1187, -1659,
-  -1185, -1530, -1278,   794, -1510,  -854,  -870,   478,
-   -108,  -308,   996,   991,   958, -1460,  1522,  1628
-};
-
-/*************************************************
-* Name:        fqmul
-*
-* Description: Multiplication followed by Montgomery reduction
-*
-* Arguments:   - int16_t a: first factor
-*              - int16_t b: second factor
-*
-* Returns 16-bit integer congruent to a*b*R^{-1} mod q
-**************************************************/
-static int16_t fqmul(int16_t a, int16_t b) {
-  return montgomery_reduce((int32_t)a*b);
-}
-
-/*************************************************
-* Name:        ntt
-*
-* Description: Inplace number-theoretic transform (NTT) in Rq.
-*              input is in standard order, output is in bitreversed order
-*
-* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
-**************************************************/
-void ntt(int16_t r[256]) {
-  unsigned int len, start, j, k;
-  int16_t t, zeta;
-
-  k = 1;
-  for(len = 128; len >= 2; len >>= 1) {
-    for(start = 0; start < 256; start = j + len) {
-      zeta = zetas[k++];
-      for(j = start; j < start + len; j++) {
-        t = fqmul(zeta, r[j + len]);
-        r[j + len] = r[j] - t;
-        r[j] = r[j] + t;
-      }
-    }
-  }
-}
-
-/*************************************************
-* Name:        invntt_tomont
-*
-* Description: Inplace inverse number-theoretic transform in Rq and
-*              multiplication by Montgomery factor 2^16.
-*              Input is in bitreversed order, output is in standard order
-*
-* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
-**************************************************/
-void invntt(int16_t r[256]) {
-  unsigned int start, len, j, k;
-  int16_t t, zeta;
-  const int16_t f = 1441; // mont^2/128
-
-  k = 127;
-  for(len = 2; len <= 128; len <<= 1) {
-    for(start = 0; start < 256; start = j + len) {
-      zeta = zetas[k--];
-      for(j = start; j < start + len; j++) {
-        t = r[j];
-        r[j] = barrett_reduce(t + r[j + len]);
-        r[j + len] = r[j + len] - t;
-        r[j + len] = fqmul(zeta, r[j + len]);
-      }
-    }
-  }
-
-  for(j = 0; j < 256; j++)
-    r[j] = fqmul(r[j], f);
-}
-
-/*************************************************
-* Name:        basemul
-*
-* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
-*              used for multiplication of elements in Rq in NTT domain
-*
-* Arguments:   - int16_t r[2]: pointer to the output polynomial
-*              - const int16_t a[2]: pointer to the first factor
-*              - const int16_t b[2]: pointer to the second factor
-*              - int16_t zeta: integer defining the reduction polynomial
-**************************************************/
-void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
-{
-  r[0]  = fqmul(a[1], b[1]);
-  r[0]  = fqmul(r[0], zeta);
-  r[0] += fqmul(a[0], b[0]);
-  r[1]  = fqmul(a[0], b[1]);
-  r[1] += fqmul(a[1], b[0]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.h
deleted file mode 100644
index 227ea74f08..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/ntt.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define zetas KYBER_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define ntt KYBER_NAMESPACE(ntt)
-void ntt(int16_t poly[256]);
-
-#define invntt KYBER_NAMESPACE(invntt)
-void invntt(int16_t poly[256]);
-
-#define basemul KYBER_NAMESPACE(basemul)
-void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/params.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/params.h
deleted file mode 100644
index fb4190b311..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/params.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#ifndef KYBER_K
-#define KYBER_K 3	/* Change this for different security strengths */
-#endif
-
-
-/* Don't change parameters below this line */
-#if   (KYBER_K == 2)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_512_ref_##s
-#elif (KYBER_K == 3)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_768_ref_##s
-#elif (KYBER_K == 4)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_1024_ref_##s
-#else
-#error "KYBER_K must be in {2,3,4}"
-#endif
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES		384
-#define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
-
-#if KYBER_K == 2
-#define KYBER_ETA1 3
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 3
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 4
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    160
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
-#endif
-
-#define KYBER_ETA2 2
-
-#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-/* 32 bytes of additional space to save H(pk) */
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
-#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.c
deleted file mode 100644
index cbd3abfb54..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "reduce.h"
-#include "cbd.h"
-#include "symmetric.h"
-#include "verify.h"
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Compression and subsequent serialization of a polynomial
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
-{
-  unsigned int i,j;
-  int16_t u;
-  uint32_t d0;
-  uint8_t t[8];
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      // map to positive standard representatives
-      u  = a->coeffs[8*i+j];
-      u += (u >> 15) & KYBER_Q;
-/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
-      d0 = u << 4;
-      d0 += 1665;
-      d0 *= 80635;
-      d0 >>= 28;
-      t[j] = d0 & 0xf;
-    }
-
-    r[0] = t[0] | (t[1] << 4);
-    r[1] = t[2] | (t[3] << 4);
-    r[2] = t[4] | (t[5] << 4);
-    r[3] = t[6] | (t[7] << 4);
-    r += 4;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      // map to positive standard representatives
-      u  = a->coeffs[8*i+j];
-      u += (u >> 15) & KYBER_Q;
-/*    t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
-      d0 = u << 5;
-      d0 += 1664;
-      d0 *= 40318;
-      d0 >>= 27;
-      t[j] = d0 & 0x1f;
-    }
-
-    r[0] = (t[0] >> 0) | (t[1] << 5);
-    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
-    r[2] = (t[3] >> 1) | (t[4] << 4);
-    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
-    r[4] = (t[6] >> 2) | (t[7] << 3);
-    r += 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_decompress
-*
-* Description: De-serialization and subsequent decompression of a polynomial;
-*              approximate inverse of poly_compress
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
-**************************************************/
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES])
-{
-  unsigned int i;
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-  for(i=0;i<KYBER_N/2;i++) {
-    r->coeffs[2*i+0] = (((uint16_t)(a[0] & 15)*KYBER_Q) + 8) >> 4;
-    r->coeffs[2*i+1] = (((uint16_t)(a[0] >> 4)*KYBER_Q) + 8) >> 4;
-    a += 1;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  unsigned int j;
-  uint8_t t[8];
-  for(i=0;i<KYBER_N/8;i++) {
-    t[0] = (a[0] >> 0);
-    t[1] = (a[0] >> 5) | (a[1] << 3);
-    t[2] = (a[1] >> 2);
-    t[3] = (a[1] >> 7) | (a[2] << 1);
-    t[4] = (a[2] >> 4) | (a[3] << 4);
-    t[5] = (a[3] >> 1);
-    t[6] = (a[3] >> 6) | (a[4] << 2);
-    t[7] = (a[4] >> 3);
-    a += 5;
-
-    for(j=0;j<8;j++)
-      r->coeffs[8*i+j] = ((uint32_t)(t[j] & 31)*KYBER_Q + 16) >> 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYBYTES bytes)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
-{
-  unsigned int i;
-  uint16_t t0, t1;
-
-  for(i=0;i<KYBER_N/2;i++) {
-    // map to positive standard representatives
-    t0  = a->coeffs[2*i];
-    t0 += ((int16_t)t0 >> 15) & KYBER_Q;
-    t1 = a->coeffs[2*i+1];
-    t1 += ((int16_t)t1 >> 15) & KYBER_Q;
-    r[3*i+0] = (t0 >> 0);
-    r[3*i+1] = (t0 >> 8) | (t1 << 4);
-    r[3*i+2] = (t1 >> 4);
-  }
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N/2;i++) {
-    r->coeffs[2*i]   = ((a[3*i+0] >> 0) | ((uint16_t)a[3*i+1] << 8)) & 0xFFF;
-    r->coeffs[2*i+1] = ((a[3*i+1] >> 4) | ((uint16_t)a[3*i+2] << 4)) & 0xFFF;
-  }
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
-{
-  unsigned int i,j;
-
-#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
-#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
-#endif
-
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      r->coeffs[8*i+j] = 0;
-      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1);
-    }
-  }
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message
-*
-* Arguments:   - uint8_t *msg: pointer to output message
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned int i,j;
-  uint32_t t;
-
-  for(i=0;i<KYBER_N/8;i++) {
-    msg[i] = 0;
-    for(j=0;j<8;j++) {
-      t  = a->coeffs[8*i+j];
-      // t += ((int16_t)t >> 15) & KYBER_Q;
-      // t  = (((t << 1) + KYBER_Q/2)/KYBER_Q) & 1;
-      t <<= 1;
-      t += 1665;
-      t *= 80635;
-      t >>= 28;
-      t &= 1;
-      msg[i] |= t << j;
-    }
-  }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t buf[KYBER_ETA1*KYBER_N/4];
-  prf(buf, sizeof(buf), seed, nonce);
-  poly_cbd_eta1(r, buf);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t buf[KYBER_ETA2*KYBER_N/4];
-  prf(buf, sizeof(buf), seed, nonce);
-  poly_cbd_eta2(r, buf);
-}
-
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place;
-*              inputs assumed to be in normal order, output in bitreversed order
-*
-* Arguments:   - uint16_t *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r)
-{
-  ntt(r->coeffs);
-  poly_reduce(r);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
-*              of a polynomial in place;
-*              inputs assumed to be in bitreversed order, output in normal order
-*
-* Arguments:   - uint16_t *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *r)
-{
-  invntt(r->coeffs);
-}
-
-/*************************************************
-* Name:        poly_basemul_montgomery
-*
-* Description: Multiplication of two polynomials in NTT domain
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N/4;i++) {
-    basemul(&r->coeffs[4*i], &a->coeffs[4*i], &b->coeffs[4*i], zetas[64+i]);
-    basemul(&r->coeffs[4*i+2], &a->coeffs[4*i+2], &b->coeffs[4*i+2], -zetas[64+i]);
-  }
-}
-
-/*************************************************
-* Name:        poly_tomont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from normal domain to Montgomery domain
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_tomont(poly *r)
-{
-  unsigned int i;
-  const int16_t f = (1ULL << 32) % KYBER_Q;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i]*f);
-}
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = barrett_reduce(r->coeffs[i]);
-}
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials; no modular reduction is performed
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials; no modular reduction is performed
-*
-* Arguments: - poly *r:       pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.h
deleted file mode 100644
index 9a99c7cdad..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/poly.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "params.h"
-
-/*
- * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
- * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
- */
-typedef struct{
-  int16_t coeffs[KYBER_N];
-} poly;
-
-#define poly_compress KYBER_NAMESPACE(poly_compress)
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
-#define poly_decompress KYBER_NAMESPACE(poly_decompress)
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
-
-#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
-#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
-
-#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
-#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
-
-#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_ntt KYBER_NAMESPACE(poly_ntt)
-void poly_ntt(poly *r);
-#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *r);
-#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
-#define poly_tomont KYBER_NAMESPACE(poly_tomont)
-void poly_tomont(poly *r);
-
-#define poly_reduce KYBER_NAMESPACE(poly_reduce)
-void poly_reduce(poly *r);
-
-#define poly_add KYBER_NAMESPACE(poly_add)
-void poly_add(poly *r, const poly *a, const poly *b);
-#define poly_sub KYBER_NAMESPACE(poly_sub)
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.c
deleted file mode 100644
index 669f6a5f1d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.c
+++ /dev/null
@@ -1,246 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
-{
-  unsigned int i,j,k;
-  uint64_t d0;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/8;j++) {
-      for(k=0;k<8;k++) {
-        t[k]  = a->vec[i].coeffs[8*j+k];
-        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
-        d0 = t[k];
-        d0 <<= 11;
-        d0 += 1664;
-        d0 *= 645084;
-        d0 >>= 31;
-        t[k] = d0 & 0x7ff;
-      }
-
-      r[ 0] = (t[0] >>  0);
-      r[ 1] = (t[0] >>  8) | (t[1] << 3);
-      r[ 2] = (t[1] >>  5) | (t[2] << 6);
-      r[ 3] = (t[2] >>  2);
-      r[ 4] = (t[2] >> 10) | (t[3] << 1);
-      r[ 5] = (t[3] >>  7) | (t[4] << 4);
-      r[ 6] = (t[4] >>  4) | (t[5] << 7);
-      r[ 7] = (t[5] >>  1);
-      r[ 8] = (t[5] >>  9) | (t[6] << 2);
-      r[ 9] = (t[6] >>  6) | (t[7] << 5);
-      r[10] = (t[7] >>  3);
-      r += 11;
-    }
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/4;j++) {
-      for(k=0;k<4;k++) {
-        t[k]  = a->vec[i].coeffs[4*j+k];
-        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
-        d0 = t[k];
-        d0 <<= 10;
-        d0 += 1665;
-        d0 *= 1290167;
-        d0 >>= 32;
-        t[k] = d0 & 0x3ff;
-      }
-
-      r[0] = (t[0] >> 0);
-      r[1] = (t[0] >> 8) | (t[1] << 2);
-      r[2] = (t[1] >> 6) | (t[2] << 4);
-      r[3] = (t[2] >> 4) | (t[3] << 6);
-      r[4] = (t[3] >> 2);
-      r += 5;
-    }
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r:       pointer to output vector of polynomials
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES])
-{
-  unsigned int i,j,k;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/8;j++) {
-      t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
-      t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
-      t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
-      t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
-      t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
-      t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
-      t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
-      t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
-      a += 11;
-
-      for(k=0;k<8;k++)
-        r->vec[i].coeffs[8*j+k] = ((uint32_t)(t[k] & 0x7FF)*KYBER_Q + 1024) >> 11;
-    }
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/4;j++) {
-      t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
-      t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
-      t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
-      t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
-      a += 5;
-
-      for(k=0;k<4;k++)
-        r->vec[i].coeffs[4*j+k] = ((uint32_t)(t[k] & 0x3FF)*KYBER_Q + 512) >> 10;
-    }
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - uint8_t *r:       pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials
-*                                  (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt_tomont
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*              and multiply by Montgomery factor 2^16
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt_tomont(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_basemul_acc_montgomery
-*
-* Description: Multiply elements of a and b in NTT domain, accumulate into r,
-*              and multiply by 2^-16.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  poly t;
-
-  poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
-  for(i=1;i<KYBER_K;i++) {
-    poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
-    poly_add(r, r, &t);
-  }
-
-  poly_reduce(r);
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials;
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - polyvec *r: pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r: pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.h
deleted file mode 100644
index 57b605494e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/polyvec.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-typedef struct{
-  poly vec[KYBER_K];
-} polyvec;
-
-#define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
-#define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
-
-#define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
-#define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
-
-#define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
-void polyvec_ntt(polyvec *r);
-#define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
-void polyvec_invntt_tomont(polyvec *r);
-
-#define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
-
-#define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
-void polyvec_reduce(polyvec *r);
-
-#define polyvec_add KYBER_NAMESPACE(polyvec_add)
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.c
deleted file mode 100644
index 9d8e7edf83..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "reduce.h"
-
-/*************************************************
-* Name:        montgomery_reduce
-*
-* Description: Montgomery reduction; given a 32-bit integer a, computes
-*              16-bit integer congruent to a * R^-1 mod q, where R=2^16
-*
-* Arguments:   - int32_t a: input integer to be reduced;
-*                           has to be in {-q2^15,...,q2^15-1}
-*
-* Returns:     integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
-**************************************************/
-int16_t montgomery_reduce(int32_t a)
-{
-  int16_t t;
-
-  t = (int16_t)a*QINV;
-  t = (a - (int32_t)t*KYBER_Q) >> 16;
-  return t;
-}
-
-/*************************************************
-* Name:        barrett_reduce
-*
-* Description: Barrett reduction; given a 16-bit integer a, computes
-*              centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2}
-*
-* Arguments:   - int16_t a: input integer to be reduced
-*
-* Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
-**************************************************/
-int16_t barrett_reduce(int16_t a) {
-  int16_t t;
-  const int16_t v = ((1<<26) + KYBER_Q/2)/KYBER_Q;
-
-  t  = ((int32_t)v*a + (1<<25)) >> 26;
-  t *= KYBER_Q;
-  return a - t;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.h
deleted file mode 100644
index c1bc1e4c7b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/reduce.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define MONT -1044 // 2^16 mod q
-#define QINV -3327 // q^-1 mod 2^16
-
-#define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce)
-int16_t montgomery_reduce(int32_t a);
-
-#define barrett_reduce KYBER_NAMESPACE(barrett_reduce)
-int16_t barrett_reduce(int16_t a);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric-shake.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric-shake.c
deleted file mode 100644
index 20f451882e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric-shake.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
-*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
-*              - uint8_t i: additional byte of input
-*              - uint8_t j: additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128incctx *state,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y)
-{
-  uint8_t extseed[KYBER_SYMBYTES+2];
-
-  memcpy(extseed, seed, KYBER_SYMBYTES);
-  extseed[KYBER_SYMBYTES+0] = x;
-  extseed[KYBER_SYMBYTES+1] = y;
-
-  shake128_absorb_once(state, extseed, sizeof(extseed));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t extkey[KYBER_SYMBYTES+1];
-
-  memcpy(extkey, key, KYBER_SYMBYTES);
-  extkey[KYBER_SYMBYTES] = nonce;
-
-  shake256(out, outlen, extkey, sizeof(extkey));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
-{
-  shake256incctx s;
-
-  shake256_inc_init(&s);
-  shake256_inc_absorb(&s, key, KYBER_SYMBYTES);
-  shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
-  shake256_inc_finalize(&s);
-  shake256_inc_squeeze(out, KYBER_SSBYTES, &s);
-  shake256_inc_ctx_release(&s);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric.h
deleted file mode 100644
index 2acc66f98d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/symmetric.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#include "fips202.h"
-
-typedef shake128incctx xof_state;
-
-#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(shake128incctx *s,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y);
-
-#define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
-
-#define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
-
-#define XOF_BLOCKBYTES SHAKE128_RATE
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_init(STATE, SEED) shake128_inc_init(STATE)
-#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define xof_release(STATE) shake128_inc_ctx_release(STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.c
deleted file mode 100644
index 914ccd448f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include "verify.h"
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const uint8_t *a: pointer to first byte array
-*              const uint8_t *b: pointer to second byte array
-*              size_t len:       length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-int verify(const uint8_t *a, const uint8_t *b, size_t len)
-{
-  size_t i;
-  uint8_t r = 0;
-
-  for(i=0;i<len;i++)
-    r |= a[i] ^ b[i];
-
-  return (-(uint64_t)r) >> 63;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   uint8_t *r:       pointer to output byte array
-*              const uint8_t *x: pointer to input byte array
-*              size_t len:       Amount of bytes to be copied
-*              uint8_t b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
-{
-  size_t i;
-
-#if defined(__GNUC__) || defined(__clang__)
-  // Prevent the compiler from
-  //    1) inferring that b is 0/1-valued, and
-  //    2) handling the two cases with a branch.
-  // This is not necessary when verify.c and kem.c are separate translation
-  // units, but we expect that downstream consumers will copy this code and/or
-  // change how it is built.
-  __asm__("" : "+r"(b) : /* no inputs */);
-#endif
-
-  b = -b;
-  for(i=0;i<len;i++)
-    r[i] ^= b & (r[i] ^ x[i]);
-}
-
-
-/*************************************************
-* Name:        cmov_int16
-*
-* Description: Copy input v to *r if b is 1, don't modify *r if b is 0. 
-*              Requires b to be in {0,1};
-*              Runs in constant time.
-*
-* Arguments:   int16_t *r:       pointer to output int16_t
-*              int16_t v:        input int16_t 
-*              uint8_t b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov_int16(int16_t *r, int16_t v, uint16_t b)
-{
-  b = -b;
-  *r ^= b & ((*r) ^ v);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.h
deleted file mode 100644
index 09f0ad5046..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_ref/verify.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#define verify KYBER_NAMESPACE(verify)
-int verify(const uint8_t *a, const uint8_t *b, size_t len);
-
-#define cmov KYBER_NAMESPACE(cmov)
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
-
-#define cmov_int16 KYBER_NAMESPACE(cmov_int16)
-void cmov_int16(int16_t *r, int16_t v, uint16_t b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/LICENSE b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/LICENSE
deleted file mode 100644
index 7922ab8007..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/LICENSE
+++ /dev/null
@@ -1,6 +0,0 @@
-Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
-or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
-
-For Keccak and AES we are using public-domain
-code from sources and by authors listed in
-comments on top of the respective files.
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/align.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/align.h
deleted file mode 100644
index 3463866f37..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/align.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef ALIGN_H
-#define ALIGN_H
-
-#include <stdint.h>
-#include <immintrin.h>
-
-#define ALIGNED_UINT8(N)        \
-    union {                     \
-        uint8_t coeffs[N];      \
-        __m256i vec[(N+31)/32]; \
-    }
-
-#define ALIGNED_INT16(N)        \
-    union {                     \
-        int16_t coeffs[N];      \
-        __m256i vec[(N+15)/16]; \
-    }
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/api.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/api.h
deleted file mode 100644
index a154e80f1d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/api.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stdint.h>
-
-#define pqcrystals_kyber512_SECRETKEYBYTES 1632
-#define pqcrystals_kyber512_PUBLICKEYBYTES 800
-#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
-#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber512_ENCCOINBYTES 32
-#define pqcrystals_kyber512_BYTES 32
-
-#define pqcrystals_kyber512_avx2_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
-#define pqcrystals_kyber512_avx2_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
-#define pqcrystals_kyber512_avx2_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
-#define pqcrystals_kyber512_avx2_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
-#define pqcrystals_kyber512_avx2_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
-#define pqcrystals_kyber512_avx2_BYTES pqcrystals_kyber512_BYTES
-
-int pqcrystals_kyber512_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber512_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber512_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber768_SECRETKEYBYTES 2400
-#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
-#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
-#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber768_ENCCOINBYTES 32
-#define pqcrystals_kyber768_BYTES 32
-
-#define pqcrystals_kyber768_avx2_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
-#define pqcrystals_kyber768_avx2_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
-#define pqcrystals_kyber768_avx2_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
-#define pqcrystals_kyber768_avx2_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
-#define pqcrystals_kyber768_avx2_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
-#define pqcrystals_kyber768_avx2_BYTES pqcrystals_kyber768_BYTES
-
-int pqcrystals_kyber768_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber768_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber768_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
-#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
-#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
-#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber1024_ENCCOINBYTES 32
-#define pqcrystals_kyber1024_BYTES 32
-
-#define pqcrystals_kyber1024_avx2_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
-#define pqcrystals_kyber1024_avx2_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
-#define pqcrystals_kyber1024_avx2_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
-#define pqcrystals_kyber1024_avx2_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
-#define pqcrystals_kyber1024_avx2_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
-#define pqcrystals_kyber1024_avx2_BYTES pqcrystals_kyber1024_BYTES
-
-int pqcrystals_kyber1024_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber1024_avx2_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber1024_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/basemul.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/basemul.S
deleted file mode 100644
index 36990639b2..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/basemul.S
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "consts.h"
-
-.macro schoolbook off
-vmovdqa		_16XQINV*2(%rcx),%ymm0
-vmovdqa		(64*\off+ 0)*2(%rsi),%ymm1		# a0
-vmovdqa		(64*\off+16)*2(%rsi),%ymm2		# b0
-vmovdqa		(64*\off+32)*2(%rsi),%ymm3		# a1
-vmovdqa		(64*\off+48)*2(%rsi),%ymm4		# b1
-
-vpmullw		%ymm0,%ymm1,%ymm9			# a0.lo
-vpmullw		%ymm0,%ymm2,%ymm10			# b0.lo
-vpmullw		%ymm0,%ymm3,%ymm11			# a1.lo
-vpmullw		%ymm0,%ymm4,%ymm12			# b1.lo
-
-vmovdqa		(64*\off+ 0)*2(%rdx),%ymm5		# c0
-vmovdqa		(64*\off+16)*2(%rdx),%ymm6		# d0
-
-vpmulhw		%ymm5,%ymm1,%ymm13			# a0c0.hi
-vpmulhw		%ymm6,%ymm1,%ymm1			# a0d0.hi
-vpmulhw		%ymm5,%ymm2,%ymm14			# b0c0.hi
-vpmulhw		%ymm6,%ymm2,%ymm2			# b0d0.hi
-
-vmovdqa		(64*\off+32)*2(%rdx),%ymm7		# c1
-vmovdqa		(64*\off+48)*2(%rdx),%ymm8		# d1
-
-vpmulhw		%ymm7,%ymm3,%ymm15			# a1c1.hi
-vpmulhw		%ymm8,%ymm3,%ymm3			# a1d1.hi
-vpmulhw		%ymm7,%ymm4,%ymm0			# b1c1.hi
-vpmulhw		%ymm8,%ymm4,%ymm4			# b1d1.hi
-
-vmovdqa		%ymm13,(%rsp)
-
-vpmullw		%ymm5,%ymm9,%ymm13			# a0c0.lo
-vpmullw		%ymm6,%ymm9,%ymm9			# a0d0.lo
-vpmullw		%ymm5,%ymm10,%ymm5			# b0c0.lo
-vpmullw		%ymm6,%ymm10,%ymm10			# b0d0.lo
-
-vpmullw		%ymm7,%ymm11,%ymm6			# a1c1.lo
-vpmullw		%ymm8,%ymm11,%ymm11			# a1d1.lo
-vpmullw		%ymm7,%ymm12,%ymm7			# b1c1.lo
-vpmullw		%ymm8,%ymm12,%ymm12			# b1d1.lo
-
-vmovdqa		_16XQ*2(%rcx),%ymm8
-vpmulhw		%ymm8,%ymm13,%ymm13
-vpmulhw		%ymm8,%ymm9,%ymm9
-vpmulhw		%ymm8,%ymm5,%ymm5
-vpmulhw		%ymm8,%ymm10,%ymm10
-vpmulhw		%ymm8,%ymm6,%ymm6
-vpmulhw		%ymm8,%ymm11,%ymm11
-vpmulhw		%ymm8,%ymm7,%ymm7
-vpmulhw		%ymm8,%ymm12,%ymm12
-
-vpsubw		(%rsp),%ymm13,%ymm13			# -a0c0
-vpsubw		%ymm9,%ymm1,%ymm9			# a0d0
-vpsubw		%ymm5,%ymm14,%ymm5			# b0c0
-vpsubw		%ymm10,%ymm2,%ymm10			# b0d0
-
-vpsubw		%ymm6,%ymm15,%ymm6			# a1c1
-vpsubw		%ymm11,%ymm3,%ymm11			# a1d1
-vpsubw		%ymm7,%ymm0,%ymm7			# b1c1
-vpsubw		%ymm12,%ymm4,%ymm12			# b1d1
-
-vmovdqa		(%r9),%ymm0
-vmovdqa		32(%r9),%ymm1
-vpmullw		%ymm0,%ymm10,%ymm2
-vpmullw		%ymm0,%ymm12,%ymm3
-vpmulhw		%ymm1,%ymm10,%ymm10
-vpmulhw		%ymm1,%ymm12,%ymm12
-vpmulhw		%ymm8,%ymm2,%ymm2
-vpmulhw		%ymm8,%ymm3,%ymm3
-vpsubw		%ymm2,%ymm10,%ymm10			# rb0d0
-vpsubw		%ymm3,%ymm12,%ymm12			# rb1d1
-
-vpaddw		%ymm5,%ymm9,%ymm9
-vpaddw		%ymm7,%ymm11,%ymm11
-vpsubw		%ymm13,%ymm10,%ymm13
-vpsubw		%ymm12,%ymm6,%ymm6
-
-vmovdqa		%ymm13,(64*\off+ 0)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+16)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+32)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+48)*2(%rdi)
-.endm
-
-.text
-.global cdecl(basemul_avx)
-cdecl(basemul_avx):
-mov		%rsp,%r8
-and		$-32,%rsp
-sub		$32,%rsp
-
-lea		(_ZETAS_EXP+176)*2(%rcx),%r9
-schoolbook	0
-
-add		$32*2,%r9
-schoolbook	1
-
-add		$192*2,%r9
-schoolbook	2
-
-add		$32*2,%r9
-schoolbook	3
-
-mov		%r8,%rsp
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.c
deleted file mode 100644
index dad473c79e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.c
+++ /dev/null
@@ -1,144 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include "params.h"
-#include "cbd.h"
-
-/*************************************************
-* Name:        cbd2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const __m256i *buf: pointer to aligned input byte array
-**************************************************/
-static void cbd2(poly * restrict r, const __m256i buf[2*KYBER_N/128])
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i mask55 = _mm256_set1_epi32(0x55555555);
-  const __m256i mask33 = _mm256_set1_epi32(0x33333333);
-  const __m256i mask03 = _mm256_set1_epi32(0x03030303);
-  const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);
-
-  for(i = 0; i < KYBER_N/64; i++) {
-    f0 = _mm256_load_si256(&buf[i]);
-
-    f1 = _mm256_srli_epi16(f0, 1);
-    f0 = _mm256_and_si256(mask55, f0);
-    f1 = _mm256_and_si256(mask55, f1);
-    f0 = _mm256_add_epi8(f0, f1);
-
-    f1 = _mm256_srli_epi16(f0, 2);
-    f0 = _mm256_and_si256(mask33, f0);
-    f1 = _mm256_and_si256(mask33, f1);
-    f0 = _mm256_add_epi8(f0, mask33);
-    f0 = _mm256_sub_epi8(f0, f1);
-
-    f1 = _mm256_srli_epi16(f0, 4);
-    f0 = _mm256_and_si256(mask0F, f0);
-    f1 = _mm256_and_si256(mask0F, f1);
-    f0 = _mm256_sub_epi8(f0, mask03);
-    f1 = _mm256_sub_epi8(f1, mask03);
-
-    f2 = _mm256_unpacklo_epi8(f0, f1);
-    f3 = _mm256_unpackhi_epi8(f0, f1);
-
-    f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
-    f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1));
-    f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
-    f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1));
-
-    _mm256_store_si256(&r->vec[4*i+0], f0);
-    _mm256_store_si256(&r->vec[4*i+1], f2);
-    _mm256_store_si256(&r->vec[4*i+2], f1);
-    _mm256_store_si256(&r->vec[4*i+3], f3);
-  }
-}
-
-#if KYBER_ETA1 == 3
-/*************************************************
-* Name:        cbd3
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=3
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const __m256i *buf: pointer to aligned input byte array
-**************************************************/
-static void cbd3(poly * restrict r, const uint8_t buf[3*KYBER_N/4+8])
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i mask249 = _mm256_set1_epi32(0x249249);
-  const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
-  const __m256i mask07 = _mm256_set1_epi32(7);
-  const __m256i mask70 = _mm256_set1_epi32(7 << 16);
-  const __m256i mask3 = _mm256_set1_epi16(3);
-  const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4,
-                                           -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0);
-
-  for(i = 0; i < KYBER_N/32; i++) {
-    f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]);
-    f0 = _mm256_permute4x64_epi64(f0,0x94);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-
-    f1 = _mm256_srli_epi32(f0,1);
-    f2 = _mm256_srli_epi32(f0,2);
-    f0 = _mm256_and_si256(mask249,f0);
-    f1 = _mm256_and_si256(mask249,f1);
-    f2 = _mm256_and_si256(mask249,f2);
-    f0 = _mm256_add_epi32(f0,f1);
-    f0 = _mm256_add_epi32(f0,f2);
-
-    f1 = _mm256_srli_epi32(f0,3);
-    f0 = _mm256_add_epi32(f0,mask6DB);
-    f0 = _mm256_sub_epi32(f0,f1);
-
-    f1 = _mm256_slli_epi32(f0,10);
-    f2 = _mm256_srli_epi32(f0,12);
-    f3 = _mm256_srli_epi32(f0, 2);
-    f0 = _mm256_and_si256(f0,mask07);
-    f1 = _mm256_and_si256(f1,mask70);
-    f2 = _mm256_and_si256(f2,mask07);
-    f3 = _mm256_and_si256(f3,mask70);
-    f0 = _mm256_add_epi16(f0,f1);
-    f1 = _mm256_add_epi16(f2,f3);
-    f0 = _mm256_sub_epi16(f0,mask3);
-    f1 = _mm256_sub_epi16(f1,mask3);
-
-    f2 = _mm256_unpacklo_epi32(f0,f1);
-    f3 = _mm256_unpackhi_epi32(f0,f1);
-
-    f0 = _mm256_permute2x128_si256(f2,f3,0x20);
-    f1 = _mm256_permute2x128_si256(f2,f3,0x31);
-
-    _mm256_store_si256(&r->vec[2*i+0], f0);
-    _mm256_store_si256(&r->vec[2*i+1], f1);
-  }
-}
-#endif
-
-/* buf 32 bytes longer for cbd3 */
-void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1])
-{
-#if KYBER_ETA1 == 2
-  cbd2(r, buf);
-#elif KYBER_ETA1 == 3
-  cbd3(r, (uint8_t *)buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
-
-void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128])
-{
-#if KYBER_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.h
deleted file mode 100644
index 05788e06b4..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/cbd.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include <immintrin.h>
-#include "params.h"
-#include "poly.h"
-
-#define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
-void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1]);
-
-#define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
-void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.c
deleted file mode 100644
index 84e596893d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.c
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "align.h"
-#include "params.h"
-#include "consts.h"
-
-#define Q KYBER_Q
-#define MONT -1044 // 2^16 mod q
-#define QINV -3327 // q^-1 mod 2^16
-#define V 20159 // floor(2^26/q + 0.5)
-#define FHI 1441 // mont^2/128
-#define FLO -10079 // qinv*FHI
-#define MONTSQHI 1353 // mont^2
-#define MONTSQLO 20553 // qinv*MONTSQHI
-#define MASK 4095
-#define SHIFT 32
-
-const qdata_t qdata = {{
-#define _16XQ 0
-  Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,
-
-#define _16XQINV 16
-  QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
-  QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
-
-#define _16XV 32
-  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
-
-#define _16XFLO 48
-  FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
-  FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
-
-#define _16XFHI 64
-  FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
-  FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
-
-#define _16XMONTSQLO 80
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-  MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
-
-#define _16XMONTSQHI 96
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-  MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
-
-#define _16XMASK 112
-  MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
-  MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
-
-#define _REVIDXB 128
-  3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
-  3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
-
-#define _REVIDXD 144
-  7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,
-
-#define _ZETAS_EXP 160
-   31498,  31498,  31498,  31498,   -758,   -758,   -758,   -758,
-    5237,   5237,   5237,   5237,   1397,   1397,   1397,   1397,
-   14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
-   14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
-    -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
-    -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
-   13525,  13525,  13525,  13525,  13525,  13525,  13525,  13525,
-  -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
-    1493,   1493,   1493,   1493,   1493,   1493,   1493,   1493,
-    1422,   1422,   1422,   1422,   1422,   1422,   1422,   1422,
-  -20907, -20907, -20907, -20907,  27758,  27758,  27758,  27758,
-   -3799,  -3799,  -3799,  -3799, -15690, -15690, -15690, -15690,
-    -171,   -171,   -171,   -171,    622,    622,    622,    622,
-    1577,   1577,   1577,   1577,    182,    182,    182,    182,
-   -5827,  -5827,  17363,  17363, -26360, -26360, -29057, -29057,
-    5571,   5571,  -1102,  -1102,  21438,  21438, -26242, -26242,
-     573,    573,  -1325,  -1325,    264,    264,    383,    383,
-    -829,   -829,   1458,   1458,  -1602,  -1602,   -130,   -130,
-   -5689,  -6516,   1496,  30967, -23565,  20179,  20710,  25080,
-  -12796,  26616,  16064, -12442,   9134,   -650, -25986,  27837,
-    1223,    652,   -552,   1015,  -1293,   1491,   -282,  -1544,
-     516,     -8,   -320,   -666,  -1618,  -1162,    126,   1469,
-    -335, -11477, -32227,  20494, -27738,    945, -14883,   6182,
-   32010,  10631,  29175, -28762, -18486,  17560, -14430,  -5276,
-   -1103,    555,  -1251,   1550,    422,    177,   -291,   1574,
-    -246,   1159,   -777,   -602,  -1590,   -872,    418,   -156,
-   11182,  13387, -14233, -21655,  13131,  -4587,  23092,   5493,
-  -32502,  30317, -18741,  12639,  20100,  18525,  19529, -12619,
-     430,    843,    871,    105,    587,   -235,   -460,   1653,
-     778,   -147,   1483,   1119,    644,    349,    329,    -75,
-     787,    787,    787,    787,    787,    787,    787,    787,
-     787,    787,    787,    787,    787,    787,    787,    787,
-   -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
-   -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
-   28191,  28191,  28191,  28191,  28191,  28191,  28191,  28191,
-  -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
-     287,    287,    287,    287,    287,    287,    287,    287,
-     202,    202,    202,    202,    202,    202,    202,    202,
-   10690,  10690,  10690,  10690,   1358,   1358,   1358,   1358,
-  -11202, -11202, -11202, -11202,  31164,  31164,  31164,  31164,
-     962,    962,    962,    962,  -1202,  -1202,  -1202,  -1202,
-   -1474,  -1474,  -1474,  -1474,   1468,   1468,   1468,   1468,
-  -28073, -28073,  24313,  24313, -10532, -10532,   8800,   8800,
-   18426,  18426,   8859,   8859,  26675,  26675, -16163, -16163,
-    -681,   -681,   1017,   1017,    732,    732,    608,    608,
-   -1542,  -1542,    411,    411,   -205,   -205,  -1571,  -1571,
-   19883, -28250, -15887,  -8898, -28309,   9075, -30199,  18249,
-   13426,  14017, -29156, -12757,  16832,   4311, -24155, -17915,
-    -853,    -90,   -271,    830,    107,  -1421,   -247,   -951,
-    -398,    961,  -1508,   -725,    448,  -1065,    677,  -1275,
-  -31183,  25435,  -7382,  24391, -20927,  10946,  24214,  16989,
-   10335,  -7934, -22502,  10906,  31636,  28644,  23998, -17422,
-     817,    603,   1322,  -1465,  -1215,   1218,   -874,  -1187,
-   -1185,  -1278,  -1510,   -870,   -108,    996,    958,   1522,
-   20297,   2146,  15355, -32384,  -6280, -14903, -11044,  14469,
-  -21498, -20198,  23210, -17442, -23860, -20257,   7756,  23132,
-    1097,    610,  -1285,    384,   -136,  -1335,    220,  -1659,
-   -1530,    794,   -854,    478,   -308,    991,  -1460,   1628,
-
-#define _16XSHIFT 624
-  SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
-  SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
-}};
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.h
deleted file mode 100644
index f95899cd8e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/consts.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef CONSTS_H
-#define CONSTS_H
-
-#include "params.h"
-
-#define _16XQ            0
-#define _16XQINV        16
-#define _16XV           32
-#define _16XFLO         48
-#define _16XFHI         64
-#define _16XMONTSQLO    80
-#define _16XMONTSQHI    96
-#define _16XMASK       112
-#define _REVIDXB       128
-#define _REVIDXD       144
-#define _ZETAS_EXP     160
-#define	_16XSHIFT      624
-
-/* The C ABI on MacOS exports all symbols with a leading
- * underscore. This means that any symbols we refer to from
- * C files (functions) can't be found, and all symbols we
- * refer to from ASM also can't be found.
- *
- * This define helps us get around this
- */
-#ifdef __ASSEMBLER__
-#if defined(__WIN32__) || defined(__APPLE__)
-#define decorate(s) _##s
-#define cdecl2(s) decorate(s)
-#define cdecl(s) cdecl2(KYBER_NAMESPACE(##s))
-#else
-#define cdecl(s) KYBER_NAMESPACE(##s)
-#endif
-#endif
-
-#ifndef __ASSEMBLER__
-#include "align.h"
-typedef ALIGNED_INT16(640) qdata_t;
-#define qdata KYBER_NAMESPACE(qdata)
-extern const qdata_t qdata;
-#endif
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.S
deleted file mode 100644
index 3bb1ebd3d8..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.S
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "consts.h"
-.include "fq.inc"
-
-.text
-reduce128_avx:
-#load
-vmovdqa		(%rdi),%ymm2
-vmovdqa		32(%rdi),%ymm3
-vmovdqa		64(%rdi),%ymm4
-vmovdqa		96(%rdi),%ymm5
-vmovdqa		128(%rdi),%ymm6
-vmovdqa		160(%rdi),%ymm7
-vmovdqa		192(%rdi),%ymm8
-vmovdqa		224(%rdi),%ymm9
-
-red16		2
-red16		3
-red16		4
-red16		5
-red16		6
-red16		7
-red16		8
-red16		9
-
-#store
-vmovdqa		%ymm2,(%rdi)
-vmovdqa		%ymm3,32(%rdi)
-vmovdqa		%ymm4,64(%rdi)
-vmovdqa		%ymm5,96(%rdi)
-vmovdqa		%ymm6,128(%rdi)
-vmovdqa		%ymm7,160(%rdi)
-vmovdqa		%ymm8,192(%rdi)
-vmovdqa		%ymm9,224(%rdi)
-
-ret
-
-.global cdecl(reduce_avx)
-cdecl(reduce_avx):
-#consts
-vmovdqa		_16XQ*2(%rsi),%ymm0
-vmovdqa		_16XV*2(%rsi),%ymm1
-call		reduce128_avx
-add		$256,%rdi
-call		reduce128_avx
-ret
-
-tomont128_avx:
-#load
-vmovdqa		(%rdi),%ymm3
-vmovdqa		32(%rdi),%ymm4
-vmovdqa		64(%rdi),%ymm5
-vmovdqa		96(%rdi),%ymm6
-vmovdqa		128(%rdi),%ymm7
-vmovdqa		160(%rdi),%ymm8
-vmovdqa		192(%rdi),%ymm9
-vmovdqa		224(%rdi),%ymm10
-
-fqmulprecomp	1,2,3,11
-fqmulprecomp	1,2,4,12
-fqmulprecomp	1,2,5,13
-fqmulprecomp	1,2,6,14
-fqmulprecomp	1,2,7,15
-fqmulprecomp	1,2,8,11
-fqmulprecomp	1,2,9,12
-fqmulprecomp	1,2,10,13
-
-#store
-vmovdqa		%ymm3,(%rdi)
-vmovdqa		%ymm4,32(%rdi)
-vmovdqa		%ymm5,64(%rdi)
-vmovdqa		%ymm6,96(%rdi)
-vmovdqa		%ymm7,128(%rdi)
-vmovdqa		%ymm8,160(%rdi)
-vmovdqa		%ymm9,192(%rdi)
-vmovdqa		%ymm10,224(%rdi)
-
-ret
-
-.global cdecl(tomont_avx)
-cdecl(tomont_avx):
-#consts
-vmovdqa		_16XQ*2(%rsi),%ymm0
-vmovdqa		_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa		_16XMONTSQHI*2(%rsi),%ymm2
-call		tomont128_avx
-add		$256,%rdi
-call		tomont128_avx
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.inc b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.inc
deleted file mode 100644
index 4b7afc3118..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/fq.inc
+++ /dev/null
@@ -1,30 +0,0 @@
-.macro red16 r,rs=0,x=12
-vpmulhw         %ymm1,%ymm\r,%ymm\x
-.if \rs
-vpmulhrsw	%ymm\rs,%ymm\x,%ymm\x
-.else
-vpsraw          $10,%ymm\x,%ymm\x
-.endif
-vpmullw         %ymm0,%ymm\x,%ymm\x
-vpsubw          %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro csubq r,x=12
-vpsubw		%ymm0,%ymm\r,%ymm\r
-vpsraw		$15,%ymm\r,%ymm\x
-vpand		%ymm0,%ymm\x,%ymm\x
-vpaddw		%ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro caddq r,x=12
-vpsraw		$15,%ymm\r,%ymm\x
-vpand		%ymm0,%ymm\x,%ymm\x
-vpaddw		%ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro fqmulprecomp al,ah,b,x=12
-vpmullw		%ymm\al,%ymm\b,%ymm\x
-vpmulhw		%ymm\ah,%ymm\b,%ymm\b
-vpmulhw		%ymm0,%ymm\x,%ymm\x
-vpsubw		%ymm\x,%ymm\b,%ymm\b
-.endm
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.c
deleted file mode 100644
index c4b2b3a89f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.c
+++ /dev/null
@@ -1,568 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "align.h"
-#include "params.h"
-#include "indcpa.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "cbd.h"
-#include "rejsample.h"
-#include "symmetric.h"
-#include "randombytes.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Serialize the public key as concatenation of the
-*              serialized vector of polynomials pk and the
-*              public seed used to generate the matrix A.
-*              The polynomial coefficients in pk are assumed to
-*              lie in the invertal [0,q], i.e. pk must be reduced
-*              by polyvec_reduce().
-*
-* Arguments:   uint8_t *r: pointer to the output serialized public key
-*              polyvec *pk: pointer to the input public-key polyvec
-*              const uint8_t *seed: pointer to the input public seed
-**************************************************/
-static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
-                    polyvec *pk,
-                    const uint8_t seed[KYBER_SYMBYTES])
-{
-  polyvec_tobytes(r, pk);
-  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: De-serialize public key from a byte array;
-*              approximate inverse of pack_pk
-*
-* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
-*              - uint8_t *seed: pointer to output seed to generate matrix A
-*              - const uint8_t *packedpk: pointer to input serialized public key
-**************************************************/
-static void unpack_pk(polyvec *pk,
-                      uint8_t seed[KYBER_SYMBYTES],
-                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
-{
-  polyvec_frombytes(pk, packedpk);
-  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Serialize the secret key.
-*              The polynomial coefficients in sk are assumed to
-*              lie in the invertal [0,q], i.e. sk must be reduced
-*              by polyvec_reduce().
-*
-* Arguments:   - uint8_t *r: pointer to output serialized secret key
-*              - polyvec *sk: pointer to input vector of polynomials (secret key)
-**************************************************/
-static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
-{
-  polyvec_tobytes(r, sk);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: De-serialize the secret key; inverse of pack_sk
-*
-* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
-*              - const uint8_t *packedsk: pointer to input serialized secret key
-**************************************************/
-static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec_frombytes(sk, packedsk);
-}
-
-/*************************************************
-* Name:        pack_ciphertext
-*
-* Description: Serialize the ciphertext as concatenation of the
-*              compressed and serialized vector of polynomials b
-*              and the compressed and serialized polynomial v.
-*              The polynomial coefficients in b and v are assumed to
-*              lie in the invertal [0,q], i.e. b and v must be reduced
-*              by polyvec_reduce() and poly_reduce(), respectively.
-*
-* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
-*              poly *pk: pointer to the input vector of polynomials b
-*              poly *v: pointer to the input polynomial v
-**************************************************/
-static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
-{
-  polyvec_compress(r, b);
-  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        unpack_ciphertext
-*
-* Description: De-serialize and decompress ciphertext from a byte array;
-*              approximate inverse of pack_ciphertext
-*
-* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
-*              - poly *v: pointer to the output polynomial v
-*              - const uint8_t *c: pointer to the input serialized ciphertext
-**************************************************/
-static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
-{
-  polyvec_decompress(b, c);
-  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-}
-
-/*************************************************
-* Name:        rej_uniform
-*
-* Description: Run rejection sampling on uniform random bytes to generate
-*              uniform random integers mod q
-*
-* Arguments:   - int16_t *r: pointer to output array
-*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
-*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
-*              - unsigned int buflen: length of input buffer in bytes
-*
-* Returns number of sampled 16-bit integers (at most len)
-**************************************************/
-static unsigned int rej_uniform(int16_t *r,
-                                unsigned int len,
-                                const uint8_t *buf,
-                                unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos <= buflen - 3) {  // buflen is always at least 3
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(ctr < len && val1 < KYBER_Q)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
-
-#define gen_a(A,B)  gen_matrix(A,B,0)
-#define gen_at(A,B) gen_matrix(A,B,1)
-
-/*************************************************
-* Name:        gen_matrix
-*
-* Description: Deterministically generate matrix A (or the transpose of A)
-*              from a seed. Entries of the matrix are polynomials that look
-*              uniformly random. Performs rejection sampling on output of
-*              a XOF
-*
-* Arguments:   - polyvec *a: pointer to ouptput matrix A
-*              - const uint8_t *seed: pointer to input seed
-*              - int transposed: boolean deciding whether A or A^T is generated
-**************************************************/
-#if KYBER_K == 2
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 0;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 1;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 1;
-  }
-  else {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 0;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 1;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 1;
-  }
-
-  shake128x4_inc_init(&state);
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[1].vec[0].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[1].vec[1].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-
-  poly_nttunpack(&a[0].vec[0]);
-  poly_nttunpack(&a[0].vec[1]);
-  poly_nttunpack(&a[1].vec[0]);
-  poly_nttunpack(&a[1].vec[1]);
-  shake128x4_inc_ctx_release(&state);
-}
-#elif KYBER_K == 3
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-  shake128incctx state1x;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 0;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 2;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 0;
-  }
-  else {
-    buf[0].coeffs[32] = 0;
-    buf[0].coeffs[33] = 0;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 0;
-    buf[2].coeffs[32] = 2;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 0;
-    buf[3].coeffs[33] = 1;
-  }
-
-  shake128x4_inc_init(&state);
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[0].vec[2].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[1].vec[0].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-
-  poly_nttunpack(&a[0].vec[0]);
-  poly_nttunpack(&a[0].vec[1]);
-  poly_nttunpack(&a[0].vec[2]);
-  poly_nttunpack(&a[1].vec[0]);
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  if(transposed) {
-    buf[0].coeffs[32] = 1;
-    buf[0].coeffs[33] = 1;
-    buf[1].coeffs[32] = 1;
-    buf[1].coeffs[33] = 2;
-    buf[2].coeffs[32] = 2;
-    buf[2].coeffs[33] = 0;
-    buf[3].coeffs[32] = 2;
-    buf[3].coeffs[33] = 1;
-  }
-  else {
-    buf[0].coeffs[32] = 1;
-    buf[0].coeffs[33] = 1;
-    buf[1].coeffs[32] = 2;
-    buf[1].coeffs[33] = 1;
-    buf[2].coeffs[32] = 0;
-    buf[2].coeffs[33] = 2;
-    buf[3].coeffs[32] = 1;
-    buf[3].coeffs[33] = 2;
-  }
-
-  shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-  shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-  ctr0 = rej_uniform_avx(a[1].vec[1].coeffs, buf[0].coeffs);
-  ctr1 = rej_uniform_avx(a[1].vec[2].coeffs, buf[1].coeffs);
-  ctr2 = rej_uniform_avx(a[2].vec[0].coeffs, buf[2].coeffs);
-  ctr3 = rej_uniform_avx(a[2].vec[1].coeffs, buf[3].coeffs);
-
-  while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-    ctr0 += rej_uniform(a[1].vec[1].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-    ctr1 += rej_uniform(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-    ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-    ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-  }
-  shake128x4_inc_ctx_release(&state);
-
-  poly_nttunpack(&a[1].vec[1]);
-  poly_nttunpack(&a[1].vec[2]);
-  poly_nttunpack(&a[2].vec[0]);
-  poly_nttunpack(&a[2].vec[1]);
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  buf[0].coeffs[32] = 2;
-  buf[0].coeffs[33] = 2;
-
-  shake128_inc_init(&state1x);
-  shake128_absorb_once(&state1x, buf[0].coeffs, 34);
-  shake128_squeezeblocks(buf[0].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state1x);
-  ctr0 = rej_uniform_avx(a[2].vec[2].coeffs, buf[0].coeffs);
-  while(ctr0 < KYBER_N) {
-    shake128_squeezeblocks(buf[0].coeffs, 1, &state1x);
-    ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-  }
-  shake128_inc_ctx_release(&state1x);
-
-  poly_nttunpack(&a[2].vec[2]);
-}
-#elif KYBER_K == 4
-void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
-{
-  unsigned int i, ctr0, ctr1, ctr2, ctr3;
-  ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
-  __m256i f;
-  shake128x4incctx state;
-  shake128x4_inc_init(&state);
-
-  for(i=0;i<4;i++) {
-    f = _mm256_loadu_si256((__m256i *)seed);
-    _mm256_store_si256(buf[0].vec, f);
-    _mm256_store_si256(buf[1].vec, f);
-    _mm256_store_si256(buf[2].vec, f);
-    _mm256_store_si256(buf[3].vec, f);
-
-    if(transposed) {
-      buf[0].coeffs[32] = i;
-      buf[0].coeffs[33] = 0;
-      buf[1].coeffs[32] = i;
-      buf[1].coeffs[33] = 1;
-      buf[2].coeffs[32] = i;
-      buf[2].coeffs[33] = 2;
-      buf[3].coeffs[32] = i;
-      buf[3].coeffs[33] = 3;
-    }
-    else {
-      buf[0].coeffs[32] = 0;
-      buf[0].coeffs[33] = i;
-      buf[1].coeffs[32] = 1;
-      buf[1].coeffs[33] = i;
-      buf[2].coeffs[32] = 2;
-      buf[2].coeffs[33] = i;
-      buf[3].coeffs[32] = 3;
-      buf[3].coeffs[33] = i;
-    }
-
-    shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
-    shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
-
-    ctr0 = rej_uniform_avx(a[i].vec[0].coeffs, buf[0].coeffs);
-    ctr1 = rej_uniform_avx(a[i].vec[1].coeffs, buf[1].coeffs);
-    ctr2 = rej_uniform_avx(a[i].vec[2].coeffs, buf[2].coeffs);
-    ctr3 = rej_uniform_avx(a[i].vec[3].coeffs, buf[3].coeffs);
-
-    while(ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
-      shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);
-
-      ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
-      ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
-      ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
-      ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
-    }
-
-    poly_nttunpack(&a[i].vec[0]);
-    poly_nttunpack(&a[i].vec[1]);
-    poly_nttunpack(&a[i].vec[2]);
-    poly_nttunpack(&a[i].vec[3]);
-  }
-  shake128x4_inc_ctx_release(&state);
-}
-#endif
-
-/*************************************************
-* Name:        indcpa_keypair_derand
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                             (of length KYBER_SYMBYTES bytes)
-**************************************************/
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  const uint8_t *publicseed = buf;
-  const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
-  polyvec a[KYBER_K], e, pkpv, skpv;
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-  buf[KYBER_SYMBYTES] = KYBER_K;
-  hash_g(buf, buf, KYBER_SYMBYTES+1);
-
-  gen_a(a, publicseed);
-
-#if KYBER_K == 2
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3);
-#elif KYBER_K == 3
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, skpv.vec+2, e.vec+0, noiseseed, 0, 1, 2, 3);
-  poly_getnoise_eta1_4x(e.vec+1, e.vec+2, pkpv.vec+0, pkpv.vec+1, noiseseed, 4, 5, 6, 7);
-#elif KYBER_K == 4
-  poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, skpv.vec+2, skpv.vec+3, noiseseed,  0, 1, 2, 3);
-  poly_getnoise_eta1_4x(e.vec+0, e.vec+1, e.vec+2, e.vec+3, noiseseed, 4, 5, 6, 7);
-#endif
-
-  polyvec_ntt(&skpv);
-  polyvec_reduce(&skpv);
-  polyvec_ntt(&e);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++) {
-    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
-    poly_tomont(&pkpv.vec[i]);
-  }
-
-  polyvec_add(&pkpv, &pkpv, &e);
-  polyvec_reduce(&pkpv);
-
-  pack_sk(sk, &skpv);
-  pack_pk(pk, &pkpv, publicseed);
-}
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *c: pointer to output ciphertext
-*                            (of length KYBER_INDCPA_BYTES bytes)
-*              - const uint8_t *m: pointer to input message
-*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
-*              - const uint8_t *coins: pointer to input random coins used as seed
-*                                      (of length KYBER_SYMBYTES) to deterministically
-*                                      generate all randomness
-**************************************************/
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t seed[KYBER_SYMBYTES];
-  polyvec sp, pkpv, ep, at[KYBER_K], b;
-  poly v, k, epp;
-
-  unpack_pk(&pkpv, seed, pk);
-  poly_frommsg(&k, m);
-  gen_at(at, seed);
-
-#if KYBER_K == 2
-  poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3);
-  poly_getnoise_eta2(&epp, coins, 4);
-#elif KYBER_K == 3
-  poly_getnoise_eta1_4x(sp.vec+0, sp.vec+1, sp.vec+2, ep.vec+0, coins, 0, 1, 2 ,3);
-  poly_getnoise_eta1_4x(ep.vec+1, ep.vec+2, &epp, b.vec+0, coins,  4, 5, 6, 7);
-#elif KYBER_K == 4
-  poly_getnoise_eta1_4x(sp.vec+0, sp.vec+1, sp.vec+2, sp.vec+3, coins, 0, 1, 2, 3);
-  poly_getnoise_eta1_4x(ep.vec+0, ep.vec+1, ep.vec+2, ep.vec+3, coins, 4, 5, 6, 7);
-  poly_getnoise_eta2(&epp, coins, 8);
-#endif
-
-  polyvec_ntt(&sp);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++)
-    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
-  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
-
-  polyvec_invntt_tomont(&b);
-  poly_invntt_tomont(&v);
-
-  polyvec_add(&b, &b, &ep);
-  poly_add(&v, &v, &epp);
-  poly_add(&v, &v, &k);
-  polyvec_reduce(&b);
-  poly_reduce(&v);
-
-  pack_ciphertext(c, &b, &v);
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *m: pointer to output decrypted message
-*                            (of length KYBER_INDCPA_MSGBYTES)
-*              - const uint8_t *c: pointer to input ciphertext
-*                                  (of length KYBER_INDCPA_BYTES)
-*              - const uint8_t *sk: pointer to input secret key
-*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec b, skpv;
-  poly v, mp;
-
-  unpack_ciphertext(&b, &v, c);
-  unpack_sk(&skpv, sk);
-
-  polyvec_ntt(&b);
-  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
-  poly_invntt_tomont(&mp);
-
-  poly_sub(&mp, &v, &mp);
-  poly_reduce(&mp);
-
-  poly_tomsg(m, &mp);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.h
deleted file mode 100644
index 6dd5088c85..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/indcpa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define gen_matrix KYBER_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
-
-#define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/invntt.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/invntt.S
deleted file mode 100644
index 76d4189996..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/invntt.S
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "consts.h"
-.include "shuffle.inc"
-.include "fq.inc"
-
-.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
-vpsubw		%ymm\rl0,%ymm\rh0,%ymm12
-vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rl0
-vpsubw		%ymm\rl1,%ymm\rh1,%ymm13
-
-vpmullw		%ymm\zl0,%ymm12,%ymm\rh0
-vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl1
-vpsubw		%ymm\rl2,%ymm\rh2,%ymm14
-
-vpmullw		%ymm\zl0,%ymm13,%ymm\rh1
-vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl2
-vpsubw		%ymm\rl3,%ymm\rh3,%ymm15
-
-vpmullw		%ymm\zl1,%ymm14,%ymm\rh2
-vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl3
-vpmullw		%ymm\zl1,%ymm15,%ymm\rh3
-
-vpmulhw		%ymm\zh0,%ymm12,%ymm12
-vpmulhw		%ymm\zh0,%ymm13,%ymm13
-
-vpmulhw		%ymm\zh1,%ymm14,%ymm14
-vpmulhw		%ymm\zh1,%ymm15,%ymm15
-
-vpmulhw		%ymm0,%ymm\rh0,%ymm\rh0
-
-vpmulhw		%ymm0,%ymm\rh1,%ymm\rh1
-
-vpmulhw		%ymm0,%ymm\rh2,%ymm\rh2
-vpmulhw		%ymm0,%ymm\rh3,%ymm\rh3
-
-#
-
-#
-
-vpsubw		%ymm\rh0,%ymm12,%ymm\rh0
-
-vpsubw		%ymm\rh1,%ymm13,%ymm\rh1
-
-vpsubw		%ymm\rh2,%ymm14,%ymm\rh2
-vpsubw		%ymm\rh3,%ymm15,%ymm\rh3
-.endm
-
-.macro intt_levels0t5 off
-/* level 0 */
-vmovdqa		_16XFLO*2(%rsi),%ymm2
-vmovdqa		_16XFHI*2(%rsi),%ymm3
-
-vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
-vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
-vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
-vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7
-
-fqmulprecomp	2,3,4
-fqmulprecomp	2,3,6
-fqmulprecomp	2,3,5
-fqmulprecomp	2,3,7
-
-vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
-vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
-vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
-vmovdqa         (128*\off+112)*2(%rdi),%ymm11
-
-fqmulprecomp	2,3,8
-fqmulprecomp	2,3,10
-fqmulprecomp	2,3,9
-fqmulprecomp	2,3,11
-
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
-vmovdqa		_REVIDXB*2(%rsi),%ymm12
-vpshufb		%ymm12,%ymm15,%ymm15
-vpshufb		%ymm12,%ymm1,%ymm1
-vpshufb		%ymm12,%ymm2,%ymm2
-vpshufb		%ymm12,%ymm3,%ymm3
-
-butterfly	4,5,8,9,6,7,10,11,15,1,2,3
-
-/* level 1 */
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
-vmovdqa		_REVIDXB*2(%rsi),%ymm1
-vpshufb		%ymm1,%ymm2,%ymm2
-vpshufb		%ymm1,%ymm3,%ymm3
-
-butterfly	4,5,6,7,8,9,10,11,2,2,3,3
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-shuffle1	10,11,8,11
-
-/* level 2 */
-vmovdqa		_REVIDXD*2(%rsi),%ymm12
-vpermd		(_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
-vpermd		(_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
-
-butterfly	3,4,6,8,5,7,9,11,2,2,10,10
-
-vmovdqa		_16XV*2(%rsi),%ymm1
-red16		3
-
-shuffle2	3,4,10,4
-shuffle2	6,8,3,8
-shuffle2	5,7,6,7
-shuffle2	9,11,5,11
-
-/* level 3 */
-vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
-vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
-
-butterfly	10,3,6,5,4,8,7,11,2,2,9,9
-
-shuffle4	10,3,9,3
-shuffle4	6,5,10,5
-shuffle4	4,8,6,8
-shuffle4	7,11,4,11
-
-/* level 4 */
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
-vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
-
-butterfly	9,10,6,4,3,5,8,11,2,2,7,7
-
-red16		9
-
-shuffle8	9,10,7,10
-shuffle8	6,4,9,4
-shuffle8	3,5,6,5
-shuffle8	8,11,3,11
-
-/* level 5 */
-vmovdqa		(_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
-vmovdqa		(_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
-
-butterfly	7,9,6,3,10,4,5,11,2,2,8,8
-
-vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
-vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
-vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
-vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
-vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
-vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
-vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
-vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
-.endm
-
-.macro intt_level6 off
-/* level 6 */
-vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
-vmovdqa         (64*\off+128)*2(%rdi),%ymm8
-vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
-vmovdqa         (64*\off+144)*2(%rdi),%ymm9
-vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm2
-
-vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
-vmovdqa         (64*\off+160)*2(%rdi),%ymm10
-vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
-vmovdqa         (64*\off+176)*2(%rdi),%ymm11
-vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm3
-
-butterfly	4,5,6,7,8,9,10,11
-
-.if \off == 0
-red16		4
-.endif
-
-vmovdqa		%ymm4,(64*\off+  0)*2(%rdi)
-vmovdqa		%ymm5,(64*\off+ 16)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+ 32)*2(%rdi)
-vmovdqa		%ymm7,(64*\off+ 48)*2(%rdi)
-vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
-vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
-.endm
-
-.text
-.global cdecl(invntt_avx)
-cdecl(invntt_avx):
-vmovdqa         _16XQ*2(%rsi),%ymm0
-
-intt_levels0t5	0
-intt_levels0t5	1
-
-intt_level6	0
-intt_level6	1
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.c
deleted file mode 100644
index 63abc1029c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "kem.h"
-#include "indcpa.h"
-#include "verify.h"
-#include "symmetric.h"
-#include "randombytes.h"
-/*************************************************
-* Name:        crypto_kem_keypair_derand
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*              - uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair_derand(uint8_t *pk,
-                              uint8_t *sk,
-                              const uint8_t *coins)
-{
-  indcpa_keypair_derand(pk, sk, coins);
-  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  /* Value z for pseudo-random output on reject */
-  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(uint8_t *pk,
-                       uint8_t *sk)
-{
-  uint8_t coins[2*KYBER_SYMBYTES];
-  randombytes(coins, 2*KYBER_SYMBYTES);
-  crypto_kem_keypair_derand(pk, sk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc_derand
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc_derand(uint8_t *ct,
-                          uint8_t *ss,
-                          const uint8_t *pk,
-                          const uint8_t *coins)
-{
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
-
-  memcpy(ss,kr,KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(uint8_t *ct,
-                   uint8_t *ss,
-                   const uint8_t *pk)
-{
-  uint8_t coins[KYBER_SYMBYTES];
-  randombytes(coins, KYBER_SYMBYTES);
-  crypto_kem_enc_derand(ct, ss, pk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *ct: pointer to input cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - const uint8_t *sk: pointer to input private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(uint8_t *ss,
-                   const uint8_t *ct,
-                   const uint8_t *sk)
-{
-  int fail;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
-  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
-
-  indcpa_dec(buf, ct, sk);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
-
-  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
-
-  /* Compute rejection key */
-  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
-
-  /* Copy true key to return buffer if fail is false */
-  cmov(ss,kr,KYBER_SYMBYTES,!fail);
-
-  return 0;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.h
deleted file mode 100644
index 234f11966b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/kem.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KEM_H
-#define KEM_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#if   (KYBER_K == 2)
-#define CRYPTO_ALGNAME "Kyber512"
-#elif (KYBER_K == 3)
-#define CRYPTO_ALGNAME "Kyber768"
-#elif (KYBER_K == 4)
-#define CRYPTO_ALGNAME "Kyber1024"
-#endif
-
-#define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
-int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-
-#define crypto_kem_keypair KYBER_NAMESPACE(keypair)
-int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-#define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
-int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-
-#define crypto_kem_enc KYBER_NAMESPACE(enc)
-int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-
-#define crypto_kem_dec KYBER_NAMESPACE(dec)
-int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.S
deleted file mode 100644
index 0ce7b41297..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.S
+++ /dev/null
@@ -1,189 +0,0 @@
-#include "consts.h"
-.include "shuffle.inc"
-
-.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
-vpmullw		%ymm\zl0,%ymm\rh0,%ymm12
-vpmullw		%ymm\zl0,%ymm\rh1,%ymm13
-
-vpmullw		%ymm\zl1,%ymm\rh2,%ymm14
-vpmullw		%ymm\zl1,%ymm\rh3,%ymm15
-
-vpmulhw		%ymm\zh0,%ymm\rh0,%ymm\rh0
-vpmulhw		%ymm\zh0,%ymm\rh1,%ymm\rh1
-
-vpmulhw		%ymm\zh1,%ymm\rh2,%ymm\rh2
-vpmulhw		%ymm\zh1,%ymm\rh3,%ymm\rh3
-.endm
-
-.macro reduce
-vpmulhw		%ymm0,%ymm12,%ymm12
-vpmulhw		%ymm0,%ymm13,%ymm13
-
-vpmulhw		%ymm0,%ymm14,%ymm14
-vpmulhw		%ymm0,%ymm15,%ymm15
-.endm
-
-.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
-vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rln
-vpsubw		%ymm\rh0,%ymm\rl0,%ymm\rh0
-vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl0
-
-vpsubw		%ymm\rh1,%ymm\rl1,%ymm\rh1
-vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl1
-vpsubw		%ymm\rh2,%ymm\rl2,%ymm\rh2
-
-vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl2
-vpsubw		%ymm\rh3,%ymm\rl3,%ymm\rh3
-
-vpsubw		%ymm12,%ymm\rln,%ymm\rln
-vpaddw		%ymm12,%ymm\rh0,%ymm\rh0
-vpsubw		%ymm13,%ymm\rl0,%ymm\rl0
-
-vpaddw		%ymm13,%ymm\rh1,%ymm\rh1
-vpsubw		%ymm14,%ymm\rl1,%ymm\rl1
-vpaddw		%ymm14,%ymm\rh2,%ymm\rh2
-
-vpsubw		%ymm15,%ymm\rl2,%ymm\rl2
-vpaddw		%ymm15,%ymm\rh3,%ymm\rh3
-.endm
-
-.macro level0 off
-vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm15
-vmovdqa		(64*\off+128)*2(%rdi),%ymm8
-vmovdqa		(64*\off+144)*2(%rdi),%ymm9
-vmovdqa		(64*\off+160)*2(%rdi),%ymm10
-vmovdqa		(64*\off+176)*2(%rdi),%ymm11
-vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm2
-
-mul		8,9,10,11
-
-vmovdqa		(64*\off+  0)*2(%rdi),%ymm4
-vmovdqa		(64*\off+ 16)*2(%rdi),%ymm5
-vmovdqa		(64*\off+ 32)*2(%rdi),%ymm6
-vmovdqa		(64*\off+ 48)*2(%rdi),%ymm7
-
-reduce
-update		3,4,5,6,7,8,9,10,11
-
-vmovdqa		%ymm3,(64*\off+  0)*2(%rdi)
-vmovdqa		%ymm4,(64*\off+ 16)*2(%rdi)
-vmovdqa		%ymm5,(64*\off+ 32)*2(%rdi)
-vmovdqa		%ymm6,(64*\off+ 48)*2(%rdi)
-vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
-vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
-vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
-vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
-.endm
-
-.macro levels1t6 off
-/* level 1 */
-vmovdqa		(_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
-vmovdqa		(128*\off+ 64)*2(%rdi),%ymm8
-vmovdqa		(128*\off+ 80)*2(%rdi),%ymm9
-vmovdqa		(128*\off+ 96)*2(%rdi),%ymm10
-vmovdqa		(128*\off+112)*2(%rdi),%ymm11
-vmovdqa		(_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
-
-mul		8,9,10,11
-
-vmovdqa		(128*\off+  0)*2(%rdi),%ymm4
-vmovdqa	 	(128*\off+ 16)*2(%rdi),%ymm5
-vmovdqa		(128*\off+ 32)*2(%rdi),%ymm6
-vmovdqa		(128*\off+ 48)*2(%rdi),%ymm7
-
-reduce
-update		3,4,5,6,7,8,9,10,11
-
-/* level 2 */
-shuffle8	5,10,7,10
-shuffle8	6,11,5,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
-
-mul		7,10,5,11
-
-shuffle8	3,8,6,8
-shuffle8	4,9,3,9
-
-reduce
-update		4,6,8,3,9,7,10,5,11
-
-/* level 3 */
-shuffle4	8,5,9,5
-shuffle4	3,11,8,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
-
-mul		9,5,8,11
-
-shuffle4	4,7,3,7
-shuffle4	6,10,4,10
-
-reduce
-update		6,3,7,4,10,9,5,8,11
-
-/* level 4 */
-shuffle2	7,8,10,8
-shuffle2	4,11,7,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
-
-mul		10,8,7,11
-
-shuffle2	6,9,4,9
-shuffle2	3,5,6,5
-
-reduce
-update		3,4,9,6,5,10,8,7,11
-
-/* level 5 */
-shuffle1	9,7,5,7
-shuffle1	6,11,9,11
-
-vmovdqa		(_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
-
-mul		5,7,9,11
-
-shuffle1	3,10,6,10
-shuffle1	4,8,3,8
-
-reduce
-update		4,6,10,3,8,5,7,9,11
-
-/* level 6 */
-vmovdqa		(_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
-vmovdqa		(_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
-vmovdqa		(_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
-vmovdqa		(_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
-
-mul		10,3,9,11,14,15,8,2
-
-reduce
-update		8,4,6,5,7,10,3,9,11
-
-vmovdqa		%ymm8,(128*\off+  0)*2(%rdi)
-vmovdqa		%ymm4,(128*\off+ 16)*2(%rdi)
-vmovdqa		%ymm10,(128*\off+ 32)*2(%rdi)
-vmovdqa		%ymm3,(128*\off+ 48)*2(%rdi)
-vmovdqa		%ymm6,(128*\off+ 64)*2(%rdi)
-vmovdqa		%ymm5,(128*\off+ 80)*2(%rdi)
-vmovdqa		%ymm9,(128*\off+ 96)*2(%rdi)
-vmovdqa		%ymm11,(128*\off+112)*2(%rdi)
-.endm
-
-.text
-.global cdecl(ntt_avx)
-cdecl(ntt_avx):
-vmovdqa		_16XQ*2(%rsi),%ymm0
-
-level0		0
-level0		1
-
-levels1t6	0
-levels1t6	1
-
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.h
deleted file mode 100644
index a4f48e343b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/ntt.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include <immintrin.h>
-
-#define ntt_avx KYBER_NAMESPACE(ntt_avx)
-void ntt_avx(__m256i *r, const __m256i *qdata);
-#define invntt_avx KYBER_NAMESPACE(invntt_avx)
-void invntt_avx(__m256i *r, const __m256i *qdata);
-
-#define nttpack_avx KYBER_NAMESPACE(nttpack_avx)
-void nttpack_avx(__m256i *r, const __m256i *qdata);
-#define nttunpack_avx KYBER_NAMESPACE(nttunpack_avx)
-void nttunpack_avx(__m256i *r, const __m256i *qdata);
-
-#define basemul_avx KYBER_NAMESPACE(basemul_avx)
-void basemul_avx(__m256i *r,
-                 const __m256i *a,
-                 const __m256i *b,
-                 const __m256i *qdata);
-
-#define ntttobytes_avx KYBER_NAMESPACE(ntttobytes_avx)
-void ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *qdata);
-#define nttfrombytes_avx KYBER_NAMESPACE(nttfrombytes_avx)
-void nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *qdata);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/params.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/params.h
deleted file mode 100644
index ecfabce4a5..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/params.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#ifndef KYBER_K
-#define KYBER_K 3	/* Change this for different security strengths */
-#endif
-
-//#define KYBER_90S	/* Uncomment this if you want the 90S variant */
-
-/* Don't change parameters below this line */
-#if   (KYBER_K == 2)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber512_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_512_avx2_##s
-#endif
-#elif (KYBER_K == 3)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber768_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_768_avx2_##s
-#endif
-#elif (KYBER_K == 4)
-#ifdef KYBER_90S
-#define KYBER_NAMESPACE(s) pqcrystals_kyber1024_90s_avx2_##s
-#else
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_1024_avx2_##s
-#endif
-#else
-#error "KYBER_K must be in {2,3,4}"
-#endif
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES		384
-#define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
-
-#if KYBER_K == 2
-#define KYBER_ETA1 3
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 3
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 4
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    160
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
-#endif
-
-#define KYBER_ETA2 2
-
-#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-/* 32 bytes of additional space to save H(pk) */
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
-#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.c
deleted file mode 100644
index 681fd6d23e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.c
+++ /dev/null
@@ -1,519 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "align.h"
-#include "fips202x4.h"
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "consts.h"
-#include "reduce.h"
-#include "cbd.h"
-#include "symmetric.h"
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Compression and subsequent serialization of a polynomial.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce().
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-void poly_compress(uint8_t r[128], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 9);
-  const __m256i mask = _mm256_set1_epi16(15);
-  const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
-  const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0);
-
-  for(i=0;i<KYBER_N/64;i++) {
-    f0 = _mm256_load_si256(&a->vec[4*i+0]);
-    f1 = _mm256_load_si256(&a->vec[4*i+1]);
-    f2 = _mm256_load_si256(&a->vec[4*i+2]);
-    f3 = _mm256_load_si256(&a->vec[4*i+3]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f2 = _mm256_mulhi_epi16(f2,v);
-    f3 = _mm256_mulhi_epi16(f3,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f2 = _mm256_mulhrs_epi16(f2,shift1);
-    f3 = _mm256_mulhrs_epi16(f3,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f2 = _mm256_and_si256(f2,mask);
-    f3 = _mm256_and_si256(f3,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f2 = _mm256_packus_epi16(f2,f3);
-    f0 = _mm256_maddubs_epi16(f0,shift2);
-    f2 = _mm256_maddubs_epi16(f2,shift2);
-    f0 = _mm256_packus_epi16(f0,f2);
-    f0 = _mm256_permutevar8x32_epi32(f0,permdidx);
-    _mm256_storeu_si256((__m256i *)&r[32*i],f0);
-  }
-}
-
-void poly_decompress(poly * restrict r, const uint8_t a[128])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4,
-                                           3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0);
-  const __m256i mask = _mm256_set1_epi32(0x00F0000F);
-  const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_loadl_epi64((__m128i *)&a[8*i]);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-void poly_compress(uint8_t r[160], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 10);
-  const __m256i mask = _mm256_set1_epi16(31);
-  const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
-  const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12);
-  const __m256i shufbidx = _mm256_set_epi8( 8,-1,-1,-1,-1,-1, 4, 3, 2, 1, 0,-1,12,11,10, 9,
-                                           -1,12,11,10, 9, 8,-1,-1,-1,-1,-1 ,4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/32;i++) {
-    f0 = _mm256_load_si256(&a->vec[2*i+0]);
-    f1 = _mm256_load_si256(&a->vec[2*i+1]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f0 = _mm256_maddubs_epi16(f0,shift2);	// a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
-    f0 = _mm256_madd_epi16(f0,shift3);		// a0 a1 b0 b1 a2 a3 b2 b3
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f0 = _mm256_srlv_epi64(f0,sllvdidx);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
-    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
-    memcpy(&r[20*i+16],&t1,4);
-  }
-}
-
-void poly_decompress(poly * restrict r, const uint8_t a[160])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  int16_t ti;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(9,9,9,8,8,8,8,7,7,6,6,6,6,5,5,5,
-                                           4,4,4,3,3,3,3,2,2,1,1,1,1,0,0,0);
-  const __m256i mask = _mm256_set_epi16(248,1984,62,496,3968,124,992,31,
-                                        248,1984,62,496,3968,124,992,31);
-  const __m256i shift = _mm256_set_epi16(128,16,512,64,8,256,32,1024,
-                                         128,16,512,64,8,256,32,1024);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_loadl_epi64((__m128i *)&a[10*i+0]);
-    memcpy(&ti,&a[10*i+8],2);
-    t = _mm_insert_epi16(t,ti,4);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#endif
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial in NTT representation.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce(). The coefficients are orderd as output by
-*              poly_ntt(); the serialized output coefficients are in bitreversed
-*              order.
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYBYTES bytes)
-*              - poly *a: pointer to input polynomial
-**************************************************/
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
-{
-  ntttobytes_avx(r, a->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
-{
-  nttfrombytes_avx(r->vec, a, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly * restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
-{
-#if (KYBER_INDCPA_MSGBYTES != 32)
-#error "KYBER_INDCPA_MSGBYTES must be equal to 32!"
-#endif
-  __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
-  const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3));
-  const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0));
-  const __m256i hqs = _mm256_set1_epi16((KYBER_Q+1)/2);
-
-#define FROMMSG64(i)						\
-  g3 = _mm256_shuffle_epi32(f,0x55*i);				\
-  g3 = _mm256_sllv_epi32(g3,shift);				\
-  g3 = _mm256_shuffle_epi8(g3,idx);				\
-  g0 = _mm256_slli_epi16(g3,12);				\
-  g1 = _mm256_slli_epi16(g3,8);					\
-  g2 = _mm256_slli_epi16(g3,4);					\
-  g0 = _mm256_srai_epi16(g0,15);				\
-  g1 = _mm256_srai_epi16(g1,15);				\
-  g2 = _mm256_srai_epi16(g2,15);				\
-  g3 = _mm256_srai_epi16(g3,15);				\
-  g0 = _mm256_and_si256(g0,hqs);  /* 19 18 17 16  3  2  1  0 */	\
-  g1 = _mm256_and_si256(g1,hqs);  /* 23 22 21 20  7  6  5  4 */	\
-  g2 = _mm256_and_si256(g2,hqs);  /* 27 26 25 24 11 10  9  8 */	\
-  g3 = _mm256_and_si256(g3,hqs);  /* 31 30 29 28 15 14 13 12 */	\
-  h0 = _mm256_unpacklo_epi64(g0,g1);				\
-  h2 = _mm256_unpackhi_epi64(g0,g1);				\
-  h1 = _mm256_unpacklo_epi64(g2,g3);				\
-  h3 = _mm256_unpackhi_epi64(g2,g3);				\
-  g0 = _mm256_permute2x128_si256(h0,h1,0x20);			\
-  g2 = _mm256_permute2x128_si256(h0,h1,0x31);			\
-  g1 = _mm256_permute2x128_si256(h2,h3,0x20);			\
-  g3 = _mm256_permute2x128_si256(h2,h3,0x31);			\
-  _mm256_store_si256(&r->vec[0+2*i+0],g0);	\
-  _mm256_store_si256(&r->vec[0+2*i+1],g1);	\
-  _mm256_store_si256(&r->vec[8+2*i+0],g2);	\
-  _mm256_store_si256(&r->vec[8+2*i+1],g3)
-
-  f = _mm256_loadu_si256((__m256i *)msg);
-  FROMMSG64(0);
-  FROMMSG64(1);
-  FROMMSG64(2);
-  FROMMSG64(3);
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message.
-*              The coefficients of the input polynomial are assumed to
-*              lie in the invertal [0,q], i.e. the polynomial must be reduced
-*              by poly_reduce().
-*
-* Arguments:   - uint8_t *msg: pointer to output message
-*              - poly *a: pointer to input polynomial
-**************************************************/
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly * restrict a)
-{
-  unsigned int i;
-  uint32_t small;
-  __m256i f0, f1, g0, g1;
-  const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1)/2);
-  const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1)/4);
-
-  for(i=0;i<KYBER_N/32;i++) {
-    f0 = _mm256_load_si256(&a->vec[2*i+0]);
-    f1 = _mm256_load_si256(&a->vec[2*i+1]);
-    f0 = _mm256_sub_epi16(hq, f0);
-    f1 = _mm256_sub_epi16(hq, f1);
-    g0 = _mm256_srai_epi16(f0, 15);
-    g1 = _mm256_srai_epi16(f1, 15);
-    f0 = _mm256_xor_si256(f0, g0);
-    f1 = _mm256_xor_si256(f1, g1);
-    f0 = _mm256_sub_epi16(f0, hhq);
-    f1 = _mm256_sub_epi16(f1, hhq);
-    f0 = _mm256_packs_epi16(f0, f1);
-    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
-    small = _mm256_movemask_epi8(f0);
-    memcpy(&msg[4*i], &small, 4);
-  }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  ALIGNED_UINT8(KYBER_ETA1*KYBER_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1
-  prf(buf.coeffs, KYBER_ETA1*KYBER_N/4, seed, nonce);
-  poly_cbd_eta1(r, buf.vec);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  ALIGNED_UINT8(KYBER_ETA2*KYBER_N/4) buf;
-  prf(buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce);
-  poly_cbd_eta2(r, buf.vec);
-}
-
-#ifndef KYBER_90S
-#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
-void poly_getnoise_eta1_4x(poly *r0,
-                           poly *r1,
-                           poly *r2,
-                           poly *r3,
-                           const uint8_t seed[32],
-                           uint8_t nonce0,
-                           uint8_t nonce1,
-                           uint8_t nonce2,
-                           uint8_t nonce3)
-{
-  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
-  __m256i f;
-  shake256x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  buf[0].coeffs[32] = nonce0;
-  buf[1].coeffs[32] = nonce1;
-  buf[2].coeffs[32] = nonce2;
-  buf[3].coeffs[32] = nonce3;
-
-  shake256x4_inc_init(&state);
-  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
-  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
-  shake256x4_inc_ctx_release(&state);
-
-  poly_cbd_eta1(r0, buf[0].vec);
-  poly_cbd_eta1(r1, buf[1].vec);
-  poly_cbd_eta1(r2, buf[2].vec);
-  poly_cbd_eta1(r3, buf[3].vec);
-}
-
-#if KYBER_K == 2
-void poly_getnoise_eta1122_4x(poly *r0,
-                              poly *r1,
-                              poly *r2,
-                              poly *r3,
-                              const uint8_t seed[32],
-                              uint8_t nonce0,
-                              uint8_t nonce1,
-                              uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
-  __m256i f;
-  shake256x4incctx state;
-
-  f = _mm256_loadu_si256((__m256i *)seed);
-  _mm256_store_si256(buf[0].vec, f);
-  _mm256_store_si256(buf[1].vec, f);
-  _mm256_store_si256(buf[2].vec, f);
-  _mm256_store_si256(buf[3].vec, f);
-
-  buf[0].coeffs[32] = nonce0;
-  buf[1].coeffs[32] = nonce1;
-  buf[2].coeffs[32] = nonce2;
-  buf[3].coeffs[32] = nonce3;
-
-  shake256x4_inc_init(&state);
-  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
-  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
-  shake256x4_inc_ctx_release(&state);
-
-  poly_cbd_eta1(r0, buf[0].vec);
-  poly_cbd_eta1(r1, buf[1].vec);
-  poly_cbd_eta2(r2, buf[2].vec);
-  poly_cbd_eta2(r3, buf[3].vec);
-}
-#endif
-#endif
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place.
-*              Input coefficients assumed to be in normal order,
-*              output coefficients are in special order that is natural
-*              for the vectorization. Input coefficients are assumed to be
-*              bounded by q in absolute value, output coefficients are bounded
-*              by 16118 in absolute value.
-*
-* Arguments:   - poly *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r)
-{
-  ntt_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
-*              of a polynomial in place;
-*              Input coefficients assumed to be in special order from vectorized
-*              forward ntt, output in normal order. Input coefficients can be
-*              arbitrary 16-bit integers, output coefficients are bounded by 14870
-*              in absolute value.
-*
-* Arguments:   - poly *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *r)
-{
-  invntt_avx(r->vec, qdata.vec);
-}
-
-void poly_nttunpack(poly *r)
-{
-  nttunpack_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_basemul_montgomery
-*
-* Description: Multiplication of two polynomials in NTT domain.
-*              One of the input polynomials needs to have coefficients
-*              bounded by q, the other polynomial can have arbitrary
-*              coefficients. Output coefficients are bounded by 6656.
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
-{
-  basemul_avx(r->vec, a->vec, b->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_tomont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from normal domain to Montgomery domain
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_tomont(poly *r)
-{
-  tomont_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r)
-{
-  reduce_avx(r->vec, qdata.vec);
-}
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials. No modular reduction
-*              is performed.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  __m256i f0, f1;
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_load_si256(&b->vec[i]);
-    f0 = _mm256_add_epi16(f0, f1);
-    _mm256_store_si256(&r->vec[i], f0);
-  }
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials. No modular reduction
-*              is performed.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  __m256i f0, f1;
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_load_si256(&b->vec[i]);
-    f0 = _mm256_sub_epi16(f0, f1);
-    _mm256_store_si256(&r->vec[i], f0);
-  }
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.h
deleted file mode 100644
index 6a9cf71c70..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/poly.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "align.h"
-#include "params.h"
-
-typedef ALIGNED_INT16(KYBER_N) poly;
-
-#define poly_compress KYBER_NAMESPACE(poly_compress)
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
-#define poly_decompress KYBER_NAMESPACE(poly_decompress)
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
-
-#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
-#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
-
-#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
-#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
-
-#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#ifndef KYBER_90S
-#define poly_getnoise_eta1_4x KYBER_NAMESPACE(poly_getnoise_eta2_4x)
-void poly_getnoise_eta1_4x(poly *r0,
-                           poly *r1,
-                           poly *r2,
-                           poly *r3,
-                           const uint8_t seed[32],
-                           uint8_t nonce0,
-                           uint8_t nonce1,
-                           uint8_t nonce2,
-                           uint8_t nonce3);
-
-#if KYBER_K == 2
-#define poly_getnoise_eta1122_4x KYBER_NAMESPACE(poly_getnoise_eta1122_4x)
-void poly_getnoise_eta1122_4x(poly *r0,
-                              poly *r1,
-                              poly *r2,
-                              poly *r3,
-                              const uint8_t seed[32],
-                              uint8_t nonce0,
-                              uint8_t nonce1,
-                              uint8_t nonce2,
-                              uint8_t nonce3);
-#endif
-#endif
-
-
-#define poly_ntt KYBER_NAMESPACE(poly_ntt)
-void poly_ntt(poly *r);
-#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *r);
-#define poly_nttunpack KYBER_NAMESPACE(poly_nttunpack)
-void poly_nttunpack(poly *r);
-#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
-#define poly_tomont KYBER_NAMESPACE(poly_tomont)
-void poly_tomont(poly *r);
-
-#define poly_reduce KYBER_NAMESPACE(poly_reduce)
-void poly_reduce(poly *r);
-
-#define poly_add KYBER_NAMESPACE(poly_add)
-void poly_add(poly *r, const poly *a, const poly *b);
-#define poly_sub KYBER_NAMESPACE(poly_sub)
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.c
deleted file mode 100644
index a0174b7b3f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.c
+++ /dev/null
@@ -1,307 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "params.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "consts.h"
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-static void poly_compress10(uint8_t r[320], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i v8 = _mm256_slli_epi16(v,3);
-  const __m256i off = _mm256_set1_epi16(15);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 12);
-  const __m256i mask = _mm256_set1_epi16(1023);
-  const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12);
-  const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9,
-                                           -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_mullo_epi16(f0,v8);
-    f2 = _mm256_add_epi16(f0,off);
-    f0 = _mm256_slli_epi16(f0,3);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f2 = _mm256_sub_epi16(f1,f2);
-    f1 = _mm256_andnot_si256(f1,f2);
-    f1 = _mm256_srli_epi16(f1,15);
-    f0 = _mm256_sub_epi16(f0,f1);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f0 = _mm256_madd_epi16(f0,shift2);
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f0 = _mm256_srli_epi64(f0,12);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blend_epi16(t0,t1,0xE0);
-    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
-    memcpy(&r[20*i+16],&t1,4);
-  }
-}
-
-static void poly_decompress10(poly * restrict r, const uint8_t a[320+12])
-{
-  unsigned int i;
-  __m256i f;
-  const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4*KYBER_Q);
-  const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7,
-                                            6, 5, 5, 4, 4, 3, 3, 2,
-                                            9, 8, 8, 7, 7, 6, 6, 5,
-                                            4, 3, 3, 2, 2, 1, 1, 0);
-  const __m256i sllvdidx = _mm256_set1_epi64x(4);
-  const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[20*i]);
-    f = _mm256_permute4x64_epi64(f,0x94);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_sllv_epi32(f,sllvdidx);
-    f = _mm256_srli_epi16(f,1);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-static void poly_compress11(uint8_t r[352+2], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i v8 = _mm256_slli_epi16(v,3);
-  const __m256i off = _mm256_set1_epi16(36);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 13);
-  const __m256i mask = _mm256_set1_epi16(2047);
-  const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(10);
-  const __m256i srlvqidx = _mm256_set_epi64x(30,10,30,10);
-  const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0,-1,-1,-1,-1,10, 9, 8, 7, 6, 5,
-                                           -1,-1,-1,-1,-1,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f0 = _mm256_load_si256(&a->vec[i]);
-    f1 = _mm256_mullo_epi16(f0,v8);
-    f2 = _mm256_add_epi16(f0,off);
-    f0 = _mm256_slli_epi16(f0,3);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f2 = _mm256_sub_epi16(f1,f2);
-    f1 = _mm256_andnot_si256(f1,f2);
-    f1 = _mm256_srli_epi16(f1,15);
-    f0 = _mm256_sub_epi16(f0,f1);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f0 = _mm256_madd_epi16(f0,shift2);
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f1 = _mm256_bsrli_epi128(f0,8);
-    f0 = _mm256_srlv_epi64(f0,srlvqidx);
-    f1 = _mm256_slli_epi64(f1,34);
-    f0 = _mm256_add_epi64(f0,f1);
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
-    _mm_storeu_si128((__m128i *)&r[22*i+ 0],t0);
-    _mm_storel_epi64((__m128i *)&r[22*i+16],t1);
-  }
-}
-
-static void poly_decompress11(poly * restrict r, const uint8_t a[352+10])
-{
-  unsigned int i;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(13,12,12,11,10, 9, 9, 8,
-                                            8, 7, 6, 5, 5, 4, 4, 3,
-                                           10, 9, 9, 8, 7, 6, 6, 5,
-                                            5, 4, 3, 2, 2, 1, 1, 0);
-  const __m256i srlvdidx = _mm256_set_epi32(0,0,1,0,0,0,1,0);
-  const __m256i srlvqidx = _mm256_set_epi64x(2,0,2,0);
-  const __m256i shift = _mm256_set_epi16(4,32,1,8,32,1,4,32,4,32,1,8,32,1,4,32);
-  const __m256i mask = _mm256_set1_epi16(32752);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[22*i]);
-    f = _mm256_permute4x64_epi64(f,0x94);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_srlv_epi32(f,srlvdidx);
-    f = _mm256_srlv_epi64(f,srlvqidx);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_srli_epi16(f,1);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#endif
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a)
-{
-  unsigned int i;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-    poly_compress10(&r[320*i],&a->vec[i]);
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-    poly_compress11(&r[352*i],&a->vec[i]);
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r: pointer to output vector of polynomials
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12])
-{
-  unsigned int i;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-    poly_decompress10(&r->vec[i],&a[320*i]);
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-    poly_decompress11(&r->vec[i],&a[352*i]);
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECBYTES)
-*              - polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials
-*                                  (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt_tomont
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*              and multiply by Montgomery factor 2^16
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt_tomont(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_basemul_acc_montgomery
-*
-* Description: Multiply elements in a and b in NTT domain, accumulate into r,
-*              and multiply by 2^-16.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  poly tmp;
-
-  poly_basemul_montgomery(r,&a->vec[0],&b->vec[0]);
-  for(i=1;i<KYBER_K;i++) {
-    poly_basemul_montgomery(&tmp,&a->vec[i],&b->vec[i]);
-    poly_add(r,r,&tmp);
-  }
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials;
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - polyvec *r: pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r:       pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.h
deleted file mode 100644
index 2ce23c31ff..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/polyvec.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-typedef struct{
-  poly vec[KYBER_K];
-} polyvec;
-
-#define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a);
-#define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12]);
-
-#define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
-#define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
-
-#define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
-void polyvec_ntt(polyvec *r);
-#define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
-void polyvec_invntt_tomont(polyvec *r);
-
-#define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
-
-#define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
-void polyvec_reduce(polyvec *r);
-
-#define polyvec_add KYBER_NAMESPACE(polyvec_add)
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/reduce.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/reduce.h
deleted file mode 100644
index 5368185b5f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/reduce.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include "params.h"
-#include <immintrin.h>
-
-#define reduce_avx KYBER_NAMESPACE(reduce_avx)
-void reduce_avx(__m256i *r, const __m256i *qdata);
-#define tomont_avx KYBER_NAMESPACE(tomont_avx)
-void tomont_avx(__m256i *r, const __m256i *qdata);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.c
deleted file mode 100644
index 9060a44cb9..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.c
+++ /dev/null
@@ -1,398 +0,0 @@
-#include <stdint.h>
-#include <immintrin.h>
-#include <string.h>
-#include "params.h"
-#include "consts.h"
-#include "rejsample.h"
-
-//#define BMI
-
-#ifndef BMI
-static const uint8_t idx[256][8] = {
-  {-1, -1, -1, -1, -1, -1, -1, -1},
-  { 0, -1, -1, -1, -1, -1, -1, -1},
-  { 2, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  2, -1, -1, -1, -1, -1, -1},
-  { 4, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  4, -1, -1, -1, -1, -1, -1},
-  { 2,  4, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  4, -1, -1, -1, -1, -1},
-  { 6, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  6, -1, -1, -1, -1, -1, -1},
-  { 2,  6, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  6, -1, -1, -1, -1, -1},
-  { 4,  6, -1, -1, -1, -1, -1, -1},
-  { 0,  4,  6, -1, -1, -1, -1, -1},
-  { 2,  4,  6, -1, -1, -1, -1, -1},
-  { 0,  2,  4,  6, -1, -1, -1, -1},
-  { 8, -1, -1, -1, -1, -1, -1, -1},
-  { 0,  8, -1, -1, -1, -1, -1, -1},
-  { 2,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  2,  8, -1, -1, -1, -1, -1},
-  { 4,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  4,  8, -1, -1, -1, -1, -1},
-  { 2,  4,  8, -1, -1, -1, -1, -1},
-  { 0,  2,  4,  8, -1, -1, -1, -1},
-  { 6,  8, -1, -1, -1, -1, -1, -1},
-  { 0,  6,  8, -1, -1, -1, -1, -1},
-  { 2,  6,  8, -1, -1, -1, -1, -1},
-  { 0,  2,  6,  8, -1, -1, -1, -1},
-  { 4,  6,  8, -1, -1, -1, -1, -1},
-  { 0,  4,  6,  8, -1, -1, -1, -1},
-  { 2,  4,  6,  8, -1, -1, -1, -1},
-  { 0,  2,  4,  6,  8, -1, -1, -1},
-  {10, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 10, -1, -1, -1, -1, -1, -1},
-  { 2, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 10, -1, -1, -1, -1, -1},
-  { 4, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 10, -1, -1, -1, -1, -1},
-  { 2,  4, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 10, -1, -1, -1, -1},
-  { 6, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 10, -1, -1, -1, -1, -1},
-  { 2,  6, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 10, -1, -1, -1, -1},
-  { 4,  6, 10, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 10, -1, -1, -1, -1},
-  { 2,  4,  6, 10, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 10, -1, -1, -1},
-  { 8, 10, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 10, -1, -1, -1, -1, -1},
-  { 2,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 10, -1, -1, -1, -1},
-  { 4,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 10, -1, -1, -1, -1},
-  { 2,  4,  8, 10, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 10, -1, -1, -1},
-  { 6,  8, 10, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 10, -1, -1, -1, -1},
-  { 2,  6,  8, 10, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 10, -1, -1, -1},
-  { 4,  6,  8, 10, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 10, -1, -1, -1},
-  { 2,  4,  6,  8, 10, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 10, -1, -1},
-  {12, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 12, -1, -1, -1, -1, -1, -1},
-  { 2, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 12, -1, -1, -1, -1, -1},
-  { 4, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 12, -1, -1, -1, -1, -1},
-  { 2,  4, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 12, -1, -1, -1, -1},
-  { 6, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 12, -1, -1, -1, -1, -1},
-  { 2,  6, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 12, -1, -1, -1, -1},
-  { 4,  6, 12, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 12, -1, -1, -1, -1},
-  { 2,  4,  6, 12, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 12, -1, -1, -1},
-  { 8, 12, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 12, -1, -1, -1, -1, -1},
-  { 2,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 12, -1, -1, -1, -1},
-  { 4,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 12, -1, -1, -1, -1},
-  { 2,  4,  8, 12, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 12, -1, -1, -1},
-  { 6,  8, 12, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 12, -1, -1, -1, -1},
-  { 2,  6,  8, 12, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 12, -1, -1, -1},
-  { 4,  6,  8, 12, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 12, -1, -1, -1},
-  { 2,  4,  6,  8, 12, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 12, -1, -1},
-  {10, 12, -1, -1, -1, -1, -1, -1},
-  { 0, 10, 12, -1, -1, -1, -1, -1},
-  { 2, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  2, 10, 12, -1, -1, -1, -1},
-  { 4, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  4, 10, 12, -1, -1, -1, -1},
-  { 2,  4, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  4, 10, 12, -1, -1, -1},
-  { 6, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  6, 10, 12, -1, -1, -1, -1},
-  { 2,  6, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  6, 10, 12, -1, -1, -1},
-  { 4,  6, 10, 12, -1, -1, -1, -1},
-  { 0,  4,  6, 10, 12, -1, -1, -1},
-  { 2,  4,  6, 10, 12, -1, -1, -1},
-  { 0,  2,  4,  6, 10, 12, -1, -1},
-  { 8, 10, 12, -1, -1, -1, -1, -1},
-  { 0,  8, 10, 12, -1, -1, -1, -1},
-  { 2,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  2,  8, 10, 12, -1, -1, -1},
-  { 4,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  4,  8, 10, 12, -1, -1, -1},
-  { 2,  4,  8, 10, 12, -1, -1, -1},
-  { 0,  2,  4,  8, 10, 12, -1, -1},
-  { 6,  8, 10, 12, -1, -1, -1, -1},
-  { 0,  6,  8, 10, 12, -1, -1, -1},
-  { 2,  6,  8, 10, 12, -1, -1, -1},
-  { 0,  2,  6,  8, 10, 12, -1, -1},
-  { 4,  6,  8, 10, 12, -1, -1, -1},
-  { 0,  4,  6,  8, 10, 12, -1, -1},
-  { 2,  4,  6,  8, 10, 12, -1, -1},
-  { 0,  2,  4,  6,  8, 10, 12, -1},
-  {14, -1, -1, -1, -1, -1, -1, -1},
-  { 0, 14, -1, -1, -1, -1, -1, -1},
-  { 2, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  2, 14, -1, -1, -1, -1, -1},
-  { 4, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  4, 14, -1, -1, -1, -1, -1},
-  { 2,  4, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  4, 14, -1, -1, -1, -1},
-  { 6, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  6, 14, -1, -1, -1, -1, -1},
-  { 2,  6, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  6, 14, -1, -1, -1, -1},
-  { 4,  6, 14, -1, -1, -1, -1, -1},
-  { 0,  4,  6, 14, -1, -1, -1, -1},
-  { 2,  4,  6, 14, -1, -1, -1, -1},
-  { 0,  2,  4,  6, 14, -1, -1, -1},
-  { 8, 14, -1, -1, -1, -1, -1, -1},
-  { 0,  8, 14, -1, -1, -1, -1, -1},
-  { 2,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  2,  8, 14, -1, -1, -1, -1},
-  { 4,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  4,  8, 14, -1, -1, -1, -1},
-  { 2,  4,  8, 14, -1, -1, -1, -1},
-  { 0,  2,  4,  8, 14, -1, -1, -1},
-  { 6,  8, 14, -1, -1, -1, -1, -1},
-  { 0,  6,  8, 14, -1, -1, -1, -1},
-  { 2,  6,  8, 14, -1, -1, -1, -1},
-  { 0,  2,  6,  8, 14, -1, -1, -1},
-  { 4,  6,  8, 14, -1, -1, -1, -1},
-  { 0,  4,  6,  8, 14, -1, -1, -1},
-  { 2,  4,  6,  8, 14, -1, -1, -1},
-  { 0,  2,  4,  6,  8, 14, -1, -1},
-  {10, 14, -1, -1, -1, -1, -1, -1},
-  { 0, 10, 14, -1, -1, -1, -1, -1},
-  { 2, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  2, 10, 14, -1, -1, -1, -1},
-  { 4, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  4, 10, 14, -1, -1, -1, -1},
-  { 2,  4, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  4, 10, 14, -1, -1, -1},
-  { 6, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  6, 10, 14, -1, -1, -1, -1},
-  { 2,  6, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  6, 10, 14, -1, -1, -1},
-  { 4,  6, 10, 14, -1, -1, -1, -1},
-  { 0,  4,  6, 10, 14, -1, -1, -1},
-  { 2,  4,  6, 10, 14, -1, -1, -1},
-  { 0,  2,  4,  6, 10, 14, -1, -1},
-  { 8, 10, 14, -1, -1, -1, -1, -1},
-  { 0,  8, 10, 14, -1, -1, -1, -1},
-  { 2,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  2,  8, 10, 14, -1, -1, -1},
-  { 4,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  4,  8, 10, 14, -1, -1, -1},
-  { 2,  4,  8, 10, 14, -1, -1, -1},
-  { 0,  2,  4,  8, 10, 14, -1, -1},
-  { 6,  8, 10, 14, -1, -1, -1, -1},
-  { 0,  6,  8, 10, 14, -1, -1, -1},
-  { 2,  6,  8, 10, 14, -1, -1, -1},
-  { 0,  2,  6,  8, 10, 14, -1, -1},
-  { 4,  6,  8, 10, 14, -1, -1, -1},
-  { 0,  4,  6,  8, 10, 14, -1, -1},
-  { 2,  4,  6,  8, 10, 14, -1, -1},
-  { 0,  2,  4,  6,  8, 10, 14, -1},
-  {12, 14, -1, -1, -1, -1, -1, -1},
-  { 0, 12, 14, -1, -1, -1, -1, -1},
-  { 2, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  2, 12, 14, -1, -1, -1, -1},
-  { 4, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  4, 12, 14, -1, -1, -1, -1},
-  { 2,  4, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  4, 12, 14, -1, -1, -1},
-  { 6, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  6, 12, 14, -1, -1, -1, -1},
-  { 2,  6, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  6, 12, 14, -1, -1, -1},
-  { 4,  6, 12, 14, -1, -1, -1, -1},
-  { 0,  4,  6, 12, 14, -1, -1, -1},
-  { 2,  4,  6, 12, 14, -1, -1, -1},
-  { 0,  2,  4,  6, 12, 14, -1, -1},
-  { 8, 12, 14, -1, -1, -1, -1, -1},
-  { 0,  8, 12, 14, -1, -1, -1, -1},
-  { 2,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  2,  8, 12, 14, -1, -1, -1},
-  { 4,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  4,  8, 12, 14, -1, -1, -1},
-  { 2,  4,  8, 12, 14, -1, -1, -1},
-  { 0,  2,  4,  8, 12, 14, -1, -1},
-  { 6,  8, 12, 14, -1, -1, -1, -1},
-  { 0,  6,  8, 12, 14, -1, -1, -1},
-  { 2,  6,  8, 12, 14, -1, -1, -1},
-  { 0,  2,  6,  8, 12, 14, -1, -1},
-  { 4,  6,  8, 12, 14, -1, -1, -1},
-  { 0,  4,  6,  8, 12, 14, -1, -1},
-  { 2,  4,  6,  8, 12, 14, -1, -1},
-  { 0,  2,  4,  6,  8, 12, 14, -1},
-  {10, 12, 14, -1, -1, -1, -1, -1},
-  { 0, 10, 12, 14, -1, -1, -1, -1},
-  { 2, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  2, 10, 12, 14, -1, -1, -1},
-  { 4, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  4, 10, 12, 14, -1, -1, -1},
-  { 2,  4, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  4, 10, 12, 14, -1, -1},
-  { 6, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  6, 10, 12, 14, -1, -1, -1},
-  { 2,  6, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  6, 10, 12, 14, -1, -1},
-  { 4,  6, 10, 12, 14, -1, -1, -1},
-  { 0,  4,  6, 10, 12, 14, -1, -1},
-  { 2,  4,  6, 10, 12, 14, -1, -1},
-  { 0,  2,  4,  6, 10, 12, 14, -1},
-  { 8, 10, 12, 14, -1, -1, -1, -1},
-  { 0,  8, 10, 12, 14, -1, -1, -1},
-  { 2,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  2,  8, 10, 12, 14, -1, -1},
-  { 4,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  4,  8, 10, 12, 14, -1, -1},
-  { 2,  4,  8, 10, 12, 14, -1, -1},
-  { 0,  2,  4,  8, 10, 12, 14, -1},
-  { 6,  8, 10, 12, 14, -1, -1, -1},
-  { 0,  6,  8, 10, 12, 14, -1, -1},
-  { 2,  6,  8, 10, 12, 14, -1, -1},
-  { 0,  2,  6,  8, 10, 12, 14, -1},
-  { 4,  6,  8, 10, 12, 14, -1, -1},
-  { 0,  4,  6,  8, 10, 12, 14, -1},
-  { 2,  4,  6,  8, 10, 12, 14, -1},
-  { 0,  2,  4,  6,  8, 10, 12, 14}
-};
-#endif
-
-#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
-#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
-
-unsigned int rej_uniform_avx(int16_t * restrict r, const uint8_t *buf)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-  uint32_t good;
-#ifdef BMI
-  uint64_t idx0, idx1, idx2, idx3;
-#endif
-  const __m256i bound  = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i ones   = _mm256_set1_epi8(1);
-  const __m256i mask  = _mm256_set1_epi16(0xFFF);
-  const __m256i idx8  = _mm256_set_epi8(15,14,14,13,12,11,11,10,
-                                         9, 8, 8, 7, 6, 5, 5, 4,
-                                        11,10,10, 9, 8, 7, 7, 6,
-                                         5, 4, 4, 3, 2, 1, 1, 0);
-  __m256i f0, f1, g0, g1, g2, g3;
-  __m128i f, t, pilo, pihi;
-
-  ctr = pos = 0;
-  while(ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 56) {
-    f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
-    f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]);
-    f0 = _mm256_permute4x64_epi64(f0, 0x94);
-    f1 = _mm256_permute4x64_epi64(f1, 0x94);
-    f0 = _mm256_shuffle_epi8(f0, idx8);
-    f1 = _mm256_shuffle_epi8(f1, idx8);
-    g0 = _mm256_srli_epi16(f0, 4);
-    g1 = _mm256_srli_epi16(f1, 4);
-    f0 = _mm256_blend_epi16(f0, g0, 0xAA);
-    f1 = _mm256_blend_epi16(f1, g1, 0xAA);
-    f0 = _mm256_and_si256(f0, mask);
-    f1 = _mm256_and_si256(f1, mask);
-    pos += 48;
-
-    g0 = _mm256_cmpgt_epi16(bound, f0);
-    g1 = _mm256_cmpgt_epi16(bound, f1);
-
-    g0 = _mm256_packs_epi16(g0, g1);
-    good = _mm256_movemask_epi8(g0);
-
-#ifdef BMI
-    idx0 = _pdep_u64(good >>  0, 0x0101010101010101);
-    idx1 = _pdep_u64(good >>  8, 0x0101010101010101);
-    idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
-    idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
-    idx0 = (idx0 << 8) - idx0;
-    idx0  = _pext_u64(0x0E0C0A0806040200, idx0);
-    idx1 = (idx1 << 8) - idx1;
-    idx1  = _pext_u64(0x0E0C0A0806040200, idx1);
-    idx2 = (idx2 << 8) - idx2;
-    idx2  = _pext_u64(0x0E0C0A0806040200, idx2);
-    idx3 = (idx3 << 8) - idx3;
-    idx3  = _pext_u64(0x0E0C0A0806040200, idx3);
-
-    g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
-    g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
-    g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
-    g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);
-#else
-    g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >>  0) & 0xFF]));
-    g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >>  8) & 0xFF]));
-    g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1);
-    g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1);
-#endif
-
-    g2 = _mm256_add_epi8(g0, ones);
-    g3 = _mm256_add_epi8(g1, ones);
-    g0 = _mm256_unpacklo_epi8(g0, g2);
-    g1 = _mm256_unpacklo_epi8(g1, g3);
-
-    f0 = _mm256_shuffle_epi8(f0, g0);
-    f1 = _mm256_shuffle_epi8(f1, g1);
-
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
-    ctr += _mm_popcnt_u32((good >>  0) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
-    ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
-    ctr += _mm_popcnt_u32((good >>  8) & 0xFF);
-    _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
-    ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
-  }
-
-  while(ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 16) {
-    f = _mm_loadu_si128((__m128i *)&buf[pos]);
-    f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
-    t = _mm_srli_epi16(f, 4);
-    f = _mm_blend_epi16(f, t, 0xAA);
-    f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
-    pos += 12;
-
-    t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
-    good = _mm_movemask_epi8(t);
-
-#ifdef BMI
-    good &= 0x5555;
-    idx0 = _pdep_u64(good, 0x1111111111111111);
-    idx0 = (idx0 << 8) - idx0;
-    idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
-    pilo = _mm_cvtsi64_si128(idx0);
-#else
-    good = _pext_u32(good, 0x5555);
-    pilo = _mm_loadl_epi64((__m128i *)&idx[good]);
-#endif
-
-    pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
-    pilo = _mm_unpacklo_epi8(pilo, pihi);
-    f = _mm_shuffle_epi8(f, pilo);
-    _mm_storeu_si128((__m128i *)&r[ctr], f);
-    ctr += _mm_popcnt_u32(good);
-  }
-
-  while(ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) {
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4));
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(val1 < KYBER_Q && ctr < KYBER_N)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.h
deleted file mode 100644
index 3be5e2192e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/rejsample.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef REJSAMPLE_H
-#define REJSAMPLE_H
-
-#include <stdint.h>
-#include "params.h"
-#include "symmetric.h"
-
-#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
-#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES)
-
-#define rej_uniform_avx KYBER_NAMESPACE(rej_uniform_avx)
-unsigned int rej_uniform_avx(int16_t *r, const uint8_t *buf);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.S b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.S
deleted file mode 100644
index 18325ebec0..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.S
+++ /dev/null
@@ -1,255 +0,0 @@
-#include "consts.h"
-.include "fq.inc"
-.include "shuffle.inc"
-
-/*
-nttpack_avx:
-#load
-vmovdqa		(%rdi),%ymm4
-vmovdqa		32(%rdi),%ymm5
-vmovdqa		64(%rdi),%ymm6
-vmovdqa		96(%rdi),%ymm7
-vmovdqa		128(%rdi),%ymm8
-vmovdqa		160(%rdi),%ymm9
-vmovdqa		192(%rdi),%ymm10
-vmovdqa		224(%rdi),%ymm11
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-shuffle1	10,11,8,11
-
-shuffle2	3,4,10,4
-shuffle2	6,8,3,8
-shuffle2	5,7,6,7
-shuffle2	9,11,5,11
-
-shuffle4	10,3,9,3
-shuffle4	6,5,10,5
-shuffle4	4,8,6,8
-shuffle4	7,11,4,11
-
-shuffle8	9,10,7,10
-shuffle8	6,4,9,4
-shuffle8	3,5,6,5
-shuffle8	8,11,3,11
-
-#store
-vmovdqa		%ymm7,(%rdi)
-vmovdqa		%ymm9,32(%rdi)
-vmovdqa		%ymm6,64(%rdi)
-vmovdqa		%ymm3,96(%rdi)
-vmovdqa		%ymm10,128(%rdi)
-vmovdqa		%ymm4,160(%rdi)
-vmovdqa		%ymm5,192(%rdi)
-vmovdqa		%ymm11,224(%rdi)
-
-ret
-*/
-
-.text
-nttunpack128_avx:
-#load
-vmovdqa		(%rdi),%ymm4
-vmovdqa		32(%rdi),%ymm5
-vmovdqa		64(%rdi),%ymm6
-vmovdqa		96(%rdi),%ymm7
-vmovdqa		128(%rdi),%ymm8
-vmovdqa		160(%rdi),%ymm9
-vmovdqa		192(%rdi),%ymm10
-vmovdqa		224(%rdi),%ymm11
-
-shuffle8	4,8,3,8
-shuffle8	5,9,4,9
-shuffle8	6,10,5,10
-shuffle8	7,11,6,11
-
-shuffle4	3,5,7,5
-shuffle4	8,10,3,10
-shuffle4	4,6,8,6
-shuffle4	9,11,4,11
-
-shuffle2	7,8,9,8
-shuffle2	5,6,7,6
-shuffle2	3,4,5,4
-shuffle2	10,11,3,11
-
-shuffle1	9,5,10,5
-shuffle1	8,4,9,4
-shuffle1	7,3,8,3
-shuffle1	6,11,7,11
-
-#store
-vmovdqa		%ymm10,(%rdi)
-vmovdqa		%ymm5,32(%rdi)
-vmovdqa		%ymm9,64(%rdi)
-vmovdqa		%ymm4,96(%rdi)
-vmovdqa		%ymm8,128(%rdi)
-vmovdqa		%ymm3,160(%rdi)
-vmovdqa		%ymm7,192(%rdi)
-vmovdqa		%ymm11,224(%rdi)
-
-ret
-
-.global cdecl(nttunpack_avx)
-cdecl(nttunpack_avx):
-call		nttunpack128_avx
-add		$256,%rdi
-call		nttunpack128_avx
-ret
-
-ntttobytes128_avx:
-#load
-vmovdqa		(%rsi),%ymm5
-vmovdqa		32(%rsi),%ymm6
-vmovdqa		64(%rsi),%ymm7
-vmovdqa		96(%rsi),%ymm8
-vmovdqa		128(%rsi),%ymm9
-vmovdqa		160(%rsi),%ymm10
-vmovdqa		192(%rsi),%ymm11
-vmovdqa		224(%rsi),%ymm12
-
-#csubq
-csubq		5,13
-csubq		6,13
-csubq		7,13
-csubq		8,13
-csubq		9,13
-csubq		10,13
-csubq		11,13
-csubq		12,13
-
-#bitpack
-vpsllw		$12,%ymm6,%ymm4
-vpor		%ymm4,%ymm5,%ymm4
-
-vpsrlw		$4,%ymm6,%ymm5
-vpsllw		$8,%ymm7,%ymm6
-vpor		%ymm5,%ymm6,%ymm5
-
-vpsrlw		$8,%ymm7,%ymm6
-vpsllw		$4,%ymm8,%ymm7
-vpor		%ymm6,%ymm7,%ymm6
-
-vpsllw		$12,%ymm10,%ymm7
-vpor		%ymm7,%ymm9,%ymm7
-
-vpsrlw		$4,%ymm10,%ymm8
-vpsllw		$8,%ymm11,%ymm9
-vpor		%ymm8,%ymm9,%ymm8
-
-vpsrlw		$8,%ymm11,%ymm9
-vpsllw		$4,%ymm12,%ymm10
-vpor		%ymm9,%ymm10,%ymm9
-
-shuffle1	4,5,3,5
-shuffle1	6,7,4,7
-shuffle1	8,9,6,9
-
-shuffle2	3,4,8,4
-shuffle2	6,5,3,5
-shuffle2	7,9,6,9
-
-shuffle4	8,3,7,3
-shuffle4	6,4,8,4
-shuffle4	5,9,6,9
-
-shuffle8	7,8,5,8
-shuffle8	6,3,7,3
-shuffle8	4,9,6,9
-
-#store
-vmovdqu		%ymm5,(%rdi)
-vmovdqu		%ymm7,32(%rdi)
-vmovdqu		%ymm6,64(%rdi)
-vmovdqu		%ymm8,96(%rdi)
-vmovdqu		%ymm3,128(%rdi)
-vmovdqu		%ymm9,160(%rdi)
-
-ret
-
-.global cdecl(ntttobytes_avx)
-cdecl(ntttobytes_avx):
-#consts
-vmovdqa		_16XQ*2(%rdx),%ymm0
-call		ntttobytes128_avx
-add		$256,%rsi
-add		$192,%rdi
-call		ntttobytes128_avx
-ret
-
-nttfrombytes128_avx:
-#load
-vmovdqu		(%rsi),%ymm4
-vmovdqu		32(%rsi),%ymm5
-vmovdqu		64(%rsi),%ymm6
-vmovdqu		96(%rsi),%ymm7
-vmovdqu		128(%rsi),%ymm8
-vmovdqu		160(%rsi),%ymm9
-
-shuffle8	4,7,3,7
-shuffle8	5,8,4,8
-shuffle8	6,9,5,9
-
-shuffle4	3,8,6,8
-shuffle4	7,5,3,5
-shuffle4	4,9,7,9
-
-shuffle2	6,5,4,5
-shuffle2	8,7,6,7
-shuffle2	3,9,8,9
-
-shuffle1	4,7,10,7
-shuffle1	5,8,4,8
-shuffle1	6,9,5,9
-
-#bitunpack
-vpsrlw		$12,%ymm10,%ymm11
-vpsllw		$4,%ymm7,%ymm12
-vpor		%ymm11,%ymm12,%ymm11
-vpand		%ymm0,%ymm10,%ymm10
-vpand		%ymm0,%ymm11,%ymm11
-
-vpsrlw		$8,%ymm7,%ymm12
-vpsllw		$8,%ymm4,%ymm13
-vpor		%ymm12,%ymm13,%ymm12
-vpand		%ymm0,%ymm12,%ymm12
-
-vpsrlw		$4,%ymm4,%ymm13
-vpand		%ymm0,%ymm13,%ymm13
-
-vpsrlw		$12,%ymm8,%ymm14
-vpsllw		$4,%ymm5,%ymm15
-vpor		%ymm14,%ymm15,%ymm14
-vpand		%ymm0,%ymm8,%ymm8
-vpand		%ymm0,%ymm14,%ymm14
-
-vpsrlw		$8,%ymm5,%ymm15
-vpsllw		$8,%ymm9,%ymm1
-vpor		%ymm15,%ymm1,%ymm15
-vpand		%ymm0,%ymm15,%ymm15
-
-vpsrlw		$4,%ymm9,%ymm1
-vpand		%ymm0,%ymm1,%ymm1
-
-#store
-vmovdqa		%ymm10,(%rdi)
-vmovdqa		%ymm11,32(%rdi)
-vmovdqa		%ymm12,64(%rdi)
-vmovdqa		%ymm13,96(%rdi)
-vmovdqa		%ymm8,128(%rdi)
-vmovdqa		%ymm14,160(%rdi)
-vmovdqa		%ymm15,192(%rdi)
-vmovdqa		%ymm1,224(%rdi)
-
-ret
-
-.global cdecl(nttfrombytes_avx)
-cdecl(nttfrombytes_avx):
-#consts
-vmovdqa		_16XMASK*2(%rdx),%ymm0
-call		nttfrombytes128_avx
-add		$256,%rdi
-add		$192,%rsi
-call		nttfrombytes128_avx
-ret
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.inc b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.inc
deleted file mode 100644
index 73e9ffe03c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/shuffle.inc
+++ /dev/null
@@ -1,25 +0,0 @@
-.macro shuffle8 r0,r1,r2,r3
-vperm2i128	$0x20,%ymm\r1,%ymm\r0,%ymm\r2
-vperm2i128	$0x31,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle4 r0,r1,r2,r3
-vpunpcklqdq	%ymm\r1,%ymm\r0,%ymm\r2
-vpunpckhqdq	%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle2 r0,r1,r2,r3
-#vpsllq		$32,%ymm\r1,%ymm\r2
-vmovsldup	%ymm\r1,%ymm\r2
-vpblendd	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrlq		$32,%ymm\r0,%ymm\r0
-#vmovshdup	%ymm\r0,%ymm\r0
-vpblendd	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle1 r0,r1,r2,r3
-vpslld		$16,%ymm\r1,%ymm\r2
-vpblendw	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrld		$16,%ymm\r0,%ymm\r0
-vpblendw	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric-shake.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric-shake.c
deleted file mode 100644
index 20f451882e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric-shake.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
-*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
-*              - uint8_t i: additional byte of input
-*              - uint8_t j: additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128incctx *state,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y)
-{
-  uint8_t extseed[KYBER_SYMBYTES+2];
-
-  memcpy(extseed, seed, KYBER_SYMBYTES);
-  extseed[KYBER_SYMBYTES+0] = x;
-  extseed[KYBER_SYMBYTES+1] = y;
-
-  shake128_absorb_once(state, extseed, sizeof(extseed));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t extkey[KYBER_SYMBYTES+1];
-
-  memcpy(extkey, key, KYBER_SYMBYTES);
-  extkey[KYBER_SYMBYTES] = nonce;
-
-  shake256(out, outlen, extkey, sizeof(extkey));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
-{
-  shake256incctx s;
-
-  shake256_inc_init(&s);
-  shake256_inc_absorb(&s, key, KYBER_SYMBYTES);
-  shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
-  shake256_inc_finalize(&s);
-  shake256_inc_squeeze(out, KYBER_SSBYTES, &s);
-  shake256_inc_ctx_release(&s);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric.h
deleted file mode 100644
index e4941f7a86..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/symmetric.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#include "fips202.h"
-#include "fips202x4.h"
-
-typedef shake128incctx xof_state;
-
-#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(shake128incctx *s,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y);
-
-#define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
-
-#define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
-
-#define XOF_BLOCKBYTES SHAKE128_RATE
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.c
deleted file mode 100644
index 06243b837f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <stdlib.h>
-#include <stdint.h>
-#include <immintrin.h>
-#include "verify.h"
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const uint8_t *a: pointer to first byte array
-*              const uint8_t *b: pointer to second byte array
-*              size_t len: length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-int verify(const uint8_t *a, const uint8_t *b, size_t len)
-{
-  size_t i;
-  uint64_t r;
-  __m256i f, g, h;
-
-  h = _mm256_setzero_si256();
-  for(i=0;i<len/32;i++) {
-    f = _mm256_loadu_si256((__m256i *)&a[32*i]);
-    g = _mm256_loadu_si256((__m256i *)&b[32*i]);
-    f = _mm256_xor_si256(f,g);
-    h = _mm256_or_si256(h,f);
-  }
-  r = 1 - _mm256_testz_si256(h,h);
-
-  a += 32*i;
-  b += 32*i;
-  len -= 32*i;
-  for(i=0;i<len;i++)
-    r |= a[i] ^ b[i];
-
-  r = (-r) >> 63;
-  return r;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   uint8_t *r: pointer to output byte array
-*              const uint8_t *x: pointer to input byte array
-*              size_t len: Amount of bytes to be copied
-*              uint8_t b: Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(uint8_t * restrict r, const uint8_t *x, size_t len, uint8_t b)
-{
-  size_t i;
-  __m256i xvec, rvec, bvec;
-
-#if defined(__GNUC__) || defined(__clang__)
-  // Prevent the compiler from
-  //    1) inferring that b is 0/1-valued, and
-  //    2) handling the two cases with a branch.
-  // This is not necessary when verify.c and kem.c are separate translation
-  // units, but we expect that downstream consumers will copy this code and/or
-  // change how it is built.
-  __asm__("" : "+r"(b) : /* no inputs */);
-#endif
-
-  bvec = _mm256_set1_epi64x(-(uint64_t)b);
-  for(i=0;i<len/32;i++) {
-    rvec = _mm256_loadu_si256((__m256i *)&r[32*i]);
-    xvec = _mm256_loadu_si256((__m256i *)&x[32*i]);
-    rvec = _mm256_blendv_epi8(rvec,xvec,bvec);
-    _mm256_storeu_si256((__m256i *)&r[32*i],rvec);
-  }
-
-  r += 32*i;
-  x += 32*i;
-  len -= 32*i;
-  for(i=0;i<len;i++)
-    r[i] ^= -b & (x[i] ^ r[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.h
deleted file mode 100644
index 09f0ad5046..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_avx2/verify.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#define verify KYBER_NAMESPACE(verify)
-int verify(const uint8_t *a, const uint8_t *b, size_t len);
-
-#define cmov KYBER_NAMESPACE(cmov)
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
-
-#define cmov_int16 KYBER_NAMESPACE(cmov_int16)
-void cmov_int16(int16_t *r, int16_t v, uint16_t b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/LICENSE b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/LICENSE
deleted file mode 100644
index 7922ab8007..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/LICENSE
+++ /dev/null
@@ -1,6 +0,0 @@
-Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
-or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
-
-For Keccak and AES we are using public-domain
-code from sources and by authors listed in
-comments on top of the respective files.
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/api.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/api.h
deleted file mode 100644
index 70d40f3f3e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/api.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stdint.h>
-
-#define pqcrystals_kyber512_SECRETKEYBYTES 1632
-#define pqcrystals_kyber512_PUBLICKEYBYTES 800
-#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
-#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber512_ENCCOINBYTES 32
-#define pqcrystals_kyber512_BYTES 32
-
-#define pqcrystals_kyber512_ref_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
-#define pqcrystals_kyber512_ref_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
-#define pqcrystals_kyber512_ref_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
-#define pqcrystals_kyber512_ref_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
-#define pqcrystals_kyber512_ref_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
-#define pqcrystals_kyber512_ref_BYTES pqcrystals_kyber512_BYTES
-
-int pqcrystals_kyber512_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber512_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber512_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber512_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber512_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber768_SECRETKEYBYTES 2400
-#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
-#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
-#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber768_ENCCOINBYTES 32
-#define pqcrystals_kyber768_BYTES 32
-
-#define pqcrystals_kyber768_ref_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
-#define pqcrystals_kyber768_ref_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
-#define pqcrystals_kyber768_ref_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
-#define pqcrystals_kyber768_ref_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
-#define pqcrystals_kyber768_ref_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
-#define pqcrystals_kyber768_ref_BYTES pqcrystals_kyber768_BYTES
-
-int pqcrystals_kyber768_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber768_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber768_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber768_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber768_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
-#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
-#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
-#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
-#define pqcrystals_kyber1024_ENCCOINBYTES 32
-#define pqcrystals_kyber1024_BYTES 32
-
-#define pqcrystals_kyber1024_ref_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
-#define pqcrystals_kyber1024_ref_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
-#define pqcrystals_kyber1024_ref_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
-#define pqcrystals_kyber1024_ref_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
-#define pqcrystals_kyber1024_ref_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
-#define pqcrystals_kyber1024_ref_BYTES pqcrystals_kyber1024_BYTES
-
-int pqcrystals_kyber1024_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-int pqcrystals_kyber1024_ref_keypair(uint8_t *pk, uint8_t *sk);
-int pqcrystals_kyber1024_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-int pqcrystals_kyber1024_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-int pqcrystals_kyber1024_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c
deleted file mode 100644
index 1500ffea56..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "cbd.h"
-
-/*************************************************
-* Name:        load32_littleendian
-*
-* Description: load 4 bytes into a 32-bit integer
-*              in little-endian order
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x
-**************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-/*************************************************
-* Name:        load24_littleendian
-*
-* Description: load 3 bytes into a 32-bit integer
-*              in little-endian order.
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
-**************************************************/
-#if KYBER_ETA1 == 3
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif
-
-
-/*************************************************
-* Name:        cbd2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *buf: pointer to input byte array
-**************************************************/
-static void cbd2(poly *r, const uint8_t buf[2*KYBER_N/4])
-{
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/8;i++) {
-    t  = load32_littleendian(buf+4*i);
-    d  = t & 0x55555555;
-    d += (t>>1) & 0x55555555;
-
-    for(j=0;j<8;j++) {
-      a = (d >> (4*j+0)) & 0x3;
-      b = (d >> (4*j+2)) & 0x3;
-      r->coeffs[8*i+j] = a - b;
-    }
-  }
-}
-
-/*************************************************
-* Name:        cbd3
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter eta=3.
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *buf: pointer to input byte array
-**************************************************/
-#if KYBER_ETA1 == 3
-static void cbd3(poly *r, const uint8_t buf[3*KYBER_N/4])
-{
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/4;i++) {
-    t  = load24_littleendian(buf+3*i);
-    d  = t & 0x00249249;
-    d += (t>>1) & 0x00249249;
-    d += (t>>2) & 0x00249249;
-
-    for(j=0;j<4;j++) {
-      a = (d >> (6*j+0)) & 0x7;
-      b = (d >> (6*j+3)) & 0x7;
-      r->coeffs[4*i+j] = a - b;
-    }
-  }
-}
-#endif
-
-void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4])
-{
-#if KYBER_ETA1 == 2
-  cbd2(r, buf);
-#elif KYBER_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
-
-void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4])
-{
-#if KYBER_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.h
deleted file mode 100644
index 7b677d745d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/cbd.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-#define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
-void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4]);
-
-#define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
-void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c
deleted file mode 100644
index 726cfa985d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c
+++ /dev/null
@@ -1,334 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "indcpa.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "ntt.h"
-#include "symmetric.h"
-#include "randombytes.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Serialize the public key as concatenation of the
-*              serialized vector of polynomials pk
-*              and the public seed used to generate the matrix A.
-*
-* Arguments:   uint8_t *r: pointer to the output serialized public key
-*              polyvec *pk: pointer to the input public-key polyvec
-*              const uint8_t *seed: pointer to the input public seed
-**************************************************/
-static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
-                    polyvec *pk,
-                    const uint8_t seed[KYBER_SYMBYTES])
-{
-  polyvec_tobytes(r, pk);
-  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: De-serialize public key from a byte array;
-*              approximate inverse of pack_pk
-*
-* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
-*              - uint8_t *seed: pointer to output seed to generate matrix A
-*              - const uint8_t *packedpk: pointer to input serialized public key
-**************************************************/
-static void unpack_pk(polyvec *pk,
-                      uint8_t seed[KYBER_SYMBYTES],
-                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
-{
-  polyvec_frombytes(pk, packedpk);
-  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Serialize the secret key
-*
-* Arguments:   - uint8_t *r: pointer to output serialized secret key
-*              - polyvec *sk: pointer to input vector of polynomials (secret key)
-**************************************************/
-static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
-{
-  polyvec_tobytes(r, sk);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: De-serialize the secret key; inverse of pack_sk
-*
-* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
-*              - const uint8_t *packedsk: pointer to input serialized secret key
-**************************************************/
-static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec_frombytes(sk, packedsk);
-}
-
-/*************************************************
-* Name:        pack_ciphertext
-*
-* Description: Serialize the ciphertext as concatenation of the
-*              compressed and serialized vector of polynomials b
-*              and the compressed and serialized polynomial v
-*
-* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
-*              poly *pk: pointer to the input vector of polynomials b
-*              poly *v: pointer to the input polynomial v
-**************************************************/
-static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
-{
-  polyvec_compress(r, b);
-  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        unpack_ciphertext
-*
-* Description: De-serialize and decompress ciphertext from a byte array;
-*              approximate inverse of pack_ciphertext
-*
-* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
-*              - poly *v: pointer to the output polynomial v
-*              - const uint8_t *c: pointer to the input serialized ciphertext
-**************************************************/
-static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
-{
-  polyvec_decompress(b, c);
-  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-}
-
-/*************************************************
-* Name:        rej_uniform
-*
-* Description: Run rejection sampling on uniform random bytes to generate
-*              uniform random integers mod q
-*
-* Arguments:   - int16_t *r: pointer to output buffer
-*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
-*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
-*              - unsigned int buflen: length of input buffer in bytes
-*
-* Returns number of sampled 16-bit integers (at most len)
-**************************************************/
-static unsigned int rej_uniform(int16_t *r,
-                                unsigned int len,
-                                const uint8_t *buf,
-                                unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos + 3 <= buflen) {
-    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if(val0 < KYBER_Q)
-      r[ctr++] = val0;
-    if(ctr < len && val1 < KYBER_Q)
-      r[ctr++] = val1;
-  }
-
-  return ctr;
-}
-
-#define gen_a(A,B)  gen_matrix(A,B,0)
-#define gen_at(A,B) gen_matrix(A,B,1)
-
-/*************************************************
-* Name:        gen_matrix
-*
-* Description: Deterministically generate matrix A (or the transpose of A)
-*              from a seed. Entries of the matrix are polynomials that look
-*              uniformly random. Performs rejection sampling on output of
-*              a XOF
-*
-* Arguments:   - polyvec *a: pointer to ouptput matrix A
-*              - const uint8_t *seed: pointer to input seed
-*              - int transposed: boolean deciding whether A or A^T is generated
-**************************************************/
-#if(XOF_BLOCKBYTES % 3)
-#error "Implementation of gen_matrix assumes that XOF_BLOCKBYTES is a multiple of 3"
-#endif
-
-#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
-// Not static for benchmarking
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed)
-{
-  unsigned int ctr, i, j;
-  unsigned int buflen;
-  uint8_t buf[GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES];
-  xof_state state;
-  xof_init(&state, seed);
-
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_K;j++) {
-      if(transposed)
-        xof_absorb(&state, seed, i, j);
-      else
-        xof_absorb(&state, seed, j, i);
-
-      xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
-      buflen = GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES;
-      ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen);
-
-      while(ctr < KYBER_N) {
-        xof_squeezeblocks(buf, 1, &state);
-        buflen = XOF_BLOCKBYTES;
-        ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen);
-      }
-    }
-  }
-  xof_release(&state);
-}
-
-/*************************************************
-* Name:        indcpa_keypair_derand
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                             (of length KYBER_SYMBYTES bytes)
-**************************************************/
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  const uint8_t *publicseed = buf;
-  const uint8_t *noiseseed = buf+KYBER_SYMBYTES;
-  uint8_t nonce = 0;
-  polyvec a[KYBER_K], e, pkpv, skpv;
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-  buf[KYBER_SYMBYTES] = KYBER_K;
-  hash_g(buf, buf, KYBER_SYMBYTES+1);
-
-  gen_a(a, publicseed);
-
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
-
-  polyvec_ntt(&skpv);
-  polyvec_ntt(&e);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++) {
-    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
-    poly_tomont(&pkpv.vec[i]);
-  }
-
-  polyvec_add(&pkpv, &pkpv, &e);
-  polyvec_reduce(&pkpv);
-
-  pack_sk(sk, &skpv);
-  pack_pk(pk, &pkpv, publicseed);
-}
-
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *c: pointer to output ciphertext
-*                            (of length KYBER_INDCPA_BYTES bytes)
-*              - const uint8_t *m: pointer to input message
-*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
-*              - const uint8_t *coins: pointer to input random coins used as seed
-*                                      (of length KYBER_SYMBYTES) to deterministically
-*                                      generate all randomness
-**************************************************/
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES])
-{
-  unsigned int i;
-  uint8_t seed[KYBER_SYMBYTES];
-  uint8_t nonce = 0;
-  polyvec sp, pkpv, ep, at[KYBER_K], b;
-  poly v, k, epp;
-
-  unpack_pk(&pkpv, seed, pk);
-  poly_frommsg(&k, m);
-  gen_at(at, seed);
-
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta1(sp.vec+i, coins, nonce++);
-  for(i=0;i<KYBER_K;i++)
-    poly_getnoise_eta2(ep.vec+i, coins, nonce++);
-  poly_getnoise_eta2(&epp, coins, nonce++);
-
-  polyvec_ntt(&sp);
-
-  // matrix-vector multiplication
-  for(i=0;i<KYBER_K;i++)
-    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
-
-  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
-
-  polyvec_invntt_tomont(&b);
-  poly_invntt_tomont(&v);
-
-  polyvec_add(&b, &b, &ep);
-  poly_add(&v, &v, &epp);
-  poly_add(&v, &v, &k);
-  polyvec_reduce(&b);
-  poly_reduce(&v);
-
-  pack_ciphertext(c, &b, &v);
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - uint8_t *m: pointer to output decrypted message
-*                            (of length KYBER_INDCPA_MSGBYTES)
-*              - const uint8_t *c: pointer to input ciphertext
-*                                  (of length KYBER_INDCPA_BYTES)
-*              - const uint8_t *sk: pointer to input secret key
-*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
-{
-  polyvec b, skpv;
-  poly v, mp;
-
-  unpack_ciphertext(&b, &v, c);
-  unpack_sk(&skpv, sk);
-
-  polyvec_ntt(&b);
-  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
-  poly_invntt_tomont(&mp);
-
-  poly_sub(&mp, &v, &mp);
-  poly_reduce(&mp);
-
-  poly_tomsg(m, &mp);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.h
deleted file mode 100644
index 6dd5088c85..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define gen_matrix KYBER_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
-
-#define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
-void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
-                           const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
-void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
-                const uint8_t coins[KYBER_SYMBYTES]);
-
-#define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
-void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
-                const uint8_t c[KYBER_INDCPA_BYTES],
-                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.c
deleted file mode 100644
index 63abc1029c..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "kem.h"
-#include "indcpa.h"
-#include "verify.h"
-#include "symmetric.h"
-#include "randombytes.h"
-/*************************************************
-* Name:        crypto_kem_keypair_derand
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*              - uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair_derand(uint8_t *pk,
-                              uint8_t *sk,
-                              const uint8_t *coins)
-{
-  indcpa_keypair_derand(pk, sk, coins);
-  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  /* Value z for pseudo-random output on reject */
-  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - uint8_t *pk: pointer to output public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(uint8_t *pk,
-                       uint8_t *sk)
-{
-  uint8_t coins[2*KYBER_SYMBYTES];
-  randombytes(coins, 2*KYBER_SYMBYTES);
-  crypto_kem_keypair_derand(pk, sk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc_derand
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*              - const uint8_t *coins: pointer to input randomness
-*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
-**
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc_derand(uint8_t *ct,
-                          uint8_t *ss,
-                          const uint8_t *pk,
-                          const uint8_t *coins)
-{
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-
-  memcpy(buf, coins, KYBER_SYMBYTES);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
-
-  memcpy(ss,kr,KYBER_SYMBYTES);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - uint8_t *ct: pointer to output cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *pk: pointer to input public key
-*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(uint8_t *ct,
-                   uint8_t *ss,
-                   const uint8_t *pk)
-{
-  uint8_t coins[KYBER_SYMBYTES];
-  randombytes(coins, KYBER_SYMBYTES);
-  crypto_kem_enc_derand(ct, ss, pk, coins);
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - uint8_t *ss: pointer to output shared secret
-*                (an already allocated array of KYBER_SSBYTES bytes)
-*              - const uint8_t *ct: pointer to input cipher text
-*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
-*              - const uint8_t *sk: pointer to input private key
-*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(uint8_t *ss,
-                   const uint8_t *ct,
-                   const uint8_t *sk)
-{
-  int fail;
-  uint8_t buf[2*KYBER_SYMBYTES];
-  /* Will contain key, coins */
-  uint8_t kr[2*KYBER_SYMBYTES];
-  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
-  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
-
-  indcpa_dec(buf, ct, sk);
-
-  /* Multitarget countermeasure for coins + contributory KEM */
-  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
-  hash_g(kr, buf, 2*KYBER_SYMBYTES);
-
-  /* coins are in kr+KYBER_SYMBYTES */
-  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
-
-  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
-
-  /* Compute rejection key */
-  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
-
-  /* Copy true key to return buffer if fail is false */
-  cmov(ss,kr,KYBER_SYMBYTES,!fail);
-
-  return 0;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.h
deleted file mode 100644
index 234f11966b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/kem.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KEM_H
-#define KEM_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#if   (KYBER_K == 2)
-#define CRYPTO_ALGNAME "Kyber512"
-#elif (KYBER_K == 3)
-#define CRYPTO_ALGNAME "Kyber768"
-#elif (KYBER_K == 4)
-#define CRYPTO_ALGNAME "Kyber1024"
-#endif
-
-#define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
-int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
-
-#define crypto_kem_keypair KYBER_NAMESPACE(keypair)
-int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-#define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
-int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
-
-#define crypto_kem_enc KYBER_NAMESPACE(enc)
-int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-
-#define crypto_kem_dec KYBER_NAMESPACE(dec)
-int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c
deleted file mode 100644
index 2f2eb10b2f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
-
-#define KYBER_ROOT_OF_UNITY 17
-
-static const uint8_t tree[128] = {
-  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
-  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
-  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
-  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
-  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
-  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
-  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
-  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
-};
-
-void init_ntt() {
-  unsigned int i;
-  int16_t tmp[128];
-
-  tmp[0] = MONT;
-  for(i=1;i<128;i++)
-    tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);
-
-  for(i=0;i<128;i++) {
-    zetas[i] = tmp[tree[i]];
-    if(zetas[i] > KYBER_Q/2)
-      zetas[i] -= KYBER_Q;
-    if(zetas[i] < -KYBER_Q/2)
-      zetas[i] += KYBER_Q;
-  }
-}
-*/
-
-const int16_t zetas[128] = {
-  -1044,  -758,  -359, -1517,  1493,  1422,   287,   202,
-   -171,   622,  1577,   182,   962, -1202, -1474,  1468,
-    573, -1325,   264,   383,  -829,  1458, -1602,  -130,
-   -681,  1017,   732,   608, -1542,   411,  -205, -1571,
-   1223,   652,  -552,  1015, -1293,  1491,  -282, -1544,
-    516,    -8,  -320,  -666, -1618, -1162,   126,  1469,
-   -853,   -90,  -271,   830,   107, -1421,  -247,  -951,
-   -398,   961, -1508,  -725,   448, -1065,   677, -1275,
-  -1103,   430,   555,   843, -1251,   871,  1550,   105,
-    422,   587,   177,  -235,  -291,  -460,  1574,  1653,
-   -246,   778,  1159,  -147,  -777,  1483,  -602,  1119,
-  -1590,   644,  -872,   349,   418,   329,  -156,   -75,
-    817,  1097,   603,   610,  1322, -1285, -1465,   384,
-  -1215,  -136,  1218, -1335,  -874,   220, -1187, -1659,
-  -1185, -1530, -1278,   794, -1510,  -854,  -870,   478,
-   -108,  -308,   996,   991,   958, -1460,  1522,  1628
-};
-
-/*************************************************
-* Name:        fqmul
-*
-* Description: Multiplication followed by Montgomery reduction
-*
-* Arguments:   - int16_t a: first factor
-*              - int16_t b: second factor
-*
-* Returns 16-bit integer congruent to a*b*R^{-1} mod q
-**************************************************/
-static int16_t fqmul(int16_t a, int16_t b) {
-  return montgomery_reduce((int32_t)a*b);
-}
-
-/*************************************************
-* Name:        ntt
-*
-* Description: Inplace number-theoretic transform (NTT) in Rq.
-*              input is in standard order, output is in bitreversed order
-*
-* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
-**************************************************/
-void ntt(int16_t r[256]) {
-  unsigned int len, start, j, k;
-  int16_t t, zeta;
-
-  k = 1;
-  for(len = 128; len >= 2; len >>= 1) {
-    for(start = 0; start < 256; start = j + len) {
-      zeta = zetas[k++];
-      for(j = start; j < start + len; j++) {
-        t = fqmul(zeta, r[j + len]);
-        r[j + len] = r[j] - t;
-        r[j] = r[j] + t;
-      }
-    }
-  }
-}
-
-/*************************************************
-* Name:        invntt_tomont
-*
-* Description: Inplace inverse number-theoretic transform in Rq and
-*              multiplication by Montgomery factor 2^16.
-*              Input is in bitreversed order, output is in standard order
-*
-* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
-**************************************************/
-void invntt(int16_t r[256]) {
-  unsigned int start, len, j, k;
-  int16_t t, zeta;
-  const int16_t f = 1441; // mont^2/128
-
-  k = 127;
-  for(len = 2; len <= 128; len <<= 1) {
-    for(start = 0; start < 256; start = j + len) {
-      zeta = zetas[k--];
-      for(j = start; j < start + len; j++) {
-        t = r[j];
-        r[j] = barrett_reduce(t + r[j + len]);
-        r[j + len] = r[j + len] - t;
-        r[j + len] = fqmul(zeta, r[j + len]);
-      }
-    }
-  }
-
-  for(j = 0; j < 256; j++)
-    r[j] = fqmul(r[j], f);
-}
-
-/*************************************************
-* Name:        basemul
-*
-* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
-*              used for multiplication of elements in Rq in NTT domain
-*
-* Arguments:   - int16_t r[2]: pointer to the output polynomial
-*              - const int16_t a[2]: pointer to the first factor
-*              - const int16_t b[2]: pointer to the second factor
-*              - int16_t zeta: integer defining the reduction polynomial
-**************************************************/
-void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
-{
-  r[0]  = fqmul(a[1], b[1]);
-  r[0]  = fqmul(r[0], zeta);
-  r[0] += fqmul(a[0], b[0]);
-  r[1]  = fqmul(a[0], b[1]);
-  r[1] += fqmul(a[1], b[0]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.h
deleted file mode 100644
index 227ea74f08..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/ntt.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define zetas KYBER_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define ntt KYBER_NAMESPACE(ntt)
-void ntt(int16_t poly[256]);
-
-#define invntt KYBER_NAMESPACE(invntt)
-void invntt(int16_t poly[256]);
-
-#define basemul KYBER_NAMESPACE(basemul)
-void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/params.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/params.h
deleted file mode 100644
index fb4190b311..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/params.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#ifndef KYBER_K
-#define KYBER_K 3	/* Change this for different security strengths */
-#endif
-
-
-/* Don't change parameters below this line */
-#if   (KYBER_K == 2)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_512_ref_##s
-#elif (KYBER_K == 3)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_768_ref_##s
-#elif (KYBER_K == 4)
-#define KYBER_NAMESPACE(s) pqcrystals_ml_kem_1024_ref_##s
-#else
-#error "KYBER_K must be in {2,3,4}"
-#endif
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES		384
-#define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
-
-#if KYBER_K == 2
-#define KYBER_ETA1 3
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 3
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-#elif KYBER_K == 4
-#define KYBER_ETA1 2
-#define KYBER_POLYCOMPRESSEDBYTES    160
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
-#endif
-
-#define KYBER_ETA2 2
-
-#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-/* 32 bytes of additional space to save H(pk) */
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
-#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.c
deleted file mode 100644
index cbd3abfb54..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "reduce.h"
-#include "cbd.h"
-#include "symmetric.h"
-#include "verify.h"
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Compression and subsequent serialization of a polynomial
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
-{
-  unsigned int i,j;
-  int16_t u;
-  uint32_t d0;
-  uint8_t t[8];
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      // map to positive standard representatives
-      u  = a->coeffs[8*i+j];
-      u += (u >> 15) & KYBER_Q;
-/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
-      d0 = u << 4;
-      d0 += 1665;
-      d0 *= 80635;
-      d0 >>= 28;
-      t[j] = d0 & 0xf;
-    }
-
-    r[0] = t[0] | (t[1] << 4);
-    r[1] = t[2] | (t[3] << 4);
-    r[2] = t[4] | (t[5] << 4);
-    r[3] = t[6] | (t[7] << 4);
-    r += 4;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      // map to positive standard representatives
-      u  = a->coeffs[8*i+j];
-      u += (u >> 15) & KYBER_Q;
-/*    t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
-      d0 = u << 5;
-      d0 += 1664;
-      d0 *= 40318;
-      d0 >>= 27;
-      t[j] = d0 & 0x1f;
-    }
-
-    r[0] = (t[0] >> 0) | (t[1] << 5);
-    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
-    r[2] = (t[3] >> 1) | (t[4] << 4);
-    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
-    r[4] = (t[6] >> 2) | (t[7] << 3);
-    r += 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_decompress
-*
-* Description: De-serialization and subsequent decompression of a polynomial;
-*              approximate inverse of poly_compress
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
-**************************************************/
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES])
-{
-  unsigned int i;
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 128)
-  for(i=0;i<KYBER_N/2;i++) {
-    r->coeffs[2*i+0] = (((uint16_t)(a[0] & 15)*KYBER_Q) + 8) >> 4;
-    r->coeffs[2*i+1] = (((uint16_t)(a[0] >> 4)*KYBER_Q) + 8) >> 4;
-    a += 1;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  unsigned int j;
-  uint8_t t[8];
-  for(i=0;i<KYBER_N/8;i++) {
-    t[0] = (a[0] >> 0);
-    t[1] = (a[0] >> 5) | (a[1] << 3);
-    t[2] = (a[1] >> 2);
-    t[3] = (a[1] >> 7) | (a[2] << 1);
-    t[4] = (a[2] >> 4) | (a[3] << 4);
-    t[5] = (a[3] >> 1);
-    t[6] = (a[3] >> 6) | (a[4] << 2);
-    t[7] = (a[4] >> 3);
-    a += 5;
-
-    for(j=0;j<8;j++)
-      r->coeffs[8*i+j] = ((uint32_t)(t[j] & 31)*KYBER_Q + 16) >> 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYBYTES bytes)
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
-{
-  unsigned int i;
-  uint16_t t0, t1;
-
-  for(i=0;i<KYBER_N/2;i++) {
-    // map to positive standard representatives
-    t0  = a->coeffs[2*i];
-    t0 += ((int16_t)t0 >> 15) & KYBER_Q;
-    t1 = a->coeffs[2*i+1];
-    t1 += ((int16_t)t1 >> 15) & KYBER_Q;
-    r[3*i+0] = (t0 >> 0);
-    r[3*i+1] = (t0 >> 8) | (t1 << 4);
-    r[3*i+2] = (t1 >> 4);
-  }
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N/2;i++) {
-    r->coeffs[2*i]   = ((a[3*i+0] >> 0) | ((uint16_t)a[3*i+1] << 8)) & 0xFFF;
-    r->coeffs[2*i+1] = ((a[3*i+1] >> 4) | ((uint16_t)a[3*i+2] << 4)) & 0xFFF;
-  }
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
-{
-  unsigned int i,j;
-
-#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
-#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
-#endif
-
-  for(i=0;i<KYBER_N/8;i++) {
-    for(j=0;j<8;j++) {
-      r->coeffs[8*i+j] = 0;
-      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1);
-    }
-  }
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message
-*
-* Arguments:   - uint8_t *msg: pointer to output message
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned int i,j;
-  uint32_t t;
-
-  for(i=0;i<KYBER_N/8;i++) {
-    msg[i] = 0;
-    for(j=0;j<8;j++) {
-      t  = a->coeffs[8*i+j];
-      // t += ((int16_t)t >> 15) & KYBER_Q;
-      // t  = (((t << 1) + KYBER_Q/2)/KYBER_Q) & 1;
-      t <<= 1;
-      t += 1665;
-      t *= 80635;
-      t >>= 28;
-      t &= 1;
-      msg[i] |= t << j;
-    }
-  }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t buf[KYBER_ETA1*KYBER_N/4];
-  prf(buf, sizeof(buf), seed, nonce);
-  poly_cbd_eta1(r, buf);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *seed: pointer to input seed
-*                                     (of length KYBER_SYMBYTES bytes)
-*              - uint8_t nonce: one-byte input nonce
-**************************************************/
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t buf[KYBER_ETA2*KYBER_N/4];
-  prf(buf, sizeof(buf), seed, nonce);
-  poly_cbd_eta2(r, buf);
-}
-
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place;
-*              inputs assumed to be in normal order, output in bitreversed order
-*
-* Arguments:   - uint16_t *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r)
-{
-  ntt(r->coeffs);
-  poly_reduce(r);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
-*              of a polynomial in place;
-*              inputs assumed to be in bitreversed order, output in normal order
-*
-* Arguments:   - uint16_t *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *r)
-{
-  invntt(r->coeffs);
-}
-
-/*************************************************
-* Name:        poly_basemul_montgomery
-*
-* Description: Multiplication of two polynomials in NTT domain
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N/4;i++) {
-    basemul(&r->coeffs[4*i], &a->coeffs[4*i], &b->coeffs[4*i], zetas[64+i]);
-    basemul(&r->coeffs[4*i+2], &a->coeffs[4*i+2], &b->coeffs[4*i+2], -zetas[64+i]);
-  }
-}
-
-/*************************************************
-* Name:        poly_tomont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from normal domain to Montgomery domain
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_tomont(poly *r)
-{
-  unsigned int i;
-  const int16_t f = (1ULL << 32) % KYBER_Q;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i]*f);
-}
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = barrett_reduce(r->coeffs[i]);
-}
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials; no modular reduction is performed
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials; no modular reduction is performed
-*
-* Arguments: - poly *r:       pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_N;i++)
-    r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.h
deleted file mode 100644
index 9a99c7cdad..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/poly.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "params.h"
-
-/*
- * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
- * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
- */
-typedef struct{
-  int16_t coeffs[KYBER_N];
-} poly;
-
-#define poly_compress KYBER_NAMESPACE(poly_compress)
-void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
-#define poly_decompress KYBER_NAMESPACE(poly_decompress)
-void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
-
-#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
-void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
-#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
-void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
-
-#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
-void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
-#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
-void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
-
-#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
-void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
-void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
-
-#define poly_ntt KYBER_NAMESPACE(poly_ntt)
-void poly_ntt(poly *r);
-#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *r);
-#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
-void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
-#define poly_tomont KYBER_NAMESPACE(poly_tomont)
-void poly_tomont(poly *r);
-
-#define poly_reduce KYBER_NAMESPACE(poly_reduce)
-void poly_reduce(poly *r);
-
-#define poly_add KYBER_NAMESPACE(poly_add)
-void poly_add(poly *r, const poly *a, const poly *b);
-#define poly_sub KYBER_NAMESPACE(poly_sub)
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c
deleted file mode 100644
index 669f6a5f1d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c
+++ /dev/null
@@ -1,246 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
-{
-  unsigned int i,j,k;
-  uint64_t d0;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/8;j++) {
-      for(k=0;k<8;k++) {
-        t[k]  = a->vec[i].coeffs[8*j+k];
-        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
-        d0 = t[k];
-        d0 <<= 11;
-        d0 += 1664;
-        d0 *= 645084;
-        d0 >>= 31;
-        t[k] = d0 & 0x7ff;
-      }
-
-      r[ 0] = (t[0] >>  0);
-      r[ 1] = (t[0] >>  8) | (t[1] << 3);
-      r[ 2] = (t[1] >>  5) | (t[2] << 6);
-      r[ 3] = (t[2] >>  2);
-      r[ 4] = (t[2] >> 10) | (t[3] << 1);
-      r[ 5] = (t[3] >>  7) | (t[4] << 4);
-      r[ 6] = (t[4] >>  4) | (t[5] << 7);
-      r[ 7] = (t[5] >>  1);
-      r[ 8] = (t[5] >>  9) | (t[6] << 2);
-      r[ 9] = (t[6] >>  6) | (t[7] << 5);
-      r[10] = (t[7] >>  3);
-      r += 11;
-    }
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/4;j++) {
-      for(k=0;k<4;k++) {
-        t[k]  = a->vec[i].coeffs[4*j+k];
-        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
-        d0 = t[k];
-        d0 <<= 10;
-        d0 += 1665;
-        d0 *= 1290167;
-        d0 >>= 32;
-        t[k] = d0 & 0x3ff;
-      }
-
-      r[0] = (t[0] >> 0);
-      r[1] = (t[0] >> 8) | (t[1] << 2);
-      r[2] = (t[1] >> 6) | (t[2] << 4);
-      r[3] = (t[2] >> 4) | (t[3] << 6);
-      r[4] = (t[3] >> 2);
-      r += 5;
-    }
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r:       pointer to output vector of polynomials
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES])
-{
-  unsigned int i,j,k;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/8;j++) {
-      t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
-      t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
-      t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
-      t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
-      t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
-      t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
-      t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
-      t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
-      a += 11;
-
-      for(k=0;k<8;k++)
-        r->vec[i].coeffs[8*j+k] = ((uint32_t)(t[k] & 0x7FF)*KYBER_Q + 1024) >> 11;
-    }
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++) {
-    for(j=0;j<KYBER_N/4;j++) {
-      t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
-      t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
-      t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
-      t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
-      a += 5;
-
-      for(k=0;k<4;k++)
-        r->vec[i].coeffs[4*j+k] = ((uint32_t)(t[k] & 0x3FF)*KYBER_Q + 512) >> 10;
-    }
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - uint8_t *r: pointer to output byte array
-*                            (needs space for KYBER_POLYVECBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - uint8_t *r:       pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials
-*                                  (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt_tomont
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*              and multiply by Montgomery factor 2^16
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt_tomont(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_basemul_acc_montgomery
-*
-* Description: Multiply elements of a and b in NTT domain, accumulate into r,
-*              and multiply by 2^-16.
-*
-* Arguments: - poly *r: pointer to output polynomial
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  poly t;
-
-  poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
-  for(i=1;i<KYBER_K;i++) {
-    poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
-    poly_add(r, r, &t);
-  }
-
-  poly_reduce(r);
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials;
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - polyvec *r: pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r: pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  unsigned int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.h
deleted file mode 100644
index 57b605494e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-typedef struct{
-  poly vec[KYBER_K];
-} polyvec;
-
-#define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
-void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
-#define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
-void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
-
-#define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
-void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
-#define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
-void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
-
-#define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
-void polyvec_ntt(polyvec *r);
-#define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
-void polyvec_invntt_tomont(polyvec *r);
-
-#define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
-
-#define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
-void polyvec_reduce(polyvec *r);
-
-#define polyvec_add KYBER_NAMESPACE(polyvec_add)
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c
deleted file mode 100644
index 9d8e7edf83..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "reduce.h"
-
-/*************************************************
-* Name:        montgomery_reduce
-*
-* Description: Montgomery reduction; given a 32-bit integer a, computes
-*              16-bit integer congruent to a * R^-1 mod q, where R=2^16
-*
-* Arguments:   - int32_t a: input integer to be reduced;
-*                           has to be in {-q2^15,...,q2^15-1}
-*
-* Returns:     integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
-**************************************************/
-int16_t montgomery_reduce(int32_t a)
-{
-  int16_t t;
-
-  t = (int16_t)a*QINV;
-  t = (a - (int32_t)t*KYBER_Q) >> 16;
-  return t;
-}
-
-/*************************************************
-* Name:        barrett_reduce
-*
-* Description: Barrett reduction; given a 16-bit integer a, computes
-*              centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2}
-*
-* Arguments:   - int16_t a: input integer to be reduced
-*
-* Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
-**************************************************/
-int16_t barrett_reduce(int16_t a) {
-  int16_t t;
-  const int16_t v = ((1<<26) + KYBER_Q/2)/KYBER_Q;
-
-  t  = ((int32_t)v*a + (1<<25)) >> 26;
-  t *= KYBER_Q;
-  return a - t;
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.h
deleted file mode 100644
index c1bc1e4c7b..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/reduce.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define MONT -1044 // 2^16 mod q
-#define QINV -3327 // q^-1 mod 2^16
-
-#define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce)
-int16_t montgomery_reduce(int32_t a);
-
-#define barrett_reduce KYBER_NAMESPACE(barrett_reduce)
-int16_t barrett_reduce(int16_t a);
-
-#endif
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c
deleted file mode 100644
index 20f451882e..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
-*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
-*              - uint8_t i: additional byte of input
-*              - uint8_t j: additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128incctx *state,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y)
-{
-  uint8_t extseed[KYBER_SYMBYTES+2];
-
-  memcpy(extseed, seed, KYBER_SYMBYTES);
-  extseed[KYBER_SYMBYTES+0] = x;
-  extseed[KYBER_SYMBYTES+1] = y;
-
-  shake128_absorb_once(state, extseed, sizeof(extseed));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
-{
-  uint8_t extkey[KYBER_SYMBYTES+1];
-
-  memcpy(extkey, key, KYBER_SYMBYTES);
-  extkey[KYBER_SYMBYTES] = nonce;
-
-  shake256(out, outlen, extkey, sizeof(extkey));
-}
-
-/*************************************************
-* Name:        kyber_shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - uint8_t *out: pointer to output
-*              - size_t outlen: number of requested output bytes
-*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
-*              - uint8_t nonce: single-byte nonce (public PRF input)
-**************************************************/
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
-{
-  shake256incctx s;
-
-  shake256_inc_init(&s);
-  shake256_inc_absorb(&s, key, KYBER_SYMBYTES);
-  shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
-  shake256_inc_finalize(&s);
-  shake256_inc_squeeze(out, KYBER_SSBYTES, &s);
-  shake256_inc_ctx_release(&s);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric.h
deleted file mode 100644
index 2acc66f98d..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/symmetric.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#include "fips202.h"
-
-typedef shake128incctx xof_state;
-
-#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(shake128incctx *s,
-                           const uint8_t seed[KYBER_SYMBYTES],
-                           uint8_t x,
-                           uint8_t y);
-
-#define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
-void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
-
-#define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
-void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
-
-#define XOF_BLOCKBYTES SHAKE128_RATE
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_init(STATE, SEED) shake128_inc_init(STATE)
-#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define xof_release(STATE) shake128_inc_ctx_release(STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.c b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.c
deleted file mode 100644
index 914ccd448f..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <stddef.h>
-#include <stdint.h>
-#include "verify.h"
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const uint8_t *a: pointer to first byte array
-*              const uint8_t *b: pointer to second byte array
-*              size_t len:       length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-int verify(const uint8_t *a, const uint8_t *b, size_t len)
-{
-  size_t i;
-  uint8_t r = 0;
-
-  for(i=0;i<len;i++)
-    r |= a[i] ^ b[i];
-
-  return (-(uint64_t)r) >> 63;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   uint8_t *r:       pointer to output byte array
-*              const uint8_t *x: pointer to input byte array
-*              size_t len:       Amount of bytes to be copied
-*              uint8_t b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
-{
-  size_t i;
-
-#if defined(__GNUC__) || defined(__clang__)
-  // Prevent the compiler from
-  //    1) inferring that b is 0/1-valued, and
-  //    2) handling the two cases with a branch.
-  // This is not necessary when verify.c and kem.c are separate translation
-  // units, but we expect that downstream consumers will copy this code and/or
-  // change how it is built.
-  __asm__("" : "+r"(b) : /* no inputs */);
-#endif
-
-  b = -b;
-  for(i=0;i<len;i++)
-    r[i] ^= b & (r[i] ^ x[i]);
-}
-
-
-/*************************************************
-* Name:        cmov_int16
-*
-* Description: Copy input v to *r if b is 1, don't modify *r if b is 0. 
-*              Requires b to be in {0,1};
-*              Runs in constant time.
-*
-* Arguments:   int16_t *r:       pointer to output int16_t
-*              int16_t v:        input int16_t 
-*              uint8_t b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov_int16(int16_t *r, int16_t v, uint16_t b)
-{
-  b = -b;
-  *r ^= b & ((*r) ^ v);
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.h b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.h
deleted file mode 100644
index 09f0ad5046..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_ref/verify.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-#define verify KYBER_NAMESPACE(verify)
-int verify(const uint8_t *a, const uint8_t *b, size_t len);
-
-#define cmov KYBER_NAMESPACE(cmov)
-void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
-
-#define cmov_int16 KYBER_NAMESPACE(cmov_int16)
-void cmov_int16(int16_t *r, int16_t v, uint16_t b);
-
-#endif
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index 967c35e64e..9acc66c56d 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -128,11 +128,8 @@
 
 #cmakedefine OQS_ENABLE_KEM_ML_KEM 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_512 1
-#cmakedefine OQS_ENABLE_KEM_ml_kem_512_avx2 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_768 1
-#cmakedefine OQS_ENABLE_KEM_ml_kem_768_avx2 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024 1
-#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_avx2 1
 
 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1
 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1
diff --git a/tests/test_binary.py b/tests/test_binary.py
index 53e114df00..a673e545ae 100644
--- a/tests/test_binary.py
+++ b/tests/test_binary.py
@@ -33,7 +33,7 @@ def test_namespace():
             symbols.append(line)
 
     # ideally this would be just ['oqs', 'pqclean'], but contains exceptions (e.g., providing compat implementations of unavailable platform functions)
-    namespaces = ['oqs', 'pqclean', 'keccak', 'pqcrystals', 'pqmayo', 'init', 'fini', 'seedexpander', '__x86.get_pc_thunk', 'libjade', 'jade', '__jade', '__jasmin_syscall']
+    namespaces = ['oqs', 'pqclean', 'keccak', 'pqcrystals', 'pqmayo', 'init', 'fini', 'seedexpander', '__x86.get_pc_thunk', 'libjade', 'jade', '__jade', '__jasmin_syscall', 'pqcp']
     non_namespaced = []
 
     for symbolstr in symbols: