diff --git a/.coveragerc b/.coveragerc
deleted file mode 100644
index e69de29bb..000000000
diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml
index 06a70841a..eaabe8aa1 100644
--- a/.github/workflows/package.yml
+++ b/.github/workflows/package.yml
@@ -6,11 +6,14 @@ on:
       - main
   release:
     types: [published]
+  pull_request:
 
 jobs:
-  dist:
-    name: Build Packages
+  build_sdist:
+    name: Build Source Package
     runs-on: ubuntu-latest
+    outputs:
+      filename: ${{steps.build.outputs.filename}}
 
     steps:
       - uses: actions/checkout@v4
@@ -28,22 +31,52 @@ jobs:
       - name: Install Python deps
         run: pip install -U build
 
-      - name: Build distribution
-        run: python -m build
+      - name: Build source distribution
+        id: build
+        run: |
+          python -m build -s
+          basename dist/*.tar.gz |sed -e 's/^/filename=/' >> "$GITHUB_OUTPUT"
+          cat "$GITHUB_OUTPUT"
 
       - name: Save archive
         uses: actions/upload-artifact@v4
         with:
-          name: pypi-pkgs
+          name: dist-src
           path: dist
 
       - name: List dist dir
         run: ls -R dist
 
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    needs: [build_sdist]
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, windows-2019, macos-11]
+
+    steps:
+      - name: Fetch compiled package distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: dist-src
+          path: dist
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.16.5
+        with:
+          package-dir: dist/${{needs.build_sdist.outputs.filename}}
+          output-dir: dist
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-wheels-${{ matrix.os }}
+          path: ./dist/*.whl
+
   pypi-publish:
     name: Publish to PyPI
     runs-on: ubuntu-latest
-    needs: [dist]
+    needs: [build_sdist, build_wheels]
     if: github.event_name == 'release'
 
     environment: release
@@ -54,8 +87,9 @@ jobs:
       - name: Fetch compiled package distributions
         uses: actions/download-artifact@v4
         with:
-          name: pypi-pkgs
+          pattern: dist-*
           path: dist
+          merge-multiple: true
 
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d440c160f..b2c42f642 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -49,6 +49,10 @@ jobs:
           python -V
           numba -s
 
+      - name: Compile extension modules
+        run: |
+          just build-inplace
+
       - name: Test LKPY
         run: |
           python -m pytest --cov=lenskit --verbose --log-file=test.log
@@ -80,15 +84,11 @@ jobs:
         with:
           fetch-depth: 0
 
-      - name: Create Conda environment file
-        run: |
-          pipx run pyproject2conda yaml -p 3.10 -e test -e demo -e dev -o environment.yml
-
       - name: 👢 Set up Conda environment
         uses: mamba-org/setup-micromamba@v1
         id: setup
         with:
-          environment-file: environment.yml
+          environment-file: envs/lenskit-py3.11-dev.yaml
           environment-name: lkpy
           cache-environment: true
           init-shell: bash
@@ -108,6 +108,10 @@ jobs:
             !data/*.zip
           key: test-mldata-000
 
+      - name: Compile extension modules
+        run: |
+          just build-inplace
+
       - name: Download ML data
         run: |
           python -m lenskit.datasets.fetch ml-100k
@@ -137,6 +141,40 @@ jobs:
         with:
           artifact-name: test-check-docs
 
+  cython-cover:
+    name: Measure Cython Coverage
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: 👢 Set up Conda environment
+        uses: mamba-org/setup-micromamba@v1
+        id: setup
+        with:
+          environment-file: envs/lenskit-py3.11-dev.yaml
+          environment-name: lkpy
+          cache-environment: true
+          init-shell: bash
+
+      - name: Compile extension modules
+        run: |
+          just build-inplace
+        env:
+          BUILD_FOR_COVER: 1
+
+      - name: Run Eval Tests
+        run: |
+          python -m pytest --cov=lenskit -m 'not slow' --log-file test-cython-cover.log
+
+      - name: Process test results
+        uses: lenskit/lkbuild/actions/save-test-results@main
+        with:
+          artifact-name: test-cython-cover
+
   vanilla:
     name: Vanilla Python ${{matrix.python}} on ${{matrix.platform}}
     runs-on: ${{matrix.platform}}
@@ -237,6 +275,7 @@ jobs:
     - vanilla
     - check-docs
     - mindep
+    - cython-cover
 
     steps:
       - name: Check out source
diff --git a/envs/lenskit-py3.10-ci.yaml b/envs/lenskit-py3.10-ci.yaml
index 4295864c0..d88eabce3 100644
--- a/envs/lenskit-py3.10-ci.yaml
+++ b/envs/lenskit-py3.10-ci.yaml
@@ -12,10 +12,12 @@ channels:
 dependencies:
   - python=3.10
   - binpickle>=0.3.2
+  - c-compiler
   - cffi>=1.15.0
   - copier==9.*
   - coverage>=5
   - csr>=0.5
+  - cython==3.*
   - docopt>=0.6
   - hypothesis>=6
   - ipython>=7
@@ -24,6 +26,7 @@ dependencies:
   - numpy>=1.23
   - pandas<3,>=1.5
   - pyproject2conda~=0.11
+  - pytest-benchmark==4.*
   - pytest-cov>=2.12
   - pytest-doctestplus==1.*
   - pytest==7.*
diff --git a/envs/lenskit-py3.10-dev.yaml b/envs/lenskit-py3.10-dev.yaml
index c7fa0553a..8fd38e608 100644
--- a/envs/lenskit-py3.10-dev.yaml
+++ b/envs/lenskit-py3.10-dev.yaml
@@ -12,10 +12,12 @@ channels:
 dependencies:
   - python=3.10
   - binpickle>=0.3.2
+  - c-compiler
   - cffi>=1.15.0
   - copier==9.*
   - coverage>=5
   - csr>=0.5
+  - cython==3.*
   - docopt>=0.6
   - hypothesis>=6
   - ipython>=7
@@ -29,6 +31,7 @@ dependencies:
   - numpy>=1.23
   - pandas<3,>=1.5
   - pyproject2conda~=0.11
+  - pytest-benchmark==4.*
   - pytest-cov>=2.12
   - pytest-doctestplus==1.*
   - pytest==7.*
@@ -44,7 +47,6 @@ dependencies:
   - sphinx_rtd_theme>=0.5
   - sphinxcontrib-bibtex>=2.0
   - sphinxext-opengraph>=0.5
-  - tbb
   - threadpoolctl>=3.0
   - tqdm>=4
   - pip
diff --git a/envs/lenskit-py3.10-test.yaml b/envs/lenskit-py3.10-test.yaml
index 643453fa1..d0f792a1b 100644
--- a/envs/lenskit-py3.10-test.yaml
+++ b/envs/lenskit-py3.10-test.yaml
@@ -15,10 +15,12 @@ dependencies:
   - cffi>=1.15.0
   - coverage>=5
   - csr>=0.5
+  - cython==3.*
   - hypothesis>=6
   - numba<0.59,>=0.56
   - numpy>=1.23
   - pandas<3,>=1.5
+  - pytest-benchmark==4.*
   - pytest-cov>=2.12
   - pytest-doctestplus==1.*
   - pytest==7.*
diff --git a/envs/lenskit-py3.11-ci.yaml b/envs/lenskit-py3.11-ci.yaml
index a76eea719..6d37a929b 100644
--- a/envs/lenskit-py3.11-ci.yaml
+++ b/envs/lenskit-py3.11-ci.yaml
@@ -12,10 +12,12 @@ channels:
 dependencies:
   - python=3.11
   - binpickle>=0.3.2
+  - c-compiler
   - cffi>=1.15.0
   - copier==9.*
   - coverage>=5
   - csr>=0.5
+  - cython==3.*
   - docopt>=0.6
   - hypothesis>=6
   - ipython>=7
@@ -24,6 +26,7 @@ dependencies:
   - numpy>=1.23
   - pandas<3,>=1.5
   - pyproject2conda~=0.11
+  - pytest-benchmark==4.*
   - pytest-cov>=2.12
   - pytest-doctestplus==1.*
   - pytest==7.*
diff --git a/envs/lenskit-py3.11-dev.yaml b/envs/lenskit-py3.11-dev.yaml
index b1e994e25..ff5412016 100644
--- a/envs/lenskit-py3.11-dev.yaml
+++ b/envs/lenskit-py3.11-dev.yaml
@@ -12,10 +12,12 @@ channels:
 dependencies:
   - python=3.11
   - binpickle>=0.3.2
+  - c-compiler
   - cffi>=1.15.0
   - copier==9.*
   - coverage>=5
   - csr>=0.5
+  - cython==3.*
   - docopt>=0.6
   - hypothesis>=6
   - ipython>=7
@@ -29,6 +31,7 @@ dependencies:
   - numpy>=1.23
   - pandas<3,>=1.5
   - pyproject2conda~=0.11
+  - pytest-benchmark==4.*
   - pytest-cov>=2.12
   - pytest-doctestplus==1.*
   - pytest==7.*
@@ -44,7 +47,6 @@ dependencies:
   - sphinx_rtd_theme>=0.5
   - sphinxcontrib-bibtex>=2.0
   - sphinxext-opengraph>=0.5
-  - tbb
   - threadpoolctl>=3.0
   - tqdm>=4
   - pip
diff --git a/envs/lenskit-py3.11-test.yaml b/envs/lenskit-py3.11-test.yaml
index c47f53d8d..4ee3e3b77 100644
--- a/envs/lenskit-py3.11-test.yaml
+++ b/envs/lenskit-py3.11-test.yaml
@@ -15,10 +15,12 @@ dependencies:
   - cffi>=1.15.0
   - coverage>=5
   - csr>=0.5
+  - cython==3.*
   - hypothesis>=6
   - numba<0.59,>=0.56
   - numpy>=1.23
   - pandas<3,>=1.5
+  - pytest-benchmark==4.*
   - pytest-cov>=2.12
   - pytest-doctestplus==1.*
   - pytest==7.*
diff --git a/justfile b/justfile
index b7e76b7de..b390357e5 100644
--- a/justfile
+++ b/justfile
@@ -11,6 +11,10 @@ clean:
 build:
     python -m build -n
 
+# build the extension modules in-place for testing
+build-inplace:
+    python setup.py build_ext --inplace
+
 # install the package
 [confirm("this installs package from a wheel, continue [y/N]?")]
 install:
@@ -29,15 +33,15 @@ setup-conda-env version="3.11" env="dev":
     conda env create -n lkpy -f envs/lenskit-py{{version}}-{{env}}.yaml
 
 # run tests with default configuration
-test:
+test: build-inplace
     python -m pytest
 
 # run fast tests
-test-fast:
+test-fast: build-inplace
     python -m pytest -m 'not slow'
 
 # run tests matching a keyword query
-test-matching query:
+test-matching query: build-inplace
     python -m pytest -k {{query}}
 
 # build documentation
diff --git a/lenskit/util/csmatrix.pyi b/lenskit/util/csmatrix.pyi
new file mode 100644
index 000000000..33c4b702e
--- /dev/null
+++ b/lenskit/util/csmatrix.pyi
@@ -0,0 +1,24 @@
+import numpy as np
+import numpy.typing as npt
+from scipy.sparse import csr_array
+
+class CSMatrix:
+    nrows: int
+    ncols: int
+    nnz: int
+
+    rowptr: npt.NDArray[np.int32]
+    colind: npt.NDArray[np.int32]
+    values: npt.NDArray[np.float64]
+
+    def __init__(
+        self,
+        nr: int,
+        nc: int,
+        rps: npt.NDArray[np.int32],
+        cis: npt.NDArray[np.int32],
+        vs: npt.NDArray[np.float64],
+    ): ...
+    def row_ep(self, row: int) -> tuple[int, int]: ...
+    @staticmethod
+    def from_scipy(matrix: csr_array) -> CSMatrix: ...
diff --git a/lenskit/util/csmatrix.pyx b/lenskit/util/csmatrix.pyx
new file mode 100644
index 000000000..a0ac84ad4
--- /dev/null
+++ b/lenskit/util/csmatrix.pyx
@@ -0,0 +1,27 @@
+# cython: language_level=3str
+
+cdef class CSMatrix:
+    cdef readonly int nrows, ncols, nnz
+    cdef readonly int[:] rowptr
+    cdef readonly int[:] colind
+    cdef readonly double[:] values
+
+    def __cinit__(self, int nr, int nc, int[:] rps, int[:] cis, double[:] vs):
+        self.nrows = nr
+        self.ncols = nc
+        self.rowptr = rps
+        self.colind = cis
+        self.values = vs
+        self.nnz = self.rowptr[nr]
+
+    @staticmethod
+    def from_scipy(m):
+        nr, nc = m.shape
+
+        return CSMatrix(nr, nc, m.indptr, m.indices, m.data)
+
+    cpdef (int,int) row_ep(self, row):
+        if row < 0 or row >= self.nrows:
+            raise IndexError(f"invalid row {row} for {self.nrows}x{self.ncols} matrix")
+
+        return self.rowptr[row], self.rowptr[row+1]
diff --git a/lenskit/util/kvp.pyx b/lenskit/util/kvp.pyx
new file mode 100644
index 000000000..44eb3a974
--- /dev/null
+++ b/lenskit/util/kvp.pyx
@@ -0,0 +1,109 @@
+# cython: language_level=3str, initializedcheck=False
+cimport cython
+
+cdef class KVPHeap:
+    cdef readonly int sp, ep, lim
+    cdef int[::1] keys
+    cdef double[::1] vals
+
+    def __cinit__(self, int sp, int ep, int lim, int[::1] keys, double[::1] vals):
+        if ep < sp:
+            raise ValueError("ep before sp")
+        if ep - sp > lim:
+            raise ValueError("array already exceeds limit")
+        if sp + lim > keys.shape[0]:
+            raise ValueError("key array too short")
+        if sp + lim > vals.shape[0]:
+            raise ValueError("value array too short")
+
+        self.sp = sp
+        self.ep = ep
+        self.lim = lim
+        self.keys = keys
+        self.vals = vals
+
+    cpdef int insert(self, int k, double v) except -1:
+        if self.ep - self.sp < self.lim:
+            # insert into heap without size problems
+            # put on end, then upheap
+            self.keys[self.ep] = k
+            self.vals[self.ep] = v
+            self._upheap()
+            self.ep = self.ep + 1
+            return self.ep
+
+        elif v > self.vals[self.sp]:
+            # heap is full, but new value is larger than old min
+            # stick it on the front, and downheap
+            self.keys[self.sp] = k
+            self.vals[self.sp] = v
+            self._downheap(self.lim)
+            return self.ep
+
+        else:
+            # heap is full and new value doesn't belong
+            return self.ep
+
+
+    cpdef void sort(self):
+        cdef int i = self.ep - self.sp - 1
+        while i > 0:
+            self._swap(i, 0)
+            self._downheap(i)
+            i -= 1
+
+
+    cdef void _downheap(self, int limit) noexcept nogil:
+        cdef bint finished = False
+        cdef int pos = 0
+        cdef int min, left, right
+        while not finished:
+            min = pos
+            left = 2 * pos + 1
+            right = 2 * pos + 2
+            if left < limit and self._val(left) < self._val(min):
+                min = left
+            if right < limit and self._val(right) < self._val(min):
+                min = right
+            if min != pos:
+                # we want to swap!
+                self._swap(pos, min)
+                pos = min
+            else:
+                finished = True
+
+
+    cdef void _upheap(self) noexcept nogil:
+        cdef int pos = self.ep - self.sp
+        cdef int parent = (pos - 1) // 2
+        while pos > 0 and self._val(parent) > self._val(pos):
+            self._swap(parent, pos)
+            pos = parent
+            parent = (pos - 1) // 2
+
+
+    cdef int _offset(self, int i) noexcept nogil:
+        return self.sp + i
+
+
+    cdef void _swap(self, int i1, int i2) noexcept nogil:
+        cdef int p1 = self._offset(i1)
+        cdef int p2 = self._offset(i2)
+        cdef int tk
+        cdef double tv
+
+        tk = self.keys[p1]
+        self.keys[p1] = self.keys[p2]
+        self.keys[p2] = tk
+
+        tv = self.vals[p1]
+        self.vals[p1] = self.vals[p2]
+        self.vals[p2] = tv
+
+
+    cdef int _key(self, int i) noexcept nogil:
+        return self.keys[self._offset(i)]
+
+
+    cdef double _val(self, int i) noexcept nogil:
+        return self.vals[self._offset(i)]
diff --git a/pyproject.toml b/pyproject.toml
index 58f39d37b..21bcc6000 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "setuptools_scm>=8"]
+requires = ["setuptools>=64", "setuptools_scm>=8", "cython==3.*"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -41,6 +41,7 @@ dev = [
   "ruff >= 0.2",
   "copier ==9.*",
   "unbeheader ~= 1.3",        # p2c: -p
+  "cython ==3.*",
   "ipython >= 7",
   "pyproject2conda ~=0.11",
   "sphinx-autobuild >= 2021",
@@ -52,6 +53,8 @@ test = [
   "pytest-doctestplus ==1.*",
   "pytest-cov >= 2.12",
   "coverage >= 5",
+  "pytest-benchmark >=4.0.0",
+  "cython ==3.*",
   "hypothesis >= 6",
 ]
 doc = [
@@ -89,12 +92,19 @@ exclude = [
   "tasks.py",
 ]
 
-[tool.setuptools]
-packages = ["lenskit"]
+[tool.setuptools.packages.find]
+include = ["lenskit*"]
 
 [tool.setuptools_scm]
 version_scheme = "release-branch-semver"
 
+[tool.cibuildwheel]
+build = "cp310* cp311*"
+skip = "pp*"
+
+[tool.cibuildwheel.macos]
+archs = "x86_64 arm64"
+
 # settings for generating conda environments for dev & CI, when needed
 [tool.pyproject2conda]
 channels = ["conda-forge"]
@@ -106,11 +116,11 @@ deps = ["tbb", "just"]
 
 [tool.pyproject2conda.envs.dev]
 extras = ["dev", "test", "doc", "demo", "sklearn"]
-deps = ["just==1.*", "tbb"]
+deps = ["just==1.*", "c-compiler", "tbb"]
 
 [tool.pyproject2conda.envs.ci]
 extras = ["test", "sklearn", "dev"]
-deps = ["just==1.*", "tbb"]
+deps = ["just==1.*", "c-compiler", "tbb"]
 
 [tool.ruff]
 line-length = 100
@@ -148,3 +158,6 @@ exclude = [
 ]
 reportMissingImports = true
 reportMissingTypeStubs = false
+
+[tool.coverage.run]
+plugins = ["Cython.Coverage"]
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..13901024d
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,43 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+import os
+
+from setuptools import Extension, setup
+
+try:
+    from Cython.Build import cythonize
+
+    USE_CYTHON = True
+except ImportError:
+    USE_CYTHON = False
+
+
+COVERAGE = os.environ.get("BUILD_FOR_COVER", None)
+EXT_SPECS = {"lenskit.util.kvp": None, "lenskit.util.csmatrix": None}
+
+CYTHON_OPTIONS = {}
+C_DEFINES = []
+if COVERAGE:
+    print("enabling tracing")
+    CYTHON_OPTIONS["linetrace"] = True
+    C_DEFINES.append(("CYTHON_TRACE_NOGIL", "1"))
+
+
+def _make_extension(name: str, opts: None) -> Extension:
+    path = name.replace(".", "/")
+    if USE_CYTHON:
+        path += ".pyx"
+    else:
+        path += ".c"
+    return Extension(name, [path], define_macros=C_DEFINES)
+
+
+EXTENSIONS = [_make_extension(ext, opts) for (ext, opts) in EXT_SPECS.items()]
+if USE_CYTHON:
+    EXTENSIONS = cythonize(EXTENSIONS, compiler_directives=CYTHON_OPTIONS)
+print(EXTENSIONS[0].__dict__)
+setup(ext_modules=EXTENSIONS)
diff --git a/tests/test_csmatrix.py b/tests/test_csmatrix.py
new file mode 100644
index 000000000..3de548382
--- /dev/null
+++ b/tests/test_csmatrix.py
@@ -0,0 +1,62 @@
+import numpy as np
+import scipy.sparse as sps
+from numba import njit
+
+import hypothesis.extra.numpy as nph
+import hypothesis.strategies as st
+from hypothesis import assume, given, settings
+from pytest import mark
+
+from lenskit.util.csmatrix import CSMatrix
+
+
+@st.composite
+def sparse_matrices(draw, max_shape=(50, 50), density=st.floats(0, 1), format="csr"):
+    ubr, ubc = max_shape
+
+    rows = draw(st.integers(1, ubr))
+    cols = draw(st.integers(1, ubc))
+    dens = draw(density)
+    prod = rows * cols
+    nnz = int(prod * dens)
+
+    points = draw(nph.arrays("int32", nnz, elements=st.integers(0, prod - 1), unique=True))
+    values = draw(nph.arrays("float64", nnz))
+    rvs = points % rows
+    cvs = points // rows
+    assert np.all(rvs < rows)
+    assert np.all(cvs < cols)
+
+    return sps.csr_array((values, (rvs, cvs)), shape=(rows, cols))
+
+
+@given(sparse_matrices())
+def test_init_matrix(m: sps.csr_array):
+    print(m.shape, m.nnz, m.indptr.dtype, m.indices.dtype)
+    nr, nc = m.shape
+
+    m2 = CSMatrix(nr, nc, m.indptr, m.indices, m.data)
+
+    assert m2.nrows == nr
+    assert m2.ncols == nc
+    assert m2.nnz == m.nnz
+
+
+@given(sparse_matrices())
+def test_from_scipy(m: sps.csr_array):
+    print(m.shape, m.nnz, m.indptr.dtype, m.indices.dtype)
+    m2 = CSMatrix.from_scipy(m)
+
+    assert m2.nrows == m.shape[0]
+    assert m2.ncols == m.shape[1]
+    assert m2.nnz == m.nnz
+
+
+@given(sparse_matrices())
+def test_csm_row_ep(m: sps.csr_array):
+    m2 = CSMatrix.from_scipy(m)
+
+    for i in range(m2.nrows):
+        sp, ep = m2.row_ep(i)
+        assert sp == m2.rowptr[i]
+        assert ep == m2.rowptr[i + 1]
diff --git a/tests/test_util_accum.py b/tests/test_util_accum.py
index 8e6cd4f35..055aef33e 100644
--- a/tests/test_util_accum.py
+++ b/tests/test_util_accum.py
@@ -5,13 +5,14 @@
 # SPDX-License-Identifier: MIT
 
 import numpy as np
+from numba import njit
 
-from lenskit.util.accum import kvp_minheap_insert, kvp_minheap_sort
-
-
-from hypothesis import given, assume, settings
-import hypothesis.strategies as st
 import hypothesis.extra.numpy as nph
+import hypothesis.strategies as st
+from hypothesis import assume, given, settings
+from pytest import mark
+
+from lenskit.util.accum import kvp_minheap_insert, kvp_minheap_sort
 
 
 def test_kvp_add_to_empty():
@@ -185,3 +186,25 @@ def test_kvp_sort(values):
     assert vs[-1] == np.min(ovs)
     assert all(ks == oks[ord])
     assert all(vs == ovs[ord])
+
+
+@mark.benchmark(group="KVPSort")
+def test_kvp_sort_numba(rng, benchmark):
+    N = 10000
+    K = 500
+    in_keys = np.arange(N)
+    in_vals = rng.uniform(size=N)
+
+    def op():
+        ks = np.zeros(K, np.int32)
+        vs = np.zeros(K, np.float64)
+        ep = 0
+        for i in range(N):
+            ep = kvp_minheap_insert(0, ep, K, in_keys[i], in_vals[i], ks, vs)
+
+        kvp_minheap_sort(0, ep, ks, vs)
+
+    # dry run to compile
+    op()
+
+    benchmark(op)
diff --git a/tests/test_util_kvp.py b/tests/test_util_kvp.py
new file mode 100644
index 000000000..4bc8d644d
--- /dev/null
+++ b/tests/test_util_kvp.py
@@ -0,0 +1,216 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+
+import hypothesis.extra.numpy as nph
+import hypothesis.strategies as st
+from hypothesis import assume, given, settings
+from pytest import mark
+
+from lenskit.util.kvp import KVPHeap
+
+
+def test_kvp_add_to_empty():
+    ks = np.empty(10, dtype=np.int32)
+    vs = np.empty(10)
+
+    # insert an item
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    n = kvp.insert(5, 3.0)
+
+    # ep has moved
+    assert n == 1
+    assert kvp.ep == 1
+
+    # item is there
+    assert ks[0] == 5
+    assert vs[0] == 3.0
+
+
+def test_kvp_add_larger():
+    ks = np.empty(10, dtype=np.int32)
+    vs = np.empty(10)
+
+    # insert an item
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    n = kvp.insert(5, 3.0)
+    n = kvp.insert(1, 6.0)
+
+    # ep has moved
+    assert n == 2
+    assert kvp.ep == 2
+
+    # data is there
+    assert all(ks[:2] == [5, 1])
+    assert all(vs[:2] == [3.0, 6.0])
+
+
+def test_kvp_add_smaller():
+    ks = np.empty(10, dtype=np.int32)
+    vs = np.empty(10)
+
+    # insert an item
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    n = kvp.insert(5, 3.0)
+    n = kvp.insert(1, 1.0)
+
+    # ep has moved
+    assert n == 2
+
+    # data is there
+    assert all(ks[:2] == [1, 5])
+    assert all(vs[:2] == [1.0, 3.0])
+
+
+@given(st.integers(10, 100), st.data())
+def test_kvp_add_several(kvp_len, data):
+    "Test filling up a KVP."
+    ks = np.full(kvp_len, -1, dtype=np.int32)
+    vs = np.zeros(kvp_len)
+
+    n = 0
+
+    values = np.random.randn(kvp_len) * 100
+
+    kvp = KVPHeap(0, 0, kvp_len, ks, vs)
+    for k, v in enumerate(values):
+        n = kvp.insert(k, v)
+
+    assert n == kvp_len
+    # all key slots are used
+    assert all(ks >= 0)
+    # all keys are there
+    assert all(np.sort(ks) == list(range(kvp_len)))
+    # value is the smallest
+    assert vs[0] == np.min(vs)
+
+    # it rejects a smaller value; -10000 is below our min value
+    special_k = 500
+    n2 = kvp.insert(special_k, -10000)
+
+    assert n2 == n
+    assert all(ks != special_k)
+    assert all(vs > -5000.0)
+
+    # it inserts a larger value somewhere
+    old_mk = ks[0]
+    old_mv = vs[0]
+    assume(np.median(vs) < 50)
+    nv = data.draw(st.floats(np.median(vs), 100))
+    n2 = kvp.insert(special_k, nv)
+
+    assert n2 == n
+    # the old value minimum key has been removed
+    assert all(ks != old_mk)
+    # the old minimum value has been removed
+    assert all(vs > old_mv)
+    assert np.count_nonzero(ks == special_k) == 1
+
+
+@given(st.data())
+def test_kvp_add_middle(data):
+    "Test that KVP works in the middle of an array."
+    ks = np.full(100, -1, dtype=np.int32)
+    vs = np.full(100, np.nan)
+
+    n = 25
+    avs = []
+
+    values = st.floats(-100, 100)
+    kvp = KVPHeap(25, 25, 10, ks, vs)
+    for k in range(25):
+        v = data.draw(values)
+        avs.append(v)
+        n = kvp.insert(k, v)
+
+    assert n == 35
+    # all the keys
+    assert all(ks[25:35] >= 0)
+    # value is the smallest
+    assert vs[25] == np.min(vs[25:35])
+    # highest-ranked keys
+    assert all(np.sort(vs[25:35]) == np.sort(avs)[15:])
+
+    # early is untouched
+    assert all(ks[:25] == -1)
+    assert all(np.isnan(vs[:25]))
+    assert all(ks[35:] == -1)
+    assert all(np.isnan(vs[35:]))
+
+
+def test_kvp_insert_min():
+    ks = np.full(10, -1, dtype=np.int32)
+    vs = np.zeros(10)
+
+    n = 0
+
+    # something less than existing data
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    n = kvp.insert(5, -3)
+    assert n == 1
+    assert ks[0] == 5
+    assert vs[0] == -3.0
+
+    # equal to existing data
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    n = kvp.insert(7, -3.0)
+    assert n == 1
+    assert ks[0] == 7
+    assert vs[0] == -3.0
+
+    # greater than to existing data
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    n = kvp.insert(9, 5.0)
+    assert n == 1
+    assert ks[0] == 9
+    assert vs[0] == 5.0
+
+
+@settings(deadline=None)
+@given(nph.arrays(np.float64, 20, elements=st.floats(-100, 100), unique=True))
+def test_kvp_sort(values):
+    "Test that sorting logic works"
+    ks = np.full(10, -1, dtype=np.int32)
+    vs = np.zeros(10)
+
+    n = 0
+
+    kvp = KVPHeap(0, 0, 10, ks, vs)
+    for k in range(20):
+        v = values[k]
+        n = kvp.insert(k, v)
+
+    assert n == 10
+
+    ovs = vs.copy()
+    oks = ks.copy()
+    ord = np.argsort(ovs)
+    ord = ord[::-1]
+
+    kvp.sort()
+    assert vs[0] == np.max(ovs)
+    assert vs[-1] == np.min(ovs)
+    assert all(ks == oks[ord])
+    assert all(vs == ovs[ord])
+
+
+@mark.benchmark(group="KVPSort")
+def test_kvp_sort_cython(rng, benchmark):
+    N = 10000
+    K = 500
+    in_keys = np.arange(N)
+    in_vals = rng.uniform(size=N)
+
+    def op():
+        ks = np.zeros(K, np.int32)
+        vs = np.zeros(K, np.float64)
+        kvp = KVPHeap(0, 0, K, ks, vs)
+        for i in range(N):
+            kvp.insert(in_keys[i], in_vals[i])
+        kvp.sort()
+
+    benchmark(op)