From bdfa94f66a6abbeb6f0d7ff55b42e2c40a6ca6b2 Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 26 Jun 2024 17:08:32 +0200 Subject: [PATCH 01/12] start work on build env --- .github/workflows.bak/build.yml | 90 +++++++++++++++++ .../contrib-tests.yml | 0 .../ebrains-sync.yml | 0 .../lib-tests.yml_disabled | 0 .../mrs-ci-rtx-update.sh | 0 .../mrs-ci-rtx.dockerfile | 0 .../notebooks.yml | 0 .../{workflows => workflows.bak}/pg-tests.yml | 0 .../win-tests.yml | 0 .github/workflows/build.yml | 96 +++++++++---------- 10 files changed, 138 insertions(+), 48 deletions(-) create mode 100644 .github/workflows.bak/build.yml rename .github/{workflows => workflows.bak}/contrib-tests.yml (100%) rename .github/{workflows => workflows.bak}/ebrains-sync.yml (100%) rename .github/{workflows => workflows.bak}/lib-tests.yml_disabled (100%) rename .github/{workflows => workflows.bak}/mrs-ci-rtx-update.sh (100%) rename .github/{workflows => workflows.bak}/mrs-ci-rtx.dockerfile (100%) rename .github/{workflows => workflows.bak}/notebooks.yml (100%) rename .github/{workflows => workflows.bak}/pg-tests.yml (100%) rename .github/{workflows => workflows.bak}/win-tests.yml (100%) diff --git a/.github/workflows.bak/build.yml b/.github/workflows.bak/build.yml new file mode 100644 index 0000000000..f502c2c6f0 --- /dev/null +++ b/.github/workflows.bak/build.yml @@ -0,0 +1,90 @@ +name: Test Py +on: [push] + +jobs: + build: + name: Test and Inspect + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [ "3.10", "3.11" ] + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis + + - name: set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + id: setPy + with: + python-version: ${{ matrix.python-version }} + + - name: put ~/.local/bin on $PATH + run: echo "PATH=$HOME/.local/bin:$PATH" >> $GITHUB_ENV + +# - name: cache ~/.local for pip deps +# id: cache-local +# uses: actions/cache@v3 +# with: +# path: ~/.local +# key: pip-${{ steps.setPy.outputs.version }}-${{ hashFiles('tvb_framework/requirements.txt') }} + + - name: install tools and dependencies +# if: steps.cache-local.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev + python3 -m pip install --upgrade setuptools==59.8.0 pip wheel + pip3 install --user --upgrade numpy + python3 -m pip install scikit-build + pip3 install --user -r tvb_framework/requirements.txt + pip3 install --user --no-build-isolation tvb-gdist + + - name: setup tvb + run: | + cd tvb_build + bash install_full_tvb.sh + + - name: cache data + id: cache-data + uses: actions/cache@v3 + with: + path: tvb_data + key: tvb-data + + - name: download data + if: steps.cache-data.outputs.cache-hit != 'true' + run: | + wget -q https://zenodo.org/record/10128131/files/tvb_data.zip?download=1 -O tvb_data.zip + mkdir tvb_data + unzip tvb_data.zip -d tvb_data + rm tvb_data.zip + + - name: setup data + run: | + cd tvb_data + python3 setup.py develop + + - name: run library tests + run: pytest -v tvb_library --cov --cov-report=xml && mv coverage.xml coverage-library.xml + + - name: run framework tests + env: + KEYCLOAK_CLIENT_ID: ${{ secrets.KEYCLOAK_CLIENT_ID }} # KEYCLOAK_CLIENT_ID & SECRET have priority in tests + KEYCLOAK_CLIENT_SECRET: ${{ secrets.KEYCLOAK_CLIENT_SECRET }} + run: pytest -v tvb_framework --cov --cov-report=xml --ignore=tvb_framework/tvb/interfaces/rest/client/tests/rest_test.py && mv 
coverage.xml coverage-framework.xml + + - name: run storage tests + run: pytest -v tvb_storage --cov --cov-report=xml && mv coverage.xml coverage-storage.xml + + - name: Prepare PATH for Sonar + run: | + echo "PATH=$PATH:/opt/sonar-scanner/bin:/opt/nodejs/bin" >> $GITHUB_ENV + + - name: SonarCloud Scan + uses: SonarSource/sonarcloud-github-action@master + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows.bak/contrib-tests.yml similarity index 100% rename from .github/workflows/contrib-tests.yml rename to .github/workflows.bak/contrib-tests.yml diff --git a/.github/workflows/ebrains-sync.yml b/.github/workflows.bak/ebrains-sync.yml similarity index 100% rename from .github/workflows/ebrains-sync.yml rename to .github/workflows.bak/ebrains-sync.yml diff --git a/.github/workflows/lib-tests.yml_disabled b/.github/workflows.bak/lib-tests.yml_disabled similarity index 100% rename from .github/workflows/lib-tests.yml_disabled rename to .github/workflows.bak/lib-tests.yml_disabled diff --git a/.github/workflows/mrs-ci-rtx-update.sh b/.github/workflows.bak/mrs-ci-rtx-update.sh similarity index 100% rename from .github/workflows/mrs-ci-rtx-update.sh rename to .github/workflows.bak/mrs-ci-rtx-update.sh diff --git a/.github/workflows/mrs-ci-rtx.dockerfile b/.github/workflows.bak/mrs-ci-rtx.dockerfile similarity index 100% rename from .github/workflows/mrs-ci-rtx.dockerfile rename to .github/workflows.bak/mrs-ci-rtx.dockerfile diff --git a/.github/workflows/notebooks.yml b/.github/workflows.bak/notebooks.yml similarity index 100% rename from .github/workflows/notebooks.yml rename to .github/workflows.bak/notebooks.yml diff --git a/.github/workflows/pg-tests.yml b/.github/workflows.bak/pg-tests.yml similarity index 100% rename from .github/workflows/pg-tests.yml rename to .github/workflows.bak/pg-tests.yml diff --git a/.github/workflows/win-tests.yml b/.github/workflows.bak/win-tests.yml similarity index 100% rename from .github/workflows/win-tests.yml rename to .github/workflows.bak/win-tests.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f502c2c6f0..18bef1db3e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,19 +1,19 @@ -name: Test Py +name: build kernel library on: [push] jobs: build: - name: Test and Inspect + name: build runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: [ "3.10", "3.11" ] + python-version: [ "3.10" ] steps: - uses: actions/checkout@v3 with: - fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis + fetch-depth: 0 - name: set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -24,13 +24,6 @@ jobs: - name: put ~/.local/bin on $PATH run: echo "PATH=$HOME/.local/bin:$PATH" >> $GITHUB_ENV -# - name: cache ~/.local for pip deps -# id: cache-local -# uses: actions/cache@v3 -# with: -# path: ~/.local -# key: pip-${{ steps.setPy.outputs.version }}-${{ hashFiles('tvb_framework/requirements.txt') }} - - name: install tools and dependencies # if: steps.cache-local.outputs.cache-hit != 'true' run: | @@ -42,49 +35,56 @@ jobs: pip3 install --user -r tvb_framework/requirements.txt pip3 install --user --no-build-isolation tvb-gdist - - name: setup tvb + - name: install compilers and related run: | - cd tvb_build - bash install_full_tvb.sh + pip3 install --user ctypesgen + curl -LO 
https://github.com/ispc/ispc/releases/download/v1.24.0/ispc-v1.24.0-linux.tar.gz + tar xzf ispc-v1.24.0-linux.tar.gz + ls -lh - - name: cache data - id: cache-data - uses: actions/cache@v3 - with: - path: tvb_data - key: tvb-data + # - name: setup tvb + # run: | + # cd tvb_build + # bash install_full_tvb.sh - - name: download data - if: steps.cache-data.outputs.cache-hit != 'true' - run: | - wget -q https://zenodo.org/record/10128131/files/tvb_data.zip?download=1 -O tvb_data.zip - mkdir tvb_data - unzip tvb_data.zip -d tvb_data - rm tvb_data.zip + # - name: cache data + # id: cache-data + # uses: actions/cache@v3 + # with: + # path: tvb_data + # key: tvb-data - - name: setup data - run: | - cd tvb_data - python3 setup.py develop + # - name: download data + # if: steps.cache-data.outputs.cache-hit != 'true' + # run: | + # wget -q https://zenodo.org/record/10128131/files/tvb_data.zip?download=1 -O tvb_data.zip + # mkdir tvb_data + # unzip tvb_data.zip -d tvb_data + # rm tvb_data.zip - - name: run library tests - run: pytest -v tvb_library --cov --cov-report=xml && mv coverage.xml coverage-library.xml + # - name: setup data + # run: | + # cd tvb_data + # python3 setup.py develop - - name: run framework tests - env: - KEYCLOAK_CLIENT_ID: ${{ secrets.KEYCLOAK_CLIENT_ID }} # KEYCLOAK_CLIENT_ID & SECRET have priority in tests - KEYCLOAK_CLIENT_SECRET: ${{ secrets.KEYCLOAK_CLIENT_SECRET }} - run: pytest -v tvb_framework --cov --cov-report=xml --ignore=tvb_framework/tvb/interfaces/rest/client/tests/rest_test.py && mv coverage.xml coverage-framework.xml + # - name: run library tests + # run: pytest -v tvb_library --cov --cov-report=xml && mv coverage.xml coverage-library.xml - - name: run storage tests - run: pytest -v tvb_storage --cov --cov-report=xml && mv coverage.xml coverage-storage.xml + # - name: run framework tests + # env: + # KEYCLOAK_CLIENT_ID: ${{ secrets.KEYCLOAK_CLIENT_ID }} # KEYCLOAK_CLIENT_ID & SECRET have priority in tests + # KEYCLOAK_CLIENT_SECRET: ${{ secrets.KEYCLOAK_CLIENT_SECRET }} + # run: pytest -v tvb_framework --cov --cov-report=xml --ignore=tvb_framework/tvb/interfaces/rest/client/tests/rest_test.py && mv coverage.xml coverage-framework.xml - - name: Prepare PATH for Sonar - run: | - echo "PATH=$PATH:/opt/sonar-scanner/bin:/opt/nodejs/bin" >> $GITHUB_ENV + # - name: run storage tests + # run: pytest -v tvb_storage --cov --cov-report=xml && mv coverage.xml coverage-storage.xml + + # - name: Prepare PATH for Sonar + # run: | + # echo "PATH=$PATH:/opt/sonar-scanner/bin:/opt/nodejs/bin" >> $GITHUB_ENV - - name: SonarCloud Scan - uses: SonarSource/sonarcloud-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any - SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} + # - name: SonarCloud Scan + # uses: SonarSource/sonarcloud-github-action@master + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any + # SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} From c3f57deeced893c8a70e5ea275b2c187ff5c3b00 Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 26 Jun 2024 17:15:36 +0200 Subject: [PATCH 02/12] add checks --- .github/workflows/build.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 18bef1db3e..a663f43f71 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -40,7 +40,13 @@ jobs: pip3 install --user ctypesgen curl -LO 
https://github.com/ispc/ispc/releases/download/v1.24.0/ispc-v1.24.0-linux.tar.gz tar xzf ispc-v1.24.0-linux.tar.gz - ls -lh + echo "PATH=$PWD/ispc-v1.24.0-linux/bin:$PATH" >> $GITHUB_ENV + + - name: check tools work + run: | + ctypesgen --version + ispc --version + gcc -v # - name: setup tvb # run: | From aac0394024c57765472bb1677b47918c29ad83e5 Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 26 Jun 2024 17:25:23 +0200 Subject: [PATCH 03/12] add first kernels, build & test scripts --- .github/workflows/build.yml | 24 ++++- tvb_kernels/README.md | 49 ++++++++++ tvb_kernels/mkispc.sh | 15 +++ tvb_kernels/nodes.ispc | 187 ++++++++++++++++++++++++++++++++++++ tvb_kernels/test_nodes.py | 182 +++++++++++++++++++++++++++++++++++ 5 files changed, 452 insertions(+), 5 deletions(-) create mode 100644 tvb_kernels/README.md create mode 100755 tvb_kernels/mkispc.sh create mode 100644 tvb_kernels/nodes.ispc create mode 100644 tvb_kernels/test_nodes.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a663f43f71..b32ed2dc22 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,7 +28,7 @@ jobs: # if: steps.cache-local.outputs.cache-hit != 'true' run: | sudo apt-get update - sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev + sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev cmake python3 -m pip install --upgrade setuptools==59.8.0 pip wheel pip3 install --user --upgrade numpy python3 -m pip install scikit-build @@ -37,10 +37,16 @@ jobs: - name: install compilers and related run: | - pip3 install --user ctypesgen - curl -LO https://github.com/ispc/ispc/releases/download/v1.24.0/ispc-v1.24.0-linux.tar.gz - tar xzf ispc-v1.24.0-linux.tar.gz - echo "PATH=$PWD/ispc-v1.24.0-linux/bin:$PATH" >> $GITHUB_ENV + pip3 install --user ctypesgen nanobind + # curl -LO https://github.com/ispc/ispc/releases/download/v1.24.0/ispc-v1.24.0-linux.tar.gz + # tar xzf ispc-v1.24.0-linux.tar.gz + # echo "PATH=$PWD/ispc-v1.24.0-linux/bin:$PATH" >> $GITHUB_ENV + + - name: setup ispc + uses: ScatteredRay/install-ispc-action@main + with: + version: 1.23.0 + platform: linux - name: check tools work run: | @@ -48,6 +54,14 @@ jobs: ispc --version gcc -v + - name: test first kernels + run: | + cd tvb_kernels + ./mkispc.sh nodes.ispc + cmake -S . -B build + cmake --build build + PYTHONPATH=build python test_nodes.py + # - name: setup tvb # run: | # cd tvb_build diff --git a/tvb_kernels/README.md b/tvb_kernels/README.md new file mode 100644 index 0000000000..971247deee --- /dev/null +++ b/tvb_kernels/README.md @@ -0,0 +1,49 @@ +# tvb_kernels + +This is a library of computational kernels for TVB. + +## scope + +in order of priority + +- [ ] sparse delay coupling functions +- [ ] fused heun neural mass model step functions +- [ ] neural ODE +- [ ] bold / tavg monitors + +## building + +For now, a `make` invocation is enough, which calls `mkispc.sh` to build +the ISPC kernels, then CMake to build the nanobind wrappers. The next +steps will convert this to a pure CMake based process. + +## variants + +### implementation + +ISPC compiler provides the best performance, followed by a C/C++ compiler. +To handle cases where all compilers are not available, the pattern will +be as follows + +Python class owns kernel workspace (arrays, floats, int) and has a +short, obvious NumPy based implementation. 
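+
+For instance, a minimal sketch of this pattern (the class and method
+names here are illustrative, not a committed API):
+
+```python
+import numpy as np
+
+class DelayCoupling:
+    # owns the kernel workspace and provides the NumPy reference
+    def __init__(self, weights, idelays, horizon=256):
+        self.weights = weights                 # dense (n, n) coupling weights
+        self.idelays = idelays                 # dense (n, n) integer delays
+        self.buf = np.zeros((weights.shape[0], horizon), 'f')
+        self.cx = np.zeros((2, weights.shape[0]), 'f')
+
+    def __call__(self, t):
+        n, h = self.buf.shape
+        j = np.arange(n)
+        # weighted gather of delayed states, summed over source nodes
+        self.cx[0] = (self.weights * self.buf[j, (t - self.idelays) % h]).sum(axis=1)
+        self.cx[1] = (self.weights * self.buf[j, (t - self.idelays + 1) % h]).sum(axis=1)
+        return self.cx
+```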
+ +A nanobind binding layer optionally implements calls to +- a short, obvious C++ implementation +- an (possibly multithreaded) ISPC implementation +- a CUDA implementation, if arrays are on GPU + +### batch variants + +It may be desirable to compute batches of a kernel. + +### parameters + +Parameters may be global or "spatialized" meaning different +values for every node. Kernels should provide separate calls +for both cases. + +## matlab + +Given some users in matlab, it could be useful to wrap some of the algorithms +as MEX files. diff --git a/tvb_kernels/mkispc.sh b/tvb_kernels/mkispc.sh new file mode 100755 index 0000000000..4f07d3b624 --- /dev/null +++ b/tvb_kernels/mkispc.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +src=$1 +set -eux + +h=${src}.h +obj=${src}.o +dll=${src}.so +mod=${src%".ispc"} +py=${mod}.py + +ispc -g -O3 --pic ${src} -h ${h} -o ${obj} +gcc -shared ${obj} -o ${dll} +rm -f ${py} +ctypesgen -l ./${dll} ${h} > ${py} diff --git a/tvb_kernels/nodes.ispc b/tvb_kernels/nodes.ispc new file mode 100644 index 0000000000..4ed349f465 --- /dev/null +++ b/tvb_kernels/nodes.ispc @@ -0,0 +1,187 @@ +// pointer is constant and same for all threads, as are elements pointed to +typedef const uniform int *const uniform data_ints; +typedef const uniform float *const uniform data_floats; + +// pointer is constant and same for all threads, +// elements pointed to are same for all threads, but modifiable +typedef uniform int *const uniform work_ints; +typedef uniform float *const uniform work_floats; + +struct params { + const int count; + data_floats values; +}; + +// connectivity model with csr format sparse connections & delay buffer +struct connectivity { + const uniform int num_node; + const uniform int num_nonzero; + const uniform int num_cvar; + // horizon must be power of two + const int horizon; + const int horizon_minus_1; + data_floats weights; // (num_nonzero,) + data_ints indices; // (num_nonzero,) + data_ints indptr; // (num_nodes+1,) + data_ints idelays; // (num_nonzero,) + work_floats buf; // delay buffer (num_cvar, num_nodes, horizon) + work_floats cx1; + work_floats cx2; +}; + +struct sim { + // keep invariant stuff at the top, per sim stuff below + const int rng_seed; + const int num_node; + const int num_svar; + const int num_time; + const int num_params; + const int num_spatial_params; + const float dt; + const int oversample; // TODO "oversample" for stability, + const int num_skip; // skip per output sample + work_floats z_scale; // (num_svar), sigma*sqrt(dt) + + // parameters + const uniform params global_params; + const uniform params spatial_params; + + work_floats state_trace; // (num_time//num_skip, num_svar, num_nodes) + work_floats states; // full states (num_svar, num_nodes) + + const uniform connectivity conn; +}; + +typedef const uniform struct sim the_sim; +typedef const uniform struct connectivity the_conn; + +// case of small number of nodes (90 floats is 360 bytes), dense, long horizon +// probably just hit a single node's horizon: random increments +export void cx_all_j( + the_conn &c, + uniform int t, uniform int j) +{ + uniform float * const buf = c.buf + j*c.horizon; + uniform int th = t + c.horizon; + // here assuming columns are targets + foreach (l = c.indptr[j] ... 
c.indptr[j + 1])
+  {
+    int i = c.indices[l];
+    float w = c.weights[l];
+    int d = c.idelays[l];
+    int p1 = (th - d) & c.horizon_minus_1;
+    int p2 = (th - d + 1) & c.horizon_minus_1;
+    c.cx1[i] += w * buf[p1];
+    c.cx2[i] += w * buf[p2];
+  }
+}
+
+export void cx_all(the_conn &c, uniform int t)
+{
+  foreach (i = 0 ... c.num_node)
+    c.cx1[i] = c.cx2[i] = 0.0f;
+
+  for (uniform int j=0; j<c.num_node; j++)
+    cx_all_j(c, t, j);
+}
[...]

From: Marmaduke Woodman
Date: Fri, 28 Jun 2024 10:35:54 +0200
Subject: [PATCH 04/12] add nanobound coupling kernel & build

---
 .github/workflows.bak/build.yml               |  90 -------
 .github/workflows/build.yml                   | 112 ++++----
 .../contrib-tests.yml                         |   0
 .../ebrains-sync.yml                          |   0
 .github/workflows/kernels.yml                 |  74 ++++++
 .../lib-tests.yml_disabled                    |   0
 .../mrs-ci-rtx-update.sh                      |   0
 .../mrs-ci-rtx.dockerfile                     |   0
 .../notebooks.yml                             |   0
 .../{workflows.bak => workflows}/pg-tests.yml |   0
 .../win-tests.yml                             |   0
 tvb_kernels/.gitignore                        |   7 +
 tvb_kernels/CMakeLists.txt                    |  57 ++++
 tvb_kernels/README.md                         |  58 ++--
 tvb_kernels/ghci-build.yml                    |   1 +
 tvb_kernels/mkispc.sh                         |  15 --
 tvb_kernels/nodes.ispc                        | 187 -------------
 tvb_kernels/pyproject.toml                    |  23 ++
 tvb_kernels/rebuild.sh                        |  11 +
 tvb_kernels/spack.yaml                        |  20 ++
 tvb_kernels/test_nodes.py                     | 248 ++++++------------
 tvb_kernels/tvb_kernels.cpp                   |  49 ++++
 tvb_kernels/tvbk.h                            |  58 ++++
 tvb_kernels/tvbk_conn.c                       |  50 ++++
 24 files changed, 497 insertions(+), 563 deletions(-)
 delete mode 100644 .github/workflows.bak/build.yml
 rename .github/{workflows.bak => workflows}/contrib-tests.yml (100%)
 rename .github/{workflows.bak => workflows}/ebrains-sync.yml (100%)
 create mode 100644 .github/workflows/kernels.yml
 rename .github/{workflows.bak => workflows}/lib-tests.yml_disabled (100%)
 rename .github/{workflows.bak => workflows}/mrs-ci-rtx-update.sh (100%)
 rename .github/{workflows.bak => workflows}/mrs-ci-rtx.dockerfile (100%)
 rename .github/{workflows.bak => workflows}/notebooks.yml (100%)
 rename .github/{workflows.bak => workflows}/pg-tests.yml (100%)
 rename .github/{workflows.bak => workflows}/win-tests.yml (100%)
 create mode 100644 tvb_kernels/.gitignore
 create mode 100644 tvb_kernels/CMakeLists.txt
 create mode 120000 tvb_kernels/ghci-build.yml
 delete mode 100755 tvb_kernels/mkispc.sh
 delete mode 100644 tvb_kernels/nodes.ispc
 create mode 100644 tvb_kernels/pyproject.toml
 create mode 100644 tvb_kernels/rebuild.sh
 create mode 100644 tvb_kernels/spack.yaml
 create mode 100644 tvb_kernels/tvb_kernels.cpp
 create mode 100644 tvb_kernels/tvbk.h
 create mode 100644 tvb_kernels/tvbk_conn.c

diff --git a/.github/workflows.bak/build.yml b/.github/workflows.bak/build.yml
deleted file mode 100644
index f502c2c6f0..0000000000
--- a/.github/workflows.bak/build.yml
+++ /dev/null
@@ -1,90 +0,0 @@
-name: Test Py
-on: [push]
-
-jobs:
-  build:
-    name: Test and Inspect
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [ "3.10", "3.11" ]
-
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
-
-      - name: set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        id: setPy
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: put ~/.local/bin on $PATH
-        run: echo "PATH=$HOME/.local/bin:$PATH" >> $GITHUB_ENV
-
-#      - name: cache ~/.local for pip deps
-#        id: cache-local
-#        uses: actions/cache@v3
-#        with:
-#          path: ~/.local
-#          key: pip-${{ steps.setPy.outputs.version }}-${{ hashFiles('tvb_framework/requirements.txt') }}
-
-      - name: install tools and dependencies
-#        if: steps.cache-local.outputs.cache-hit != 'true'
- 
run: | - sudo apt-get update - sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev - python3 -m pip install --upgrade setuptools==59.8.0 pip wheel - pip3 install --user --upgrade numpy - python3 -m pip install scikit-build - pip3 install --user -r tvb_framework/requirements.txt - pip3 install --user --no-build-isolation tvb-gdist - - - name: setup tvb - run: | - cd tvb_build - bash install_full_tvb.sh - - - name: cache data - id: cache-data - uses: actions/cache@v3 - with: - path: tvb_data - key: tvb-data - - - name: download data - if: steps.cache-data.outputs.cache-hit != 'true' - run: | - wget -q https://zenodo.org/record/10128131/files/tvb_data.zip?download=1 -O tvb_data.zip - mkdir tvb_data - unzip tvb_data.zip -d tvb_data - rm tvb_data.zip - - - name: setup data - run: | - cd tvb_data - python3 setup.py develop - - - name: run library tests - run: pytest -v tvb_library --cov --cov-report=xml && mv coverage.xml coverage-library.xml - - - name: run framework tests - env: - KEYCLOAK_CLIENT_ID: ${{ secrets.KEYCLOAK_CLIENT_ID }} # KEYCLOAK_CLIENT_ID & SECRET have priority in tests - KEYCLOAK_CLIENT_SECRET: ${{ secrets.KEYCLOAK_CLIENT_SECRET }} - run: pytest -v tvb_framework --cov --cov-report=xml --ignore=tvb_framework/tvb/interfaces/rest/client/tests/rest_test.py && mv coverage.xml coverage-framework.xml - - - name: run storage tests - run: pytest -v tvb_storage --cov --cov-report=xml && mv coverage.xml coverage-storage.xml - - - name: Prepare PATH for Sonar - run: | - echo "PATH=$PATH:/opt/sonar-scanner/bin:/opt/nodejs/bin" >> $GITHUB_ENV - - - name: SonarCloud Scan - uses: SonarSource/sonarcloud-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any - SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b32ed2dc22..f502c2c6f0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,19 +1,19 @@ -name: build kernel library +name: Test Py on: [push] jobs: build: - name: build + name: Test and Inspect runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: [ "3.10" ] + python-version: [ "3.10", "3.11" ] steps: - uses: actions/checkout@v3 with: - fetch-depth: 0 + fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis - name: set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -24,87 +24,67 @@ jobs: - name: put ~/.local/bin on $PATH run: echo "PATH=$HOME/.local/bin:$PATH" >> $GITHUB_ENV +# - name: cache ~/.local for pip deps +# id: cache-local +# uses: actions/cache@v3 +# with: +# path: ~/.local +# key: pip-${{ steps.setPy.outputs.version }}-${{ hashFiles('tvb_framework/requirements.txt') }} + - name: install tools and dependencies # if: steps.cache-local.outputs.cache-hit != 'true' run: | sudo apt-get update - sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev cmake + sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev python3 -m pip install --upgrade setuptools==59.8.0 pip wheel pip3 install --user --upgrade numpy python3 -m pip install scikit-build pip3 install --user -r tvb_framework/requirements.txt pip3 install --user --no-build-isolation tvb-gdist - - name: install compilers and related + - name: setup tvb run: | - pip3 install --user ctypesgen nanobind - # curl -LO https://github.com/ispc/ispc/releases/download/v1.24.0/ispc-v1.24.0-linux.tar.gz - # tar xzf ispc-v1.24.0-linux.tar.gz - # echo "PATH=$PWD/ispc-v1.24.0-linux/bin:$PATH" >> $GITHUB_ENV + 
cd tvb_build + bash install_full_tvb.sh - - name: setup ispc - uses: ScatteredRay/install-ispc-action@main + - name: cache data + id: cache-data + uses: actions/cache@v3 with: - version: 1.23.0 - platform: linux + path: tvb_data + key: tvb-data - - name: check tools work + - name: download data + if: steps.cache-data.outputs.cache-hit != 'true' run: | - ctypesgen --version - ispc --version - gcc -v + wget -q https://zenodo.org/record/10128131/files/tvb_data.zip?download=1 -O tvb_data.zip + mkdir tvb_data + unzip tvb_data.zip -d tvb_data + rm tvb_data.zip - - name: test first kernels + - name: setup data run: | - cd tvb_kernels - ./mkispc.sh nodes.ispc - cmake -S . -B build - cmake --build build - PYTHONPATH=build python test_nodes.py - - # - name: setup tvb - # run: | - # cd tvb_build - # bash install_full_tvb.sh - - # - name: cache data - # id: cache-data - # uses: actions/cache@v3 - # with: - # path: tvb_data - # key: tvb-data + cd tvb_data + python3 setup.py develop - # - name: download data - # if: steps.cache-data.outputs.cache-hit != 'true' - # run: | - # wget -q https://zenodo.org/record/10128131/files/tvb_data.zip?download=1 -O tvb_data.zip - # mkdir tvb_data - # unzip tvb_data.zip -d tvb_data - # rm tvb_data.zip + - name: run library tests + run: pytest -v tvb_library --cov --cov-report=xml && mv coverage.xml coverage-library.xml - # - name: setup data - # run: | - # cd tvb_data - # python3 setup.py develop + - name: run framework tests + env: + KEYCLOAK_CLIENT_ID: ${{ secrets.KEYCLOAK_CLIENT_ID }} # KEYCLOAK_CLIENT_ID & SECRET have priority in tests + KEYCLOAK_CLIENT_SECRET: ${{ secrets.KEYCLOAK_CLIENT_SECRET }} + run: pytest -v tvb_framework --cov --cov-report=xml --ignore=tvb_framework/tvb/interfaces/rest/client/tests/rest_test.py && mv coverage.xml coverage-framework.xml - # - name: run library tests - # run: pytest -v tvb_library --cov --cov-report=xml && mv coverage.xml coverage-library.xml + - name: run storage tests + run: pytest -v tvb_storage --cov --cov-report=xml && mv coverage.xml coverage-storage.xml - # - name: run framework tests - # env: - # KEYCLOAK_CLIENT_ID: ${{ secrets.KEYCLOAK_CLIENT_ID }} # KEYCLOAK_CLIENT_ID & SECRET have priority in tests - # KEYCLOAK_CLIENT_SECRET: ${{ secrets.KEYCLOAK_CLIENT_SECRET }} - # run: pytest -v tvb_framework --cov --cov-report=xml --ignore=tvb_framework/tvb/interfaces/rest/client/tests/rest_test.py && mv coverage.xml coverage-framework.xml - - # - name: run storage tests - # run: pytest -v tvb_storage --cov --cov-report=xml && mv coverage.xml coverage-storage.xml - - # - name: Prepare PATH for Sonar - # run: | - # echo "PATH=$PATH:/opt/sonar-scanner/bin:/opt/nodejs/bin" >> $GITHUB_ENV + - name: Prepare PATH for Sonar + run: | + echo "PATH=$PATH:/opt/sonar-scanner/bin:/opt/nodejs/bin" >> $GITHUB_ENV - # - name: SonarCloud Scan - # uses: SonarSource/sonarcloud-github-action@master - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any - # SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} + - name: SonarCloud Scan + uses: SonarSource/sonarcloud-github-action@master + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.github/workflows.bak/contrib-tests.yml b/.github/workflows/contrib-tests.yml similarity index 100% rename from .github/workflows.bak/contrib-tests.yml rename to .github/workflows/contrib-tests.yml diff --git a/.github/workflows.bak/ebrains-sync.yml b/.github/workflows/ebrains-sync.yml similarity 
index 100% rename from .github/workflows.bak/ebrains-sync.yml rename to .github/workflows/ebrains-sync.yml diff --git a/.github/workflows/kernels.yml b/.github/workflows/kernels.yml new file mode 100644 index 0000000000..d7adca3c7a --- /dev/null +++ b/.github/workflows/kernels.yml @@ -0,0 +1,74 @@ +name: build & test kernel library +on: [push] + +jobs: + pip-tvbk: + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + python-version: [ "3.10", ] + + steps: + + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + id: setPy + with: + python-version: ${{ matrix.python-version }} + + - name: install tools and dependencies + run: python3 -m pip install -U setuptools pip wheel pytest scipy + + - name: build + run: cd tvb_kernels; pip install . + + - name: test + run: cd tvb_kernels; python -m pytest + + + spack-tvbk: + runs-on: ubuntu-latest + steps: + + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: setup spack + uses: spack/setup-spack@v2 + with: + ref: develop + buildcache: true + color: true + path: spack + + - name: install deps w/ spack + shell: spack-bash {0} + run: | + cd tvb_kernels + spack -e . concretize + spack -e . install + spack env activate . + spack env status + pip install pytest + + - name: build nanobound library + shell: spack-bash {0} + run: | + cd tvb_kernels + spack env activate . + pip install . + pytest + + # https://github.com/spack/setup-spack?tab=readme-ov-file#example-caching-your-own-binaries-for-public-repositories + - name: Push packages and update index + run: | + cd tvb_kernels + spack -e . mirror set --push --oci-username ${{ github.actor }} --oci-password "${{ secrets.GITHUB_TOKEN }}" local-buildcache + spack -e . 
buildcache push --base-image ubuntu:22.04 --update-index local-buildcache + if: ${{ !cancelled() }} diff --git a/.github/workflows.bak/lib-tests.yml_disabled b/.github/workflows/lib-tests.yml_disabled similarity index 100% rename from .github/workflows.bak/lib-tests.yml_disabled rename to .github/workflows/lib-tests.yml_disabled diff --git a/.github/workflows.bak/mrs-ci-rtx-update.sh b/.github/workflows/mrs-ci-rtx-update.sh similarity index 100% rename from .github/workflows.bak/mrs-ci-rtx-update.sh rename to .github/workflows/mrs-ci-rtx-update.sh diff --git a/.github/workflows.bak/mrs-ci-rtx.dockerfile b/.github/workflows/mrs-ci-rtx.dockerfile similarity index 100% rename from .github/workflows.bak/mrs-ci-rtx.dockerfile rename to .github/workflows/mrs-ci-rtx.dockerfile diff --git a/.github/workflows.bak/notebooks.yml b/.github/workflows/notebooks.yml similarity index 100% rename from .github/workflows.bak/notebooks.yml rename to .github/workflows/notebooks.yml diff --git a/.github/workflows.bak/pg-tests.yml b/.github/workflows/pg-tests.yml similarity index 100% rename from .github/workflows.bak/pg-tests.yml rename to .github/workflows/pg-tests.yml diff --git a/.github/workflows.bak/win-tests.yml b/.github/workflows/win-tests.yml similarity index 100% rename from .github/workflows.bak/win-tests.yml rename to .github/workflows/win-tests.yml diff --git a/tvb_kernels/.gitignore b/tvb_kernels/.gitignore new file mode 100644 index 0000000000..7e0eaaea18 --- /dev/null +++ b/tvb_kernels/.gitignore @@ -0,0 +1,7 @@ +.cache +build +*.jpg +build +.spack-env +spack.lock +*.whl diff --git a/tvb_kernels/CMakeLists.txt b/tvb_kernels/CMakeLists.txt new file mode 100644 index 0000000000..1ff9f87590 --- /dev/null +++ b/tvb_kernels/CMakeLists.txt @@ -0,0 +1,57 @@ +cmake_minimum_required(VERSION 3.18...3.27) +project(tvb_kernels LANGUAGES C CXX) + +if (NOT SKBUILD) + message(WARNING "\ + This CMake file is meant to be executed using 'scikit-build-core'. + Running it directly will almost certainly not produce the desired + result. If you are a user trying to install this package, use the + command below, which will install all necessary build dependencies, + compile the package in an isolated environment, and then install it. + ===================================================================== + $ pip install . + ===================================================================== + If you are a software developer, and this is your own package, then + it is usually much more efficient to install the build dependencies + in your environment once and use the following command that avoids + a costly creation of a new virtual environment at every compilation: + ===================================================================== + $ pip install nanobind scikit-build-core[pyproject] + $ pip install --no-build-isolation -ve . + ===================================================================== + You may optionally add -Ceditable.rebuild=true to auto-rebuild when + the package is imported. Otherwise, you need to rerun the above + after editing C++ files.") +endif() + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +add_compile_options(-std=c++17 -fopenmp-simd -mavx2 -march=native -mtune=native) + +find_package(Python 3.8 + REQUIRED COMPONENTS Interpreter Development.Module +) + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +# Detect the installed nanobind package and import it into CMake +execute_process( + COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir + OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR) +list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") +find_package(nanobind CONFIG REQUIRED) + +nanobind_add_module(tvb_kernels tvb_kernels.cpp tvbk_conn.c) + +nanobind_add_stub( + tvb_kernels_stub + MODULE tvb_kernels + OUTPUT tvb_kernels.pyi + PYTHON_PATH $ + DEPENDS tvb_kernels + MARKER_FILE py.typed +) + +install(TARGETS tvb_kernels LIBRARY DESTINATION tvb_kernels) diff --git a/tvb_kernels/README.md b/tvb_kernels/README.md index 971247deee..60ed194728 100644 --- a/tvb_kernels/README.md +++ b/tvb_kernels/README.md @@ -6,44 +6,34 @@ This is a library of computational kernels for TVB. in order of priority -- [ ] sparse delay coupling functions +- [x] sparse delay coupling functions - [ ] fused heun neural mass model step functions - [ ] neural ODE - [ ] bold / tavg monitors -## building - -For now, a `make` invocation is enough, which calls `mkispc.sh` to build -the ISPC kernels, then CMake to build the nanobind wrappers. The next -steps will convert this to a pure CMake based process. - -## variants - -### implementation - -ISPC compiler provides the best performance, followed by a C/C++ compiler. -To handle cases where all compilers are not available, the pattern will -be as follows - -Python class owns kernel workspace (arrays, floats, int) and has a -short, obvious NumPy based implementation. +which will then be used internally by TVB. -A nanobind binding layer optionally implements calls to -- a short, obvious C++ implementation -- an (possibly multithreaded) ISPC implementation -- a CUDA implementation, if arrays are on GPU - -### batch variants - -It may be desirable to compute batches of a kernel. - -### parameters - -Parameters may be global or "spatialized" meaning different -values for every node. Kernels should provide separate calls -for both cases. +## building -## matlab +No stable versions are available, so users can install from +source with `pip install .` or to just get a wheel `pip wheel .` -Given some users in matlab, it could be useful to wrap some of the algorithms -as MEX files. +For development, prefer +``` +pip install nanobind scikit-build-core[pyproject] +pip install --no-build-isolation -Ceditable.rebuild=true -ve . +``` + +## roadmap + +- improve api (ownership, checking etc) +- variants + - single + - batches + - spatialized parameters +- cuda ports for Jax, CuPy and Torch users +- mex functions? 
+- scalable components + - domains + - projections + - small vm to automate inner loop diff --git a/tvb_kernels/ghci-build.yml b/tvb_kernels/ghci-build.yml new file mode 120000 index 0000000000..3d02e1e42d --- /dev/null +++ b/tvb_kernels/ghci-build.yml @@ -0,0 +1 @@ +../.github/workflows/build.yml \ No newline at end of file diff --git a/tvb_kernels/mkispc.sh b/tvb_kernels/mkispc.sh deleted file mode 100755 index 4f07d3b624..0000000000 --- a/tvb_kernels/mkispc.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -src=$1 -set -eux - -h=${src}.h -obj=${src}.o -dll=${src}.so -mod=${src%".ispc"} -py=${mod}.py - -ispc -g -O3 --pic ${src} -h ${h} -o ${obj} -gcc -shared ${obj} -o ${dll} -rm -f ${py} -ctypesgen -l ./${dll} ${h} > ${py} diff --git a/tvb_kernels/nodes.ispc b/tvb_kernels/nodes.ispc deleted file mode 100644 index 4ed349f465..0000000000 --- a/tvb_kernels/nodes.ispc +++ /dev/null @@ -1,187 +0,0 @@ -// pointer is constant and same for all threads, as are elements pointed to -typedef const uniform int *const uniform data_ints; -typedef const uniform float *const uniform data_floats; - -// pointer is constant and same for all threads, -// elements pointed to are same for all threads, but modifiable -typedef uniform int *const uniform work_ints; -typedef uniform float *const uniform work_floats; - -struct params { - const int count; - data_floats values; -}; - -// connectivity model with csr format sparse connections & delay buffer -struct connectivity { - const uniform int num_node; - const uniform int num_nonzero; - const uniform int num_cvar; - // horizon must be power of two - const int horizon; - const int horizon_minus_1; - data_floats weights; // (num_nonzero,) - data_ints indices; // (num_nonzero,) - data_ints indptr; // (num_nodes+1,) - data_ints idelays; // (num_nonzero,) - work_floats buf; // delay buffer (num_cvar, num_nodes, horizon) - work_floats cx1; - work_floats cx2; -}; - -struct sim { - // keep invariant stuff at the top, per sim stuff below - const int rng_seed; - const int num_node; - const int num_svar; - const int num_time; - const int num_params; - const int num_spatial_params; - const float dt; - const int oversample; // TODO "oversample" for stability, - const int num_skip; // skip per output sample - work_floats z_scale; // (num_svar), sigma*sqrt(dt) - - // parameters - const uniform params global_params; - const uniform params spatial_params; - - work_floats state_trace; // (num_time//num_skip, num_svar, num_nodes) - work_floats states; // full states (num_svar, num_nodes) - - const uniform connectivity conn; -}; - -typedef const uniform struct sim the_sim; -typedef const uniform struct connectivity the_conn; - -// case of small number of nodes (90 floats is 360 bytes), dense, long horizon -// probably just hit a single node's horizon: random increments -export void cx_all_j( - the_conn &c, - uniform int t, uniform int j) -{ - uniform float * const buf = c.buf + j*c.horizon; - uniform int th = t + c.horizon; - // here assuming columns are targets - foreach (l = c.indptr[j] ... c.indptr[j + 1]) - { - int i = c.indices[l]; - float w = c.weights[l]; - int d = c.idelays[l]; - int p1 = (th - d) & c.horizon_minus_1; - int p2 = (th - d + 1) & c.horizon_minus_1; - c.cx1[i] += w * buf[p1]; - c.cx2[i] += w * buf[p2]; - } -} - -export void cx_all(the_conn &c, uniform int t) -{ - foreach (i = 0 ... 
c.num_node)
-    c.cx1[i] = c.cx2[i] = 0.0f;
-
-  for (uniform int j=0; j<c.num_node; j++)
-    cx_all_j(c, t, j);
-}
[...]

diff --git a/tvb_kernels/pyproject.toml b/tvb_kernels/pyproject.toml
new file mode 100644
index 0000000000..de34153a36
--- /dev/null
+++ b/tvb_kernels/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = ["scikit-build-core >=0.4.3", "nanobind >=1.3.2", "typing_extensions"]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "tvb_kernels"
+version = "0.0.1"
+description = "TVB computational kernels"
+readme = "README.md"
+requires-python = ">=3.8"
+authors = [
+  { name = "Marmaduke Woodman", email = "marmaduke.woodman@univ-amu.fr" },
+]
+classifiers = [ ]
+dependencies = [ "numpy" ]
+
+[project.urls]
+Homepage = "https://github.com/the-virtual-brain/tvb-root"
+
+[tool.scikit-build]
+minimum-version = "0.4"
+build-dir = "build/{wheel_tag}"
+wheel.py-api = "cp312"

diff --git a/tvb_kernels/rebuild.sh b/tvb_kernels/rebuild.sh
new file mode 100644
index 0000000000..f18d0eac54
--- /dev/null
+++ b/tvb_kernels/rebuild.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+rm -rf build
+
+cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
+
+pushd build
+make
+popd
+
+ctypesgen -l build/libtvbk.so src/nodes.h > nodes.py

diff --git a/tvb_kernels/spack.yaml b/tvb_kernels/spack.yaml
new file mode 100644
index 0000000000..5eb851f7e6
--- /dev/null
+++ b/tvb_kernels/spack.yaml
@@ -0,0 +1,20 @@
+# This is a Spack Environment file.
+#
+# It describes a set of packages to be installed, along with
+# configuration settings.
+spack:
+  # add package specs to the `specs` list
+  specs:
+  - py-numpy
+  - py-nanobind
+  - py-scipy
+  - py-matplotlib
+  - py-pip
+  - cmake
+  concretizer:
+    unify: true
+
+  mirrors:
+    local-buildcache:
+      url: oci://ghcr.io/maedoc/spack-buildcache
+      signed: false
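The rewritten test that follows checks both C kernels against a NumPy reference built with `np.add.reduceat`. A condensed, self-contained version of that reference (the same logic as the `make_cfun_np` closure in the diff below; the function signature here is illustrative):

```python
import numpy as np
import scipy.sparse

def make_cfun_np(weights, idelays, buf):
    # weights: dense (n, n); idelays: (nnz,) integer steps in CSR data order;
    # buf: (n, horizon) ring buffer of past state, one row per node
    horizon = buf.shape[1]
    w = scipy.sparse.csr_matrix(weights)
    # two delayed samples per connection, at t-d and t-d+1; the -horizon
    # offset keeps (t - idelays2) % horizon pointing into the ring buffer
    idelays2 = -horizon + np.c_[idelays, idelays - 1].T   # (2, nnz)
    def cfun_np(t):
        cx = buf[w.indices, (t - idelays2) % horizon] * w.data
        return np.add.reduceat(cx, w.indptr[:-1], axis=1)  # (2, n)
    return cfun_np
```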
diff --git a/tvb_kernels/test_nodes.py b/tvb_kernels/test_nodes.py
index 652ac6fa93..b6fa0d36d6 100644
--- a/tvb_kernels/test_nodes.py
+++ b/tvb_kernels/test_nodes.py
@@ -1,182 +1,88 @@
 import numpy as np
 import scipy.sparse
-import nodes
-import ctypes as ct
-
-# test connectivity
-np.random.seed(43)
-
-num_node = 90
-sparsity = 0.4
-
-dt = 0.1
-horizon = 256
-horizonm1 = horizon - 1
-assert ((horizon) & (horizonm1)) == 0
-
-weights, lengths = np.random.rand(2, num_node, num_node).astype('f')
-lengths[:] *= 0.8
-lengths *= (horizon*dt*0.8)
-zero_mask = weights < (1-sparsity)
-weights[zero_mask] = 0
-# so far so good
-
-# for the new approach, we need to invert this, so use CSC instead
-csc = scipy.sparse.csc_matrix(weights)
-
-# since numpy is row major, we need to transpose zero mask then ravel
-nnz_ravel = np.argwhere(~zero_mask.T.copy().ravel())[:, 0]
-
-# then we also need to extract them in the transposed order
-idelays = (lengths.T.copy().ravel()[nnz_ravel] / dt).astype('i') + 2
-
-# then we can test
-buf = np.r_[:1.0:1j*num_node*horizon].reshape(1, num_node, horizon).astype('f')*4.0
-buf = np.random.randn(1, num_node, horizon).astype('f')
-c_buf = buf.copy()
-c_cx1, c_cx2 = np.zeros((2, num_node), 'f')
-
-to_c = lambda a: a.ctypes.data_as(ct.POINTER(ct.c_float if a.dtype==np.float32 else ct.c_int))
-
-conn = nodes.connectivity(
-    num_node=num_node,
-    num_nonzero=nnz_ravel.size,
-    num_cvar=1,
-    horizon=horizon,
-    horizon_minus_1=horizonm1,
-    weights=to_c(csc.data),
-    indices=to_c(csc.indices),
-    indptr=to_c(csc.indptr),
-    idelays=to_c(idelays),
-    buf=to_c(c_buf),
-    cx1=to_c(c_cx1),
-    cx2=to_c(c_cx2),
-)
-p_conn = ct.byref(conn)
-
-def setup_cx_all2_csr():
-    csr = scipy.sparse.csr_matrix(weights)
-    nnz_ravel = np.argwhere(~zero_mask.ravel())[:, 0]
-    idelays = (lengths.ravel()[nnz_ravel] / dt).astype('i') + 2
+import enum
+
+from tvb_kernels import tvb_kernels
+
+
+class CxMode(enum.Enum):
+    CX_J = 1
+    CX_I = 2
+    CX_NOP = 3
+
+
+def base_setup(mode: CxMode = CxMode.CX_J):
+    np.random.seed(43)
+    num_node = 90
+    sparsity = 0.4
+    dt = 0.1
+    horizon = 256
+    horizonm1 = horizon - 1
+    assert ((horizon) & (horizonm1)) == 0
+    weights, lengths = np.random.rand(2, num_node, num_node).astype('f')
+    lengths[:] *= 0.8
+    lengths *= (horizon*dt*0.8)
+    zero_mask = weights < (1-sparsity)
+    weights[zero_mask] = 0
+    # so far so good
+
+    if mode == CxMode.CX_J:
+        # cx_j requires csc format, cx_i requires csr
+        spw = scipy.sparse.csc_matrix(weights)
+        # since numpy is row major, we need to transpose zero mask then ravel
+        nnz_ravel = np.argwhere(~zero_mask.T.copy().ravel())[:, 0]
+        # then we also need to extract them in the transposed order
+        idelays = (lengths.T.copy().ravel()[nnz_ravel] / dt).astype('i') + 2
+    elif mode == CxMode.CX_I:
+        spw = scipy.sparse.csr_matrix(weights)
+        nnz_ravel = np.argwhere(~zero_mask.ravel())[:, 0]
+        idelays = (lengths.ravel()[nnz_ravel] / dt).astype('i') + 2
+    else:
+        raise NotImplementedError
+
+    # then we can test
+    buf = np.r_[:1.0:1j*num_node *
+                horizon].reshape(1, num_node, horizon).astype('f')*4.0
+    buf = np.random.randn(1, num_node, horizon).astype('f')
     c_buf = buf.copy()
-    c_cx1, c_cx2 = np.zeros((2, num_node), 'f')
-    conn = nodes.connectivity(
-        num_node=num_node,
-        num_nonzero=nnz_ravel.size,
-        num_cvar=1,
-        horizon=horizon,
-        horizon_minus_1=horizonm1,
-        weights=to_c(csr.data),
-        indices=to_c(csr.indices),
-        indptr=to_c(csr.indptr),
-        idelays=to_c(idelays),
-        buf=to_c(c_buf),
-        cx1=to_c(c_cx1),
-        cx2=to_c(c_cx2),
+    c_cx1, c_cx2 = c_cx = np.zeros((2, num_node), 'f')
+
+    # create the C struct w/ our data
+    conn = tvb_kernels.Conn(
+        weights=spw.data,
+        indices=spw.indices,
+        indptr=spw.indptr,
+        idelays=idelays,
+        buf=c_buf,
+        cx=c_cx
     )
-    p_conn = ct.byref(conn)
-    return p_conn, c_cx1, locals()
-
-
-def setup_cx_all3_csr():
-    csr = scipy.sparse.csr_matrix(weights)
-    nnz_ravel = np.argwhere(~zero_mask.ravel())[:, 0]
-    idelays = (lengths.ravel()[nnz_ravel] / dt).astype('i') + 2
-    c_buf = buf.transpose(0,2,1).copy()
-    c_cx1, c_cx2 = np.zeros((2, num_node), 'f')
-    conn = nodes.connectivity(
-        num_node=num_node,
-        num_nonzero=nnz_ravel.size,
-        num_cvar=1,
-        horizon=horizon,
-        horizon_minus_1=horizonm1,
-        weights=to_c(csr.data),
-        indices=to_c(csr.indices),
-        indptr=to_c(csr.indptr),
-        idelays=to_c(idelays),
-        buf=to_c(c_buf),
-        cx1=to_c(c_cx1),
-        cx2=to_c(c_cx2),
-    )
-    p_conn = ct.byref(conn)
-    return p_conn, c_cx1, locals()
-
-p_conn2, c_cx12, cxall2work = setup_cx_all2_csr()
-p_conn3, c_cx13, cxall3work = setup_cx_all3_csr()
-
-# run them
-nodes.cx_all(p_conn, 15)
-nodes.cx_all2(p_conn2, 15)
-nodes.cx_all3(p_conn3, 15)
-
-
-# now a numpy version of similar
-cx1, cx2 = np.zeros((2, num_node), 'f')
-for j in range(num_node):
-    j0, j1 = csc.indptr[j:j+2]
-    p1 = (15 - idelays[j0:j1] + horizon) % horizon
-    cx1[csc.indices[j0:j1]] += csc.data[j0:j1]*buf[0, j, p1]
-
-# ah maybe a dense version
-idelays_dense = (lengths / dt).astype('i') + 2
-wxij = weights*buf[0,
-    np.tile(np.r_[:num_node], (num_node,1)),
-    (15 - idelays_dense + horizon) % horizon]
-cx13 = wxij.sum(axis=1)
-
-np.testing.assert_allclose(cx1, cx13, 2e-6, 1e-6) # the two numpy versions agree
-np.testing.assert_allclose(cx1, c_cx1, 2e-6, 1e-6)
-np.testing.assert_allclose(cx1, c_cx12, 2e-6, 1e-6)
-np.testing.assert_allclose(cx1, c_cx13, 2e-6, 1e-6)
-
-import pylab as pl
-pl.plot(cx1, c_cx1, '.')
-pl.plot(cx1, c_cx12, '.', alpha=0.4)
-pl.plot(cx1, c_cx13, '.', alpha=0.4)
-pl.savefig('nodes.jpg')
-
-# now benchmarking
-
-def run_sim_np(weights, lengths, zero_mask):
-    csr_weights = scipy.sparse.csr_matrix(weights)
-    idelays = (lengths[~zero_mask]/dt).astype('i')+2
-
-    idelays2 = -horizon + np.c_[idelays, idelays-1].T
-    assert idelays2.shape == (2, csr_weights.nnz)
-    buffer = np.zeros((num_node, horizon), 'f')
-    def cfun(t):
-        cx = buffer[csr_weights.indices, (t-idelays2) % horizon]
-        cx *= csr_weights.data
-        cx = np.add.reduceat(cx, csr_weights.indptr[:-1], axis=1)
-        return cx # (2, num_node)
-    return cfun
+    # impl simple numpy version
+    def make_cfun_np():
+        csr_weights = scipy.sparse.csr_matrix(weights)
+        idelays = (lengths[~zero_mask]/dt).astype('i')+2
+        idelays2 = -horizon + np.c_[idelays, idelays-1].T
+        assert idelays2.shape == (2, csr_weights.nnz)
+        buffer = buf[0]
+        def cfun_np(t):
+            cx = buffer[csr_weights.indices, (t-idelays2) % horizon]
+            cx *= csr_weights.data
+            cx = np.add.reduceat(cx, csr_weights.indptr[:-1], axis=1)
+            return cx  # (2, num_node)
+        return cfun_np
 
-cfun_np = run_sim_np(weights, lengths, zero_mask)
-cx = cfun_np(15)
+    return conn, c_cx, locals(), make_cfun_np()
 
-import time
-fs = [
-    lambda i: cfun_np(i),
-    lambda i: nodes.cx_all(p_conn, i),
-    lambda i: nodes.cx_all2(p_conn2, i),
-    lambda i: nodes.cx_all3(p_conn3, i),
-    lambda i: nodes.cx_all_nop(p_conn3, i),
-]
-tt = []
+def test_conn_kernels():
+    connj, cxj, dataj, cfun_np = base_setup(CxMode.CX_J)
+    conni, cxi, datai, _ = base_setup(CxMode.CX_I)
 
-for f in fs:
-    tik = time.time()
-    for i in range(100_000):
-        f(i)
-    tok = time.time()
-    tt.append(tok - tik)
+    for t in range(1024):
+        cx = cfun_np(t)
+        tvb_kernels.cx_j(connj, t)
+        tvb_kernels.cx_i(conni, t)
+        np.testing.assert_allclose(cx, cxj, 1e-4, 1e-6)
+        np.testing.assert_allclose(cx, cxi, 1e-4, 1e-6)
 
-ij_pct = (tt[1]-tt[2])/tt[2]*100
-ijT_pct = (tt[1]-tt[3])/tt[3]*100
-nop_pct = tt[4]/tt[1]*100
-print(f'np {tt[0]:0.3f} cj {tt[1]:0.3f}, ci {tt[2]:0.3f} ciT {tt[3]:0.3f}'
-      f' x {tt[0]/tt[1]:0.1f}'
-      f' ij% {ij_pct:0.2f} ijT%{ijT_pct:0.2f} overhead {nop_pct:0.2f}% ')

diff --git a/tvb_kernels/tvb_kernels.cpp b/tvb_kernels/tvb_kernels.cpp
new file mode 100644
index 0000000000..c015027ffa
--- /dev/null
+++ b/tvb_kernels/tvb_kernels.cpp
@@ -0,0 +1,49 @@
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+
+extern "C" {
+#include "tvbk.h"
+}
+
+namespace nb = nanobind;
+using namespace nb::literals;
+
+static void conn_init_from_arrays(
+    tvbk_conn *t,
+    /* add more type info for validation */
+    nb::ndarray<float, nb::ndim<1>> weights,
+    nb::ndarray<int, nb::ndim<1>> indices,
+    nb::ndarray<int, nb::ndim<1>> indptr,
+    nb::ndarray<int, nb::ndim<1>> idelays,
+    nb::ndarray<float, nb::ndim<3>, nb::c_contig> buf,
+    nb::ndarray<float, nb::ndim<2>, nb::c_contig> cx) {
+  new (t) tvbk_conn{/* TODO converting from unsigned long to int here */
+                    .num_node = static_cast<int>(buf.shape(1)),
+                    .num_nonzero = static_cast<int>(weights.shape(0)),
+                    .num_cvar = static_cast<int>(buf.shape(0)),
+                    .horizon = static_cast<int>(buf.shape(2)),
+                    .horizon_minus_1 = static_cast<int>(buf.shape(2) - 1)};
+  if (!((t->horizon & t->horizon_minus_1) == 0))
+    throw nb::value_error("horizon (buf.shape[2]) must be power of 2");
+  /* TODO more shape validation */
+  t->weights = weights.data();
+  t->indices = indices.data();
+  t->indptr = indptr.data();
+  t->idelays = idelays.data();
+  t->buf = buf.data();
+  /* TODO cx1, cx2 should be owned by model instance */
+  t->cx1 = cx.data();
+  t->cx2 = cx.data() + t->num_node;
+}
+
+NB_MODULE(tvb_kernels, m) {
+  nb::class_<tvbk_conn>(m, "Conn")
+      .def("__init__", &conn_init_from_arrays, "weights"_a, "indices"_a,
+           "indptr"_a, "idelays"_a, "buf"_a, "cx"_a);
+  /* TODO add accessors for arrays? */
+  /* TODO take ownership of arrays? */
+
+  m.def("cx_nop", &tvbk_cx_nop);
+  m.def("cx_j", &tvbk_cx_j);
+  m.def("cx_i", &tvbk_cx_i);
+}
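From Python, this binding is driven with plain NumPy arrays, as the rewritten test above does. A usage sketch along those lines (assuming `weights`, `idelays`, and `t` have been prepared as in `base_setup`):

```python
import numpy as np
import scipy.sparse
from tvb_kernels import tvb_kernels

num_node, horizon = 90, 256                  # horizon must be a power of 2
spw = scipy.sparse.csc_matrix(weights)       # cx_j expects CSC-ordered data
buf = np.zeros((1, num_node, horizon), 'f')  # (num_cvar, num_node, horizon)
cx = np.zeros((2, num_node), 'f')            # rows become cx1 and cx2
conn = tvb_kernels.Conn(weights=spw.data, indices=spw.indices,
                        indptr=spw.indptr, idelays=idelays,
                        buf=buf, cx=cx)
tvb_kernels.cx_j(conn, t)                    # accumulates into cx in place
```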
diff --git a/tvb_kernels/tvbk.h b/tvb_kernels/tvbk.h
new file mode 100644
index 0000000000..50fcdb3bff
--- /dev/null
+++ b/tvb_kernels/tvbk.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <stdint.h>
+
+typedef struct tvbk_params tvbk_params;
+
+struct tvbk_params {
+  const int count;
+  const float *values;
+};
+
+// conn model with csr format sparse connections & delay buffer
+typedef struct tvbk_conn tvbk_conn;
+
+struct tvbk_conn {
+  const int num_node;
+  const int num_nonzero;
+  const int num_cvar;
+  // horizon must be power of two
+  const int horizon;
+  const int horizon_minus_1;
+  const float *weights; // (num_nonzero,)
+  const int *indices;   // (num_nonzero,)
+  const int *indptr;    // (num_nodes+1,)
+  const int *idelays;   // (num_nonzero,)
+  float *buf;           // delay buffer (num_cvar, num_nodes, horizon)
+  float *cx1;
+  float *cx2;
+};
+
+/* not currently used */
+typedef struct tvbk_sim tvbk_sim;
+struct tvbk_sim {
+  // keep invariant stuff at the top, per sim stuff below
+  const int rng_seed;
+  const int num_node;
+  const int num_svar;
+  const int num_time;
+  const int num_params;
+  const int num_spatial_params;
+  const float dt;
+  const int oversample; // TODO "oversample" for stability,
+  const int num_skip;   // skip per output sample
+  float *z_scale;       // (num_svar), sigma*sqrt(dt)
+
+  // parameters
+  const tvbk_params global_params;
+  const tvbk_params spatial_params;
+
+  float *state_trace; // (num_time//num_skip, num_svar, num_nodes)
+  float *states;      // full states (num_svar, num_nodes)
+
+  const tvbk_conn conn;
+};
+
+void tvbk_cx_j(const tvbk_conn *c, uint32_t t);
+void tvbk_cx_i(const tvbk_conn *c, uint32_t t);
+void tvbk_cx_nop(const tvbk_conn *c, uint32_t t);

diff --git a/tvb_kernels/tvbk_conn.c b/tvb_kernels/tvbk_conn.c
new file mode 100644
index 0000000000..5a2f12a55f
--- /dev/null
+++ b/tvb_kernels/tvbk_conn.c
@@ -0,0 +1,50 @@
+#include "tvbk.h"
+
+static void cx_all_j(const tvbk_conn *c, uint32_t t, uint32_t j) {
+  float *const buf = c->buf + j * c->horizon;
+  uint32_t th = t + c->horizon;
+#pragma omp simd
+  for (uint32_t l = c->indptr[j]; l < c->indptr[j + 1]; l++) {
+    uint32_t i = c->indices[l];
+    float w = c->weights[l];
+    uint32_t d = c->idelays[l];
+    uint32_t p1 = (th - d) & c->horizon_minus_1;
+    uint32_t p2 = (th - d + 1) & c->horizon_minus_1;
+    c->cx1[i] += w * buf[p1];
+    c->cx2[i] += w * buf[p2];
+  }
+}
+
+void tvbk_cx_j(const tvbk_conn *c, uint32_t t) {
+#pragma omp simd
+  for (int i = 0; i < c->num_node; i++)
+    c->cx1[i] = c->cx2[i] = 0.0f;
+  for (int j = 0; j < c->num_node; j++)
+    cx_all_j(c, t, j);
+}
+
+void tvbk_cx_i(const tvbk_conn *c, uint32_t t) {
+  uint32_t th = t + c->horizon;
+#pragma omp simd
+  for (int i = 0; i < c->num_node; i++) {
+    float cx1 = 0.f, cx2 = 0.f;
+    for (uint32_t l = c->indptr[i]; l < c->indptr[i + 1]; l++) {
+      uint32_t j = c->indices[l];
+      float w = c->weights[l];
+      uint32_t d = c->idelays[l];
+      uint32_t p1 = (th - d) & c->horizon_minus_1;
+      uint32_t p2 = (th - d + 1) & c->horizon_minus_1;
+      cx1 += w * c->buf[j * c->horizon + p1];
+      cx2 += w * c->buf[j * c->horizon + p2];
+    }
+    c->cx1[i] = cx1;
+    c->cx2[i] = cx2;
+  }
+}
+
+void tvbk_cx_nop(const tvbk_conn *c, uint32_t t) {
+  (void)c;
+  (void)t;
+
+  return;
+}
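Both kernels index the delay ring buffer with a bitmask rather than a modulo: because `horizon` is a power of two, `x & (horizon - 1)` equals `x % horizon` for non-negative `x`, and adding `horizon` to `t - d` keeps the operand non-negative. A quick check of that identity in Python:

```python
horizon = 256  # power of two, per the tvbk_conn contract
for t in range(1024):
    for d in range(horizon):
        assert (t + horizon - d) & (horizon - 1) == (t - d) % horizon
```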
From 4230f28f046f66a69cda8bf3c575ff21e2f98086 Mon Sep 17 00:00:00 2001
From: Marmaduke Woodman
Date: Tue, 3 Sep 2024 15:14:55 +0200
Subject: [PATCH 05/12] rm old build script

---
 tvb_kernels/rebuild.sh | 11 -----------
 1 file changed, 11 deletions(-)
 delete mode 100644 tvb_kernels/rebuild.sh

diff --git a/tvb_kernels/rebuild.sh b/tvb_kernels/rebuild.sh
deleted file mode 100644
index f18d0eac54..0000000000
--- a/tvb_kernels/rebuild.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-rm -rf build
-
-cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
-
-pushd build
-make
-popd
-
-ctypesgen -l build/libtvbk.so src/nodes.h > nodes.py

From 2f63b53fe6d19ff02a867ae4c9de922cdfae4e14 Mon Sep 17 00:00:00 2001
From: Marmaduke Woodman
Date: Tue, 3 Sep 2024 17:58:33 +0200
Subject: [PATCH 06/12] test embind

---
 tvb_kernels/em/.gitignore      |  3 +++
 tvb_kernels/em/build.sh        |  3 +++
 tvb_kernels/em/tvb_kernels.cpp | 19 +++++++++++++++
 3 files changed, 25 insertions(+)
 create mode 100644 tvb_kernels/em/.gitignore
 create mode 100755 tvb_kernels/em/build.sh
 create mode 100644 tvb_kernels/em/tvb_kernels.cpp

diff --git a/tvb_kernels/em/.gitignore b/tvb_kernels/em/.gitignore
new file mode 100644
index 0000000000..f4481f02b0
--- /dev/null
+++ b/tvb_kernels/em/.gitignore
@@ -0,0 +1,3 @@
+compile_commands.json
+*.js
+*.wasm

diff --git a/tvb_kernels/em/build.sh b/tvb_kernels/em/build.sh
new file mode 100755
index 0000000000..9b15825680
--- /dev/null
+++ b/tvb_kernels/em/build.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+bear -- emcc -I../ -lembind -o tvb_kernels.js tvb_kernels.cpp ../tvbk_conn.c

diff --git a/tvb_kernels/em/tvb_kernels.cpp b/tvb_kernels/em/tvb_kernels.cpp
new file mode 100644
index 0000000000..5c849d6f8c
--- /dev/null
+++ b/tvb_kernels/em/tvb_kernels.cpp
@@ -0,0 +1,19 @@
+#include <emscripten/bind.h>
+
+using namespace emscripten;
+
+extern "C" {
+#include "tvbk.h"
+}
+
+class Conn {
+  const tvbk_conn conn;
+
+public:
+  Conn() : conn({0}) {}
+  void cx_nop(uint32_t t) { tvbk_cx_nop(&conn, t); }
+};
+
+EMSCRIPTEN_BINDINGS(tvb_kernels) {
+  class_<Conn>("Conn").constructor<>().function("cx_nop", &Conn::cx_nop);
+}

From 0719f58445331d5597a7c80b419ca33926523dde Mon Sep 17 00:00:00 2001
From: Marmaduke Woodman
Date: Wed, 4 Sep 2024 12:12:09 +0200
Subject: [PATCH 07/12] ctypes binding much simpler to debug

---
 .github/workflows/kernels.yml              | 22 +++--
 tvb_kernels/.gitignore                     |  6 ++
 tvb_kernels/CMakeLists.txt                 | 57 ------
 tvb_kernels/Makefile                       | 19 +++++
 tvb_kernels/{em/tvb_kernels.cpp => em.cpp} |  0
 tvb_kernels/em/.gitignore                  |  3 -
 tvb_kernels/em/build.sh                    |  3 -
 tvb_kernels/ghci-build.yml                 |  2 +-
 tvb_kernels/pyproject.toml                 | 15 ++--
 tvb_kernels/spack.yaml                     | 10 +--
 tvb_kernels/test_conn.py                   | 61 ++++++++++++
 tvb_kernels/test_nodes.py                  | 88 --------------------
 tvb_kernels/tvb_kernels.cpp                | 49 -----------
 tvb_kernels/tvb_kernels/__init__.py        | 97 ++++++++++++++++++++++
 tvb_kernels/tvbk.h                         |  6 +-
 15 files changed, 211 insertions(+), 227 deletions(-)
 delete mode 100644 tvb_kernels/CMakeLists.txt
 create mode 100644 tvb_kernels/Makefile
 rename tvb_kernels/{em/tvb_kernels.cpp => em.cpp} (100%)
 delete mode 100644 tvb_kernels/em/.gitignore
 delete mode 100755 tvb_kernels/em/build.sh
 create mode 100644 tvb_kernels/test_conn.py
 delete mode 100644 tvb_kernels/test_nodes.py
 delete mode 100644 tvb_kernels/tvb_kernels.cpp
 create mode 100644 tvb_kernels/tvb_kernels/__init__.py

diff --git a/.github/workflows/kernels.yml b/.github/workflows/kernels.yml
index d7adca3c7a..2928aa15f9 100644
--- a/.github/workflows/kernels.yml
+++ b/.github/workflows/kernels.yml
@@ -1,6 +1,11 @@
 name: build & test kernel library
 on: [push]
 
+# only run latest push
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   pip-tvbk:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: true
       matrix:
         python-version: [ "3.10", ]
 
     steps:
 
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
 
       - name: set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
         id: setPy
         with:
           python-version: ${{ matrix.python-version }}
 
       - name: install tools and dependencies
-        run: python3 -m pip install -U setuptools pip wheel pytest scipy
+        run: python3 -m pip install -U setuptools pip wheel pytest scipy ctypesgen
 
-      - name: build
-        run: cd tvb_kernels; pip install .
+      - name: build & test
+        run: |
+          cd tvb_kernels
+          make
 
-      - name: test
-        run: cd tvb_kernels; python -m pytest
 
   spack-tvbk:
     runs-on: ubuntu-latest
     steps:
 
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
 
       - name: setup spack
         uses: spack/setup-spack@v2
         with:
           ref: develop
           buildcache: true
           color: true
           path: spack
 
       - name: install deps w/ spack
         shell: spack-bash {0}
         run: |
           cd tvb_kernels
           spack -e . concretize
           spack -e . install
           spack env activate .
           spack env status
-          pip install pytest
+          pip install pytest ctypesgen
 
-      - name: build nanobound library
+      - name: build & test
         shell: spack-bash {0}
         run: |
           cd tvb_kernels
           spack env activate .
-          pip install .
+          make
           pytest

diff --git a/tvb_kernels/.gitignore b/tvb_kernels/.gitignore
index 7e0eaaea18..e951c1b984 100644
--- a/tvb_kernels/.gitignore
+++ b/tvb_kernels/.gitignore
@@ -5,3 +5,9 @@ build
 .spack-env
 spack.lock
 *.whl
+*.so
+*.o
+tvb_kernels/_ctg_tvbk.py
+tvbk.js
+tvbk.wasm
+compile_commands.json

diff --git a/tvb_kernels/CMakeLists.txt b/tvb_kernels/CMakeLists.txt
deleted file mode 100644
index 1ff9f87590..0000000000
--- a/tvb_kernels/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-cmake_minimum_required(VERSION 3.18...3.27)
-project(tvb_kernels LANGUAGES C CXX)
-
-if (NOT SKBUILD)
-  message(WARNING "\
-  This CMake file is meant to be executed using 'scikit-build-core'.
-  Running it directly will almost certainly not produce the desired
-  result. If you are a user trying to install this package, use the
-  command below, which will install all necessary build dependencies,
-  compile the package in an isolated environment, and then install it.
-  =====================================================================
-   $ pip install .
-  =====================================================================
-  If you are a software developer, and this is your own package, then
-  it is usually much more efficient to install the build dependencies
-  in your environment once and use the following command that avoids
-  a costly creation of a new virtual environment at every compilation:
-  =====================================================================
-   $ pip install nanobind scikit-build-core[pyproject]
-   $ pip install --no-build-isolation -ve .
-  =====================================================================
-  You may optionally add -Ceditable.rebuild=true to auto-rebuild when
-  the package is imported. Otherwise, you need to rerun the above
-  after editing C++ files.")
-endif()
-
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-add_compile_options(-std=c++17 -fopenmp-simd -mavx2 -march=native -mtune=native)
-
-find_package(Python 3.8
-  REQUIRED COMPONENTS Interpreter Development.Module
-)
-
-if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() - -# Detect the installed nanobind package and import it into CMake -execute_process( - COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir - OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR) -list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") -find_package(nanobind CONFIG REQUIRED) - -nanobind_add_module(tvb_kernels tvb_kernels.cpp tvbk_conn.c) - -nanobind_add_stub( - tvb_kernels_stub - MODULE tvb_kernels - OUTPUT tvb_kernels.pyi - PYTHON_PATH $ - DEPENDS tvb_kernels - MARKER_FILE py.typed -) - -install(TARGETS tvb_kernels LIBRARY DESTINATION tvb_kernels) diff --git a/tvb_kernels/Makefile b/tvb_kernels/Makefile new file mode 100644 index 0000000000..c335839899 --- /dev/null +++ b/tvb_kernels/Makefile @@ -0,0 +1,19 @@ +# run `bear -- make -B -j` to get compile_commands.json + +CC = gcc +CFLAGS = -msse4.2 -O3 -funroll-loops -fopenmp-simd +SRC = tvbk_conn.c +OBJ = $(patsubst %.c,%.o,$(SRC)) +SO = libtvbk.so +ctg = tvb_kernels/_ctg_tvbk.py + +all: $(SO) $(ctg) + +$(SO) : $(OBJ) + $(CC) -shared $< -o $@ + +$(ctg): tvbk.h + ctypesgen -l $(SO) $< > $@ + +tvbk.wasm: em.cpp tvbk.h $(SRC) + emcc -lembind -o tvbk.js em.cpp $(SRC) diff --git a/tvb_kernels/em/tvb_kernels.cpp b/tvb_kernels/em.cpp similarity index 100% rename from tvb_kernels/em/tvb_kernels.cpp rename to tvb_kernels/em.cpp diff --git a/tvb_kernels/em/.gitignore b/tvb_kernels/em/.gitignore deleted file mode 100644 index f4481f02b0..0000000000 --- a/tvb_kernels/em/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -compile_commands.json -*.js -*.wasm diff --git a/tvb_kernels/em/build.sh b/tvb_kernels/em/build.sh deleted file mode 100755 index 9b15825680..0000000000 --- a/tvb_kernels/em/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -bear -- emcc -I../ -lembind -o tvb_kernels.js tvb_kernels.cpp ../tvbk_conn.c diff --git a/tvb_kernels/ghci-build.yml b/tvb_kernels/ghci-build.yml index 3d02e1e42d..0592cd0e6d 120000 --- a/tvb_kernels/ghci-build.yml +++ b/tvb_kernels/ghci-build.yml @@ -1 +1 @@ -../.github/workflows/build.yml \ No newline at end of file +../.github/workflows/kernels.yml \ No newline at end of file diff --git a/tvb_kernels/pyproject.toml b/tvb_kernels/pyproject.toml index de34153a36..6526085bd2 100644 --- a/tvb_kernels/pyproject.toml +++ b/tvb_kernels/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["scikit-build-core >=0.4.3", "nanobind >=1.3.2", "typing_extensions"] -build-backend = "scikit_build_core.build" +requires = ["hatchling", "setuptools>=45", "setuptools_scm[toml]>=6.2"] +build-backend = "hatchling.build" [project] name = "tvb_kernels" @@ -17,7 +17,10 @@ dependencies = [ "numpy" ] [project.urls] Homepage = "https://github.com/the-virtual-brain/tvb-root" -[tool.scikit-build] -minimum-version = "0.4" -build-dir = "build/{wheel_tag}" -wheel.py-api = "cp312" +[tool.cibuildwheel] +build-verbosity = 1 +test-command = "pytest" +test-requires = "pytest" + +skip = ["pp*"] # don't build pypy wheels +archs = ["auto"] diff --git a/tvb_kernels/spack.yaml b/tvb_kernels/spack.yaml index 5eb851f7e6..bcaa0ccb8f 100644 --- a/tvb_kernels/spack.yaml +++ b/tvb_kernels/spack.yaml @@ -1,16 +1,10 @@ -# This is a Spack Environment file. -# -# It describes a set of packages to be installed, along with -# configuration settings. 
spack: - # add package specs to the `specs` list + specs: - py-numpy - - py-nanobind - py-scipy - - py-matplotlib - py-pip - - cmake + concretizer: unify: true diff --git a/tvb_kernels/test_conn.py b/tvb_kernels/test_conn.py new file mode 100644 index 0000000000..9a22e2f1bd --- /dev/null +++ b/tvb_kernels/test_conn.py @@ -0,0 +1,61 @@ +import numpy as np +import scipy.sparse +import enum + +import tvb_kernels + + +def rand_weights(seed=43, sparsity=0.4, dt=0.1, horizon=256, num_node=90): + np.random.seed(seed) + horizonm1 = horizon - 1 + assert ((horizon) & (horizonm1)) == 0 + weights, lengths = np.random.rand(2, num_node, num_node).astype('f') + lengths[:] *= 0.8 + lengths *= (horizon*dt*0.8) + zero_mask = weights < (1-sparsity) + weights[zero_mask] = 0 + return weights, lengths + + +def base_setup(mode: tvb_kernels.CxMode = tvb_kernels.CxMode.CX_J): + cv = 1.0 + dt = 0.1 + num_node = 90 + horizon = 256 + weights, lengths = rand_weights(num_node=num_node, horizon=horizon, dt=dt) + buf = tvb_kernels.Buf(num_node, horizon) + cx = tvb_kernels.Cx(num_node) + conn = tvb_kernels.Conn(buf, cx, dt, cv, weights, lengths, mode=mode) + + # then we can test + buf.buf[:] = np.r_[:1.0:1j*num_node * + horizon].reshape(num_node, horizon).astype('f')*4.0 + + # impl simple numpy version + def make_cfun_np(): + zero_mask = weights == 0 + csr_weights = scipy.sparse.csr_matrix(weights) + idelays = (lengths[~zero_mask]/cv/dt).astype('i')+2 + idelays2 = -horizon + np.c_[idelays, idelays-1].T + assert idelays2.shape == (2, csr_weights.nnz) + + def cfun_np(t): + cx = buf.buf[csr_weights.indices, (t-idelays2) % horizon] + cx *= csr_weights.data + cx = np.add.reduceat(cx, csr_weights.indptr[:-1], axis=1) + return cx # (2, num_node) + return cfun_np + + return conn, cx, buf, make_cfun_np() + + +def test_conn_kernels(): + connj, cxj, _, cfun_np = base_setup(tvb_kernels.CxMode.CX_J) + conni, cxi, _, _ = base_setup(tvb_kernels.CxMode.CX_I) + + for t in range(1024): + cx = cfun_np(t) + connj(t) + conni(t) + np.testing.assert_allclose(cx, cxj.cx, 1e-4, 1e-6) + np.testing.assert_allclose(cx, cxi.cx, 1e-4, 1e-6) diff --git a/tvb_kernels/test_nodes.py b/tvb_kernels/test_nodes.py deleted file mode 100644 index b6fa0d36d6..0000000000 --- a/tvb_kernels/test_nodes.py +++ /dev/null @@ -1,88 +0,0 @@ -import numpy as np -import scipy.sparse -import enum - -from tvb_kernels import tvb_kernels - - -class CxMode(enum.Enum): - CX_J = 1 - CX_I = 2 - CX_NOP = 3 - - -def base_setup(mode: CxMode = CxMode.CX_J): - np.random.seed(43) - num_node = 90 - sparsity = 0.4 - dt = 0.1 - horizon = 256 - horizonm1 = horizon - 1 - assert ((horizon) & (horizonm1)) == 0 - weights, lengths = np.random.rand(2, num_node, num_node).astype('f') - lengths[:] *= 0.8 - lengths *= (horizon*dt*0.8) - zero_mask = weights < (1-sparsity) - weights[zero_mask] = 0 - # so far so good - - if mode == CxMode.CX_J: - # cx_j requires csc format, cx_i requires csr - spw = scipy.sparse.csc_matrix(weights) - # since numpy is row major, we need to transpose zero mask then ravel - nnz_ravel = np.argwhere(~zero_mask.T.copy().ravel())[:, 0] - # then we also need to extract them in the transposed order - idelays = (lengths.T.copy().ravel()[nnz_ravel] / dt).astype('i') + 2 - elif mode == CxMode.CX_I: - spw = scipy.sparse.csr_matrix(weights) - nnz_ravel = np.argwhere(~zero_mask.ravel())[:, 0] - idelays = (lengths.ravel()[nnz_ravel] / dt).astype('i') + 2 - else: - raise NotImplementedError - - # then we can test - buf = np.r_[:1.0:1j*num_node * - horizon].reshape(1, num_node, 
horizon).astype('f')*4.0
-    buf = np.random.randn(1, num_node, horizon).astype('f')
-    c_buf = buf.copy()
-    c_cx1, c_cx2 = c_cx = np.zeros((2, num_node), 'f')
-
-    # create the C struct w/ our data
-    conn = tvb_kernels.Conn(
-        weights=spw.data,
-        indices=spw.indices,
-        indptr=spw.indptr,
-        idelays=idelays,
-        buf=c_buf,
-        cx=c_cx
-    )
-
-    # impl simple numpy version
-    def make_cfun_np():
-        csr_weights = scipy.sparse.csr_matrix(weights)
-        idelays = (lengths[~zero_mask]/dt).astype('i')+2
-        idelays2 = -horizon + np.c_[idelays, idelays-1].T
-        assert idelays2.shape == (2, csr_weights.nnz)
-        buffer = buf[0]
-
-        def cfun_np(t):
-            cx = buffer[csr_weights.indices, (t-idelays2) % horizon]
-            cx *= csr_weights.data
-            cx = np.add.reduceat(cx, csr_weights.indptr[:-1], axis=1)
-            return cx  # (2, num_node)
-        return cfun_np
-
-    return conn, c_cx, locals(), make_cfun_np()
-
-
-def test_conn_kernels():
-    connj, cxj, dataj, cfun_np = base_setup(CxMode.CX_J)
-    conni, cxi, datai, _ = base_setup(CxMode.CX_I)
-
-    for t in range(1024):
-        cx = cfun_np(t)
-        tvb_kernels.cx_j(connj, t)
-        tvb_kernels.cx_i(conni, t)
-        np.testing.assert_allclose(cx, cxj, 1e-4, 1e-6)
-        np.testing.assert_allclose(cx, cxi, 1e-4, 1e-6)
-
diff --git a/tvb_kernels/tvb_kernels.cpp b/tvb_kernels/tvb_kernels.cpp
deleted file mode 100644
index c015027ffa..0000000000
--- a/tvb_kernels/tvb_kernels.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <nanobind/nanobind.h>
-#include <nanobind/ndarray.h>
-
-extern "C" {
-#include "tvbk.h"
-}
-
-namespace nb = nanobind;
-using namespace nb::literals;
-
-static void conn_init_from_arrays(
-    tvbk_conn *t,
-    /* add more type info for validation */
-    nb::ndarray> weights,
-    nb::ndarray> indices,
-    nb::ndarray> indptr,
-    nb::ndarray> idelays,
-    nb::ndarray, nb::c_contig> buf,
-    nb::ndarray, nb::c_contig> cx) {
-  new (t) tvbk_conn{/* TODO converting from unsigned long to int here */
-                    .num_node = static_cast<int>(buf.shape(1)),
-                    .num_nonzero = static_cast<int>(weights.shape(0)),
-                    .num_cvar = static_cast<int>(buf.shape(0)),
-                    .horizon = static_cast<int>(buf.shape(2)),
-                    .horizon_minus_1 = static_cast<int>(buf.shape(2) - 1)};
-  if (!((t->horizon & t->horizon_minus_1) == 0))
-    throw nb::value_error("horizon (buf.shape[2]) must be power of 2");
-  /* TODO more shape validation */
-  t->weights = weights.data();
-  t->indices = indices.data();
-  t->indptr = indptr.data();
-  t->idelays = idelays.data();
-  t->buf = buf.data();
-  /* TODO cx1, cx2 should be owned by model instance */
-  t->cx1 = cx.data();
-  t->cx2 = cx.data() + t->num_node;
-}
-
-NB_MODULE(tvb_kernels, m) {
-  nb::class_<tvbk_conn>(m, "Conn")
-      .def("__init__", &conn_init_from_arrays, "weights"_a, "indices"_a,
-           "indptr"_a, "idelays"_a, "buf"_a, "cx"_a);
-  /* TODO add accessors for arrays? */
-  /* TODO take ownership of arrays? */
-
-  m.def("cx_nop", &tvbk_cx_nop);
-  m.def("cx_j", &tvbk_cx_j);
-  m.def("cx_i", &tvbk_cx_i);
-}
diff --git a/tvb_kernels/tvb_kernels/__init__.py b/tvb_kernels/tvb_kernels/__init__.py
new file mode 100644
index 0000000000..310e33366d
--- /dev/null
+++ b/tvb_kernels/tvb_kernels/__init__.py
@@ -0,0 +1,97 @@
+import enum
+import numpy as np
+import ctypes
+
+# TODO mv functionality used to C
+import scipy.sparse
+
+from . import _tvbk
+from . 
import _ctg_tvbk + + +def _to_ct(a): + val_type = { + 'float32': ctypes.c_float, + 'uint32': ctypes.c_uint32, + 'int32': ctypes.c_int32 + }[a.dtype.name] + return a.ctypes.data_as(ctypes.POINTER(val_type)) + + +class CxMode(enum.Enum): + CX_J = 1 + CX_I = 2 + CX_NOP = 3 + + +class Cx: + def __init__(self, num_node): + self.cx = np.zeros((2, num_node), 'f') + + +class Buf: + def __init__(self, num_node, horizon): + self.buf = np.zeros((num_node, horizon), 'f') + + +class Conn: + + def __init__(self, buf, cx, dt, cv, weights, lengths, mode: CxMode = CxMode.CX_J): + self.mode = mode + assert weights.shape[0] == weights.shape[1] + assert weights.shape == lengths.shape + self.num_node = weights.shape[0] + zero_mask = weights == 0 + if mode == CxMode.CX_J: + # cx_j requires csc format, cx_i requires csr + spw = scipy.sparse.csc_matrix(weights) + # since numpy is row major, we need to transpose zero mask then ravel + nnz_ravel = np.argwhere(~zero_mask.T.copy().ravel())[:, 0] + # then we also need to extract them in the transposed order + idelays = (lengths.T.copy().ravel()[ + nnz_ravel] / cv / dt).astype('i') + 2 + elif mode == CxMode.CX_I: + spw = scipy.sparse.csr_matrix(weights) + nnz_ravel = np.argwhere(~zero_mask.ravel())[:, 0] + idelays = (lengths.ravel()[nnz_ravel] / cv / dt).astype('i') + 2 + else: + raise NotImplementedError + + # retain references to arrays so they aren't collected + self.idelays = idelays.astype(np.uint32) + self.weights = spw.data.astype(np.float32) + self.indices = spw.indices.astype(np.uint32) + self.indptr = spw.indptr.astype(np.uint32) + self._buf = buf.buf + self._cx1, self._cx2 = cx.cx + self._conn = _ctg_tvbk.tvbk_conn( + num_node=weights.shape[0], + num_nonzero=self.indices.size, + num_cvar=1, + horizon=buf.buf.shape[1], + # TODO ridonkulus + horizon_minus_1=buf.buf.shape[1] - 1, + weights=_to_ct(self.weights), + indices=_to_ct(self.indices), + indptr=_to_ct(self.indptr), + idelays=_to_ct(self.idelays), + buf=_to_ct(self._buf), + cx1=_to_ct(self._cx1), + cx2=_to_ct(self._cx2) + ) + # self._conn = _tvbk.Conn( + # weights=self.weights, + # indices=self.indices, + # indptr=self.indptr, + # idelays=self.idelays, + # buf=self._buf, + # cx=self._cx, + # ) + + def __call__(self, t): + if self.mode == CxMode.CX_J: + # _tvbk.cx_j(self._conn, t) + _ctg_tvbk.tvbk_cx_j(self._conn, t) + else: + # _tvbk.cx_i(self._conn, t) + _ctg_tvbk.tvbk_cx_i(self._conn, t) diff --git a/tvb_kernels/tvbk.h b/tvb_kernels/tvbk.h index 50fcdb3bff..840ba78f29 100644 --- a/tvb_kernels/tvbk.h +++ b/tvb_kernels/tvbk.h @@ -20,9 +20,9 @@ struct tvbk_conn { const int horizon; const int horizon_minus_1; const float *weights; // (num_nonzero,) - const int *indices; // (num_nonzero,) - const int *indptr; // (num_nodes+1,) - const int *idelays; // (num_nonzero,) + const uint32_t *indices; // (num_nonzero,) + const uint32_t *indptr; // (num_nodes+1,) + const uint32_t *idelays; // (num_nonzero,) float *buf; // delay buffer (num_cvar, num_nodes, horizon) float *cx1; float *cx2; From 69e6edd2fad09f58d7932914da92d1dc466ab06c Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 4 Sep 2024 13:50:13 +0200 Subject: [PATCH 08/12] disable most workflows for this branch for now --- .github/workflows/build.yml | 4 +++- .github/workflows/contrib-tests.yml | 4 +++- .github/workflows/kernels.yml | 4 +++- .github/workflows/notebooks.yml | 6 ++++-- .github/workflows/pg-tests.yml | 4 +++- .github/workflows/win-tests.yml | 4 +++- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git 
a/.github/workflows/build.yml b/.github/workflows/build.yml index f502c2c6f0..dbcdabee66 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,7 @@ name: Test Py -on: [push] +on: + push: + branches-ignore: [tvb_kernels] jobs: build: diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 815b30f890..61d4b99076 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -1,5 +1,7 @@ name: Check tvb-contrib -on: [push] +on: + push: + branches-ignore: [tvb_kernels] jobs: build: diff --git a/.github/workflows/kernels.yml b/.github/workflows/kernels.yml index 2928aa15f9..16eef4f18b 100644 --- a/.github/workflows/kernels.yml +++ b/.github/workflows/kernels.yml @@ -1,5 +1,7 @@ name: build & test kernel library -on: [push] +on: + push: + branches: [tvb_kernel] # only run latest push concurrency: diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index dcb1c9ef41..6795c76876 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -1,5 +1,7 @@ name: Test Notebooks -on: [push] +on: + push: + branches-ignore: [tvb_kernels] jobs: build: @@ -59,4 +61,4 @@ jobs: - name: run notebooks run: | - python ./tvb_build/notebook_runner.py ./tvb_documentation/demos \ No newline at end of file + python ./tvb_build/notebook_runner.py ./tvb_documentation/demos diff --git a/.github/workflows/pg-tests.yml b/.github/workflows/pg-tests.yml index ac9f074465..8f1a9ee067 100644 --- a/.github/workflows/pg-tests.yml +++ b/.github/workflows/pg-tests.yml @@ -1,5 +1,7 @@ name: Test PG -on: [push] +on: + push: + branches-ignore: [tvb_kernels] jobs: build: diff --git a/.github/workflows/win-tests.yml b/.github/workflows/win-tests.yml index f4e087dad3..4993a784f7 100644 --- a/.github/workflows/win-tests.yml +++ b/.github/workflows/win-tests.yml @@ -1,5 +1,7 @@ name: Test Win -on: [push] +on: + push: + branches-ignore: [tvb_kernels] jobs: build: From 527702a51d9832079cd813ffb5e4e44c04fab86f Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 4 Sep 2024 13:59:56 +0200 Subject: [PATCH 09/12] clean --- tvb_kernels/tvb_kernels/__init__.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tvb_kernels/tvb_kernels/__init__.py b/tvb_kernels/tvb_kernels/__init__.py index 310e33366d..fcaef0d7e8 100644 --- a/tvb_kernels/tvb_kernels/__init__.py +++ b/tvb_kernels/tvb_kernels/__init__.py @@ -5,7 +5,6 @@ # TODO mv functionality used to C import scipy.sparse -from . import _tvbk from . 
import _ctg_tvbk
 
@@ -79,19 +78,9 @@ def __init__(self, buf, cx, dt, cv, weights, lengths, mode: CxMode = CxMode.CX_J
             cx1=_to_ct(self._cx1),
             cx2=_to_ct(self._cx2)
         )
-        # self._conn = _tvbk.Conn(
-        #     weights=self.weights,
-        #     indices=self.indices,
-        #     indptr=self.indptr,
-        #     idelays=self.idelays,
-        #     buf=self._buf,
-        #     cx=self._cx,
-        # )
 
     def __call__(self, t):
         if self.mode == CxMode.CX_J:
-            # _tvbk.cx_j(self._conn, t)
             _ctg_tvbk.tvbk_cx_j(self._conn, t)
-        else:
-            # _tvbk.cx_i(self._conn, t)
+        if self.mode == CxMode.CX_I:
             _ctg_tvbk.tvbk_cx_i(self._conn, t)

From 033dbac12050345a53621fcc33cb97e3f3cf2444 Mon Sep 17 00:00:00 2001
From: Marmaduke Woodman
Date: Wed, 4 Sep 2024 14:40:14 +0200
Subject: [PATCH 10/12] let hatchling invoke make

---
 .github/workflows/kernels.yml     |  7 ++++---
 tvb_kernels/.gitignore            |  1 +
 tvb_kernels/pyproject.toml        | 22 +++++++++++++++++++++-
 tvb_kernels/{ => src}/Makefile    |  2 +-
 tvb_kernels/{ => src}/em.cpp      |  0
 tvb_kernels/{ => src}/tvbk.h      |  0
 tvb_kernels/{ => src}/tvbk_conn.c |  0
 7 files changed, 27 insertions(+), 5 deletions(-)
 rename tvb_kernels/{ => src}/Makefile (91%)
 rename tvb_kernels/{ => src}/em.cpp (100%)
 rename tvb_kernels/{ => src}/tvbk.h (100%)
 rename tvb_kernels/{ => src}/tvbk_conn.c (100%)

diff --git a/.github/workflows/kernels.yml b/.github/workflows/kernels.yml
index 16eef4f18b..6aed49d57c 100644
--- a/.github/workflows/kernels.yml
+++ b/.github/workflows/kernels.yml
@@ -29,12 +29,13 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: install tools and dependencies
-        run: python3 -m pip install -U setuptools pip wheel pytest scipy ctypesgen
+        run: python3 -m pip install -U pip pytest scipy ctypesgen
 
       - name: build & test
         run: |
           cd tvb_kernels
-          make
+          pip install .
+          pytest
 
 
   spack-tvbk:
@@ -68,7 +69,7 @@
       run: |
         cd tvb_kernels
         spack env activate .
-        make
+        pip install . 
pytest # https://github.com/spack/setup-spack?tab=readme-ov-file#example-caching-your-own-binaries-for-public-repositories diff --git a/tvb_kernels/.gitignore b/tvb_kernels/.gitignore index e951c1b984..cf8713b9c5 100644 --- a/tvb_kernels/.gitignore +++ b/tvb_kernels/.gitignore @@ -11,3 +11,4 @@ tvb_kernels/_ctg_tvbk.py tvbk.js tvbk.wasm compile_commands.json +_ctg_*.py diff --git a/tvb_kernels/pyproject.toml b/tvb_kernels/pyproject.toml index 6526085bd2..ba032bec15 100644 --- a/tvb_kernels/pyproject.toml +++ b/tvb_kernels/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["hatchling", "setuptools>=45", "setuptools_scm[toml]>=6.2"] +requires = ["hatchling", "hatch-build-scripts", "ctypesgen"] build-backend = "hatchling.build" [project] @@ -24,3 +24,23 @@ test-requires = "pytest" skip = ["pp*"] # don't build pypy wheels archs = ["auto"] + +# [tool.hatch.version] +# path = "tvb_kernels/version.py" + +[tool.hatch.build] +include = [ + "tvb_kernels/*.py", +] + +[[tool.hatch.build.hooks.build-scripts.scripts]] +work_dir = "src" +out_dir = "tvb_kernels" +commands = [ + "make libtvbk.so", + "ctypesgen -l libtvbk.so tvbk.h > _ctg_tvbk.py", +] +artifacts = [ + "libtvbk.so", + "_ctg_tvbk.py", +] diff --git a/tvb_kernels/Makefile b/tvb_kernels/src/Makefile similarity index 91% rename from tvb_kernels/Makefile rename to tvb_kernels/src/Makefile index c335839899..df6afaecdf 100644 --- a/tvb_kernels/Makefile +++ b/tvb_kernels/src/Makefile @@ -5,7 +5,7 @@ CFLAGS = -msse4.2 -O3 -funroll-loops -fopenmp-simd SRC = tvbk_conn.c OBJ = $(patsubst %.c,%.o,$(SRC)) SO = libtvbk.so -ctg = tvb_kernels/_ctg_tvbk.py +ctg = ../tvb_kernels/_ctg_tvbk.py all: $(SO) $(ctg) diff --git a/tvb_kernels/em.cpp b/tvb_kernels/src/em.cpp similarity index 100% rename from tvb_kernels/em.cpp rename to tvb_kernels/src/em.cpp diff --git a/tvb_kernels/tvbk.h b/tvb_kernels/src/tvbk.h similarity index 100% rename from tvb_kernels/tvbk.h rename to tvb_kernels/src/tvbk.h diff --git a/tvb_kernels/tvbk_conn.c b/tvb_kernels/src/tvbk_conn.c similarity index 100% rename from tvb_kernels/tvbk_conn.c rename to tvb_kernels/src/tvbk_conn.c From 34725dbadb222eabe4bd182049d44e7696c3ac83 Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 4 Sep 2024 14:53:15 +0200 Subject: [PATCH 11/12] fix --- .github/workflows/kernels.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/kernels.yml b/.github/workflows/kernels.yml index 6aed49d57c..c7a32ddfa1 100644 --- a/.github/workflows/kernels.yml +++ b/.github/workflows/kernels.yml @@ -1,7 +1,6 @@ name: build & test kernel library on: push: - branches: [tvb_kernel] # only run latest push concurrency: From eb4fe8ab2f3e4fa950e555808bf2654d5c38f5aa Mon Sep 17 00:00:00 2001 From: Marmaduke Woodman Date: Wed, 4 Sep 2024 16:29:37 +0200 Subject: [PATCH 12/12] refactor buf out of conn --- tvb_kernels/pyproject.toml | 2 +- tvb_kernels/src/tvbk.h | 40 +++++++++++++++++----------- tvb_kernels/src/tvbk_conn.c | 41 ++++++++++++++++------------- tvb_kernels/test_conn.py | 29 ++++++++++---------- tvb_kernels/tvb_kernels/__init__.py | 34 +++++++++++------------- 5 files changed, 78 insertions(+), 68 deletions(-) diff --git a/tvb_kernels/pyproject.toml b/tvb_kernels/pyproject.toml index ba032bec15..a064f9445a 100644 --- a/tvb_kernels/pyproject.toml +++ b/tvb_kernels/pyproject.toml @@ -37,7 +37,7 @@ include = [ work_dir = "src" out_dir = "tvb_kernels" commands = [ - "make libtvbk.so", + "make -B libtvbk.so", "ctypesgen -l libtvbk.so tvbk.h > _ctg_tvbk.py", ] artifacts = [ diff --git 
a/tvb_kernels/src/tvbk.h b/tvb_kernels/src/tvbk.h
index 840ba78f29..21963af00b 100644
--- a/tvb_kernels/src/tvbk.h
+++ b/tvb_kernels/src/tvbk.h
@@ -5,27 +5,35 @@
 typedef struct tvbk_params tvbk_params;
 
 struct tvbk_params {
-  const int count;
-  const float *values;
+  const uint32_t count;
+  const float *const values;
+};
+
+/* an afferent coupling buffer into which the cx functions
+   accumulate their results */
+typedef struct tvbk_cx tvbk_cx;
+
+struct tvbk_cx {
+  /* values for 1st and 2nd Heun stage respectively.
+     each shaped (num_node, ) */
+  float *const cx1;
+  float *const cx2;
+  /* delay buffer (num_node, num_time)*/
+  float *const buf;
+  const uint32_t num_node;
+  const uint32_t num_time; // horizon, power of 2
 };
 
-// conn model with csr format sparse connections & delay buffer
 typedef struct tvbk_conn tvbk_conn;
 
 struct tvbk_conn {
   const int num_node;
   const int num_nonzero;
   const int num_cvar;
-  // horizon must be power of two
-  const int horizon;
-  const int horizon_minus_1;
-  const float *weights;    // (num_nonzero,)
-  const uint32_t *indices; // (num_nonzero,)
-  const uint32_t *indptr;  // (num_nodes+1,)
-  const uint32_t *idelays; // (num_nonzero,)
-  float *buf; // delay buffer (num_cvar, num_nodes, horizon)
-  float *cx1;
-  float *cx2;
+  const float *const weights;    // (num_nonzero,)
+  const uint32_t *const indices; // (num_nonzero,)
+  const uint32_t *const indptr;  // (num_nodes+1,)
+  const uint32_t *const idelays; // (num_nonzero,)
 };
 
 /* not currently used */
@@ -53,6 +61,6 @@ struct tvbk_sim {
   const tvbk_conn conn;
 };
 
-void tvbk_cx_j(const tvbk_conn *c, uint32_t t);
-void tvbk_cx_i(const tvbk_conn *c, uint32_t t);
-void tvbk_cx_nop(const tvbk_conn *c, uint32_t t);
+void tvbk_cx_j(const tvbk_cx *cx, const tvbk_conn *conn, uint32_t t);
+void tvbk_cx_i(const tvbk_cx *cx, const tvbk_conn *conn, uint32_t t);
+void tvbk_cx_nop(const tvbk_cx *cx, const tvbk_conn *conn, uint32_t t);
diff --git a/tvb_kernels/src/tvbk_conn.c b/tvb_kernels/src/tvbk_conn.c
index 5a2f12a55f..8b1ee37b3a 100644
--- a/tvb_kernels/src/tvbk_conn.c
+++ b/tvb_kernels/src/tvbk_conn.c
@@ -1,30 +1,33 @@
 #include "tvbk.h"
 
-static void cx_all_j(const tvbk_conn *c, uint32_t t, uint32_t j) {
-  float *const buf = c->buf + j * c->horizon;
-  uint32_t th = t + c->horizon;
+static void cx_all_j(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t,
+                     uint32_t j) {
+  uint32_t wrap_mask = cx->num_time - 1; // assume num_time is power of 2
+  float *const buf = cx->buf + j * cx->num_time;
+  uint32_t th = t + cx->num_time;
 #pragma omp simd
   for (uint32_t l = c->indptr[j]; l < c->indptr[j + 1]; l++) {
     uint32_t i = c->indices[l];
     float w = c->weights[l];
     uint32_t d = c->idelays[l];
-    uint32_t p1 = (th - d) & c->horizon_minus_1;
-    uint32_t p2 = (th - d + 1) & c->horizon_minus_1;
-    c->cx1[i] += w * buf[p1];
-    c->cx2[i] += w * buf[p2];
+    uint32_t p1 = (th - d) & wrap_mask;
+    uint32_t p2 = (th - d + 1) & wrap_mask;
+    cx->cx1[i] += w * buf[p1];
+    cx->cx2[i] += w * buf[p2];
   }
 }
 
-void tvbk_cx_j(const tvbk_conn *c, uint32_t t) {
+void tvbk_cx_j(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t) {
 #pragma omp simd
   for (int i = 0; i < c->num_node; i++)
-    c->cx1[i] = c->cx2[i] = 0.0f;
+    cx->cx1[i] = cx->cx2[i] = 0.0f;
   for (int j = 0; j < c->num_node; j++)
-    cx_all_j(c, t, j);
+    cx_all_j(cx, c, t, j);
 }
 
-void tvbk_cx_i(const tvbk_conn *c, uint32_t t) {
-  uint32_t th = t + c->horizon;
+void tvbk_cx_i(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t) {
+  uint32_t wrap_mask = cx->num_time - 1; // assume num_time is power of 2
+  uint32_t th = t + 
cx->num_time; #pragma omp simd for (int i = 0; i < c->num_node; i++) { float cx1 = 0.f, cx2 = 0.f; @@ -32,17 +35,17 @@ void tvbk_cx_i(const tvbk_conn *c, uint32_t t) { uint32_t j = c->indices[l]; float w = c->weights[l]; uint32_t d = c->idelays[l]; - uint32_t p1 = (th - d) & c->horizon_minus_1; - uint32_t p2 = (th - d + 1) & c->horizon_minus_1; - cx1 += w * c->buf[j * c->horizon + p1]; - cx2 += w * c->buf[j * c->horizon + p2]; + uint32_t p1 = (th - d) & wrap_mask; + uint32_t p2 = (th - d + 1) & wrap_mask; + cx1 += w * cx->buf[j * cx->num_time + p1]; + cx2 += w * cx->buf[j * cx->num_time + p2]; } - c->cx1[i] = cx1; - c->cx2[i] = cx2; + cx->cx1[i] = cx1; + cx->cx2[i] = cx2; } } -void tvbk_cx_nop(const tvbk_conn *c, uint32_t t) { +void tvbk_cx_nop(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t) { (void)c; (void)t; diff --git a/tvb_kernels/test_conn.py b/tvb_kernels/test_conn.py index 9a22e2f1bd..353705b0aa 100644 --- a/tvb_kernels/test_conn.py +++ b/tvb_kernels/test_conn.py @@ -23,13 +23,12 @@ def base_setup(mode: tvb_kernels.CxMode = tvb_kernels.CxMode.CX_J): num_node = 90 horizon = 256 weights, lengths = rand_weights(num_node=num_node, horizon=horizon, dt=dt) - buf = tvb_kernels.Buf(num_node, horizon) - cx = tvb_kernels.Cx(num_node) - conn = tvb_kernels.Conn(buf, cx, dt, cv, weights, lengths, mode=mode) + cx = tvb_kernels.Cx(num_node, horizon) + conn = tvb_kernels.Conn(cx, dt, cv, weights, lengths, mode=mode) # then we can test - buf.buf[:] = np.r_[:1.0:1j*num_node * - horizon].reshape(num_node, horizon).astype('f')*4.0 + cx.buf[:] = np.r_[:1.0:1j*num_node * + horizon].reshape(num_node, horizon).astype('f')*4.0 # impl simple numpy version def make_cfun_np(): @@ -40,22 +39,24 @@ def make_cfun_np(): assert idelays2.shape == (2, csr_weights.nnz) def cfun_np(t): - cx = buf.buf[csr_weights.indices, (t-idelays2) % horizon] - cx *= csr_weights.data - cx = np.add.reduceat(cx, csr_weights.indptr[:-1], axis=1) - return cx # (2, num_node) + _ = cx.buf[csr_weights.indices, (t-idelays2) % horizon] + _ *= csr_weights.data + _ = np.add.reduceat(_, csr_weights.indptr[:-1], axis=1) + return _ # (2, num_node) return cfun_np - return conn, cx, buf, make_cfun_np() + return conn, cx, make_cfun_np() def test_conn_kernels(): - connj, cxj, _, cfun_np = base_setup(tvb_kernels.CxMode.CX_J) - conni, cxi, _, _ = base_setup(tvb_kernels.CxMode.CX_I) + connj, cxj, cfun_np = base_setup(tvb_kernels.CxMode.CX_J) + conni, cxi, _ = base_setup(tvb_kernels.CxMode.CX_I) for t in range(1024): cx = cfun_np(t) connj(t) conni(t) - np.testing.assert_allclose(cx, cxj.cx, 1e-4, 1e-6) - np.testing.assert_allclose(cx, cxi.cx, 1e-4, 1e-6) + np.testing.assert_allclose(cx[0], cxj.cx1, 1e-4, 1e-6) + np.testing.assert_allclose(cx[1], cxj.cx2, 1e-4, 1e-6) + np.testing.assert_allclose(cx[0], cxi.cx1, 1e-4, 1e-6) + np.testing.assert_allclose(cx[1], cxi.cx2, 1e-4, 1e-6) diff --git a/tvb_kernels/tvb_kernels/__init__.py b/tvb_kernels/tvb_kernels/__init__.py index fcaef0d7e8..77264fa21b 100644 --- a/tvb_kernels/tvb_kernels/__init__.py +++ b/tvb_kernels/tvb_kernels/__init__.py @@ -24,18 +24,23 @@ class CxMode(enum.Enum): class Cx: - def __init__(self, num_node): - self.cx = np.zeros((2, num_node), 'f') - - -class Buf: - def __init__(self, num_node, horizon): - self.buf = np.zeros((num_node, horizon), 'f') + def __init__(self, num_node, num_time): + if num_time & (num_time - 1) != 0: + raise ValueError('num_time not power of two') + self.cx1, self.cx2 = np.zeros((2, num_node), 'f') + self.buf = np.zeros((num_node, num_time), 'f') + self._cx = 
_ctg_tvbk.tvbk_cx( + cx1=_to_ct(self.cx1), + cx2=_to_ct(self.cx2), + buf=_to_ct(self.buf), + num_node=num_node, + num_time=num_time + ) class Conn: - def __init__(self, buf, cx, dt, cv, weights, lengths, mode: CxMode = CxMode.CX_J): + def __init__(self, cx, dt, cv, weights, lengths, mode: CxMode = CxMode.CX_J): self.mode = mode assert weights.shape[0] == weights.shape[1] assert weights.shape == lengths.shape @@ -61,26 +66,19 @@ def __init__(self, buf, cx, dt, cv, weights, lengths, mode: CxMode = CxMode.CX_J self.weights = spw.data.astype(np.float32) self.indices = spw.indices.astype(np.uint32) self.indptr = spw.indptr.astype(np.uint32) - self._buf = buf.buf - self._cx1, self._cx2 = cx.cx + self.cx = cx self._conn = _ctg_tvbk.tvbk_conn( num_node=weights.shape[0], num_nonzero=self.indices.size, num_cvar=1, - horizon=buf.buf.shape[1], - # TODO ridonkulus - horizon_minus_1=buf.buf.shape[1] - 1, weights=_to_ct(self.weights), indices=_to_ct(self.indices), indptr=_to_ct(self.indptr), idelays=_to_ct(self.idelays), - buf=_to_ct(self._buf), - cx1=_to_ct(self._cx1), - cx2=_to_ct(self._cx2) ) def __call__(self, t): if self.mode == CxMode.CX_J: - _ctg_tvbk.tvbk_cx_j(self._conn, t) + _ctg_tvbk.tvbk_cx_j(self.cx._cx, self._conn, t) if self.mode == CxMode.CX_I: - _ctg_tvbk.tvbk_cx_i(self._conn, t) + _ctg_tvbk.tvbk_cx_i(self.cx._cx, self._conn, t)
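
Reader's aid, not part of the patch series: after patch 12, Cx owns the per-node coupling accumulators (cx1, cx2) and the delay buffer, while Conn holds only the sparse weights and integer delays. A minimal usage sketch follows, assuming the package has been built as in kernels.yml (so libtvbk.so and the ctypesgen-generated tvb_kernels/_ctg_tvbk.py exist); the sizes and random data are illustrative, mirroring test_conn.py above:

    import numpy as np
    import tvb_kernels

    num_node, horizon = 90, 256   # horizon (delay buffer length) must be a power of two
    rng = np.random.default_rng(43)

    # toy connectome; zero weights mark absent connections
    weights = rng.random((num_node, num_node)).astype('f')
    weights[weights < 0.6] = 0.0
    # scale lengths so idelays = lengths/cv/dt + 2 stays below horizon
    lengths = (rng.random((num_node, num_node)) * horizon * 0.1 * 0.8).astype('f')

    cx = tvb_kernels.Cx(num_node, horizon)     # owns cx1, cx2 and the delay buffer
    conn = tvb_kernels.Conn(cx, 0.1, 1.0, weights, lengths,
                            mode=tvb_kernels.CxMode.CX_J)

    cx.buf[:] = rng.standard_normal((num_node, horizon)).astype('f')
    for t in range(16):
        conn(t)                                # accumulates into cx.cx1 and cx.cx2
    print(cx.cx1.shape, cx.cx2.shape)          # (90,) (90,)

Splitting state (Cx) from topology (Conn) means one connectome can drive several independent delay buffers, and the two Heun-stage accumulators come back as separate (num_node,) arrays rather than one packed (2, num_node) block.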