From 306cd1a4f91b677d0074cdc319ce76d27692dfef Mon Sep 17 00:00:00 2001
From: Pramod S Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Mon, 4 Apr 2022 00:26:03 +0200
Subject: [PATCH 001/128] Support for shared libraries in GPU execution (python
 launch support)

* mod2c now generates code without need of global variables
* coreneuron and mechanism library can be built as shared and it
  enables launching coreneuron on GPU via python
* scopmath library can be also shared
* removed acc/openmp global annotations for celsius, pi and secondorder
  and they don't need to be copied on GPU

- [x] MOD2C generates code without using globals / acc declare
      See BlueBrain/mod2c/pull/78
- [x] Basic test with special and python on GPU
      See https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194
- [ ] Link issues with CUDA part, e.g. nrnran123.cu functions result
      in link errors, see
      https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194
      @olupton to rescue!
- [ ] Check celsius usage within coreneuron source code
- [ ] Investigate why acc_deviceptr(ml->data) returns host
      pointer when coreneuron is launched via python. See
      https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086746848
- [ ] Run neuron test suite and external models like olfactory-bulb via
      python
- [ ] Update submodule BlueBrain/mod2c/pull/78
---
 CMake/OpenAccHelper.cmake          |  7 ++++++-
 CMakeLists.txt                     |  8 --------
 coreneuron/CMakeLists.txt          |  2 +-
 coreneuron/apps/main1.cpp          |  2 --
 coreneuron/gpu/nrn_acc_manager.cpp | 17 ++++++++++++-----
 coreneuron/nrnconf.h               |  8 --------
 external/mod2c                     |  2 +-
 extra/nrnivmodl_core_makefile.in   |  3 ++-
 8 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 1c18225b6..8bdf5726a 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -67,7 +67,12 @@ if(CORENRN_ENABLE_GPU)
   # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
   # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
   # code is compiled with the same CUDA version as the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
+  # TODO nordc option is added based on the recommendation from:
+  #        https://forums.developer.nvidia.com/t/separate-compilation-of-mixed-cuda-openacc-code/192701
+  #      but as discussed in
+  #        https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194
+  #      this is still not completely solving underlying link issue.
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo,nordc")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb1c96b6c..26cc84360 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -303,14 +303,6 @@ if(CORENRN_HAVE_NVHPC_COMPILER)
   endif()
 endif()
 
-# ~~~
-# OpenACC needs to build static library in order to have global/routines working.
-# See https://www.pgroup.com/userforum/viewtopic.php?t=5350
-# ~~~
-if(CORENRN_ENABLE_GPU)
-  set(CORENRN_ENABLE_SHARED OFF)
-endif()
-
 if(CORENRN_ENABLE_SHARED)
   set(COMPILE_LIBRARY_TYPE "SHARED")
 else()
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 0dc648628..489c85a05 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -248,7 +248,7 @@ target_compile_options(coreneuron
 add_dependencies(coreneuron nrnivmodl-core)
 
 # scopmath is created separately for nrnivmodl-core workflow
-add_library(scopmath STATIC ${CORENEURON_HEADER_FILES} ${SCOPMATH_CODE_FILES})
+add_library(scopmath ${COMPILE_LIBRARY_TYPE} ${CORENEURON_HEADER_FILES} ${SCOPMATH_CODE_FILES})
 target_include_directories(scopmath PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}
                                             ${CORENEURON_PROJECT_BINARY_DIR}/generated)
 
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index fb74df7d0..b7139754d 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -563,8 +563,6 @@ extern "C" int run_solve_core(int argc, char** argv) {
 #endif
     bool compute_gpu = corenrn_param.gpu;
 
-    nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
-    nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
     {
         double v = corenrn_param.voltage;
         double dt = corenrn_param.dt;
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 3eff82fe1..098b943be 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -77,7 +77,7 @@ void cnrn_target_set_default_device(int device_num) {
 
 #ifdef CORENEURON_ENABLE_GPU
 
-static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
+static Memb_list* copy_ml_to_device(const Memb_list* ml, int type, double* dml_data) {
     // As we never run code for artificial cell inside GPU we don't copy it.
     int is_art = corenrn.get_is_artificial()[type];
     if (is_art) {
@@ -90,9 +90,9 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
     int szp = corenrn.get_prop_param_size()[type];
     int szdp = corenrn.get_prop_dparam_size()[type];
 
-    double* dptr = cnrn_target_deviceptr(ml->data);
-    cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));
+    double* dptr = dml_data;
 
+    cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));
 
     int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n);
     cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices);
@@ -325,7 +325,6 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         /*copy all double data for thread */
         d__data = cnrn_target_copyin(nt->_data, nt->_ndata);
 
-
         /* Here is the example of using OpenACC data enter/exit
          * Remember that we are not allowed to use nt->_data but we have to use:
          *      double *dtmp = nt->_data;  // now use dtmp!
@@ -395,9 +394,17 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             // book keeping for linked-list
             d_last_tml = d_tml;
 
+            // TODO: acc_deviceptr is returning host pointer when
+            // coreneuron is launched via python instead of special
+            //      see: https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086746848
+            // As ml->data is always within nt->_data, temporarily calculate
+            // device pointer of ml->data on using offset.
+            double* dml_data = d__data + (tml->ml->data - nt->_data);
+
             /* now for every tml, there is a ml. copy that and setup pointer */
-            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
+            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index, dml_data);
             cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml);
+
             /* setup nt._ml_list */
             cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml);
         }
diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h
index b25a2764a..7e4cb6d4e 100644
--- a/coreneuron/nrnconf.h
+++ b/coreneuron/nrnconf.h
@@ -32,17 +32,9 @@ using Symbol = char;
 #define VEC_AREA(i) (_nt->_actual_area[(i)])
 #define VECTORIZE   1
 
-// extern variables require acc declare
-nrn_pragma_omp(declare target)
 extern double celsius;
-nrn_pragma_acc(declare create(celsius))
-
 extern double pi;
-nrn_pragma_acc(declare create(pi))
-
 extern int secondorder;
-nrn_pragma_acc(declare create(secondorder))
-nrn_pragma_omp(end declare target)
 
 extern double t, dt;
 extern int rev_dt;
diff --git a/external/mod2c b/external/mod2c
index 8565d3c17..9d21b18a0 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 8565d3c178a195a489fae0623d6338c2e92cd1e5
+Subproject commit 9d21b18a0036810f3ced1a8b16428754b87c8e87
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 6601f7123..bdc9387f1 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -212,6 +212,7 @@ $(SPECIAL_EXE): coremech_lib_target
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(CORENRNLIB_FLAGS) $(LDFLAGS) \
+	  -L$(CORENRN_LIB_DIR) -lscopmath \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 
 coremech_lib_target: $(corenrnmech_lib_target)
@@ -226,7 +227,7 @@ $(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR)
 coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  $(LDFLAGS) $(CORENRN_LIB_DIR)/libscopmath.a \
+	  $(LDFLAGS) -L$(CORENRN_LIB_DIR) -lscopmath\
 	  ${SONAME_OPTION} $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR);
 
 # build static library of mechanisms

From 045b9cd9de53c475da03926c54abf1db5785cd82 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Wed, 20 Apr 2022 12:57:11 +0200
Subject: [PATCH 002/128] Pass Memb_list* as an argument for all common
 prototypes in order to support global variables via argument

* add ml parameter to all relevant functions
* switch to static build and remove nordc temporarily
* free ml->instance if not empty
* avoid extracting libscopmath objects and linking
---
 CMake/OpenAccHelper.cmake                       |  2 +-
 CMakeLists.txt                                  |  4 ++++
 coreneuron/CMakeLists.txt                       |  2 +-
 coreneuron/io/core2nrn_data_return.cpp          | 16 +++++++++++++---
 coreneuron/io/nrn2core_data_init.cpp            |  3 ++-
 coreneuron/io/nrn_checkpoint.cpp                |  6 ++++--
 coreneuron/io/nrn_setup.cpp                     |  5 +++++
 coreneuron/io/phase2.cpp                        |  1 +
 coreneuron/mechanism/mech/mod2c_core_thread.hpp |  8 ++++----
 coreneuron/mechanism/mechanism.hpp              |  3 ++-
 coreneuron/mechanism/membfunc.hpp               |  2 ++
 coreneuron/mechanism/patternstim.cpp            |  3 ++-
 extra/nrnivmodl_core_makefile.in                |  6 ++----
 13 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 8bdf5726a..e4e2f7f89 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -72,7 +72,7 @@ if(CORENRN_ENABLE_GPU)
   #      but as discussed in
   #        https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194
   #      this is still not completely solving underlying link issue.
-  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo,nordc")
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 26cc84360..4366cfee1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -303,6 +303,10 @@ if(CORENRN_HAVE_NVHPC_COMPILER)
   endif()
 endif()
 
+if(CORENRN_ENABLE_GPU)
+  set(CORENRN_ENABLE_SHARED OFF)
+endif()
+
 if(CORENRN_ENABLE_SHARED)
   set(COMPILE_LIBRARY_TYPE "SHARED")
 else()
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 489c85a05..21861649e 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -305,7 +305,7 @@ add_custom_command(
   OUTPUT ${output_binaries}
   DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES}
   COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m
-          ${CORENRN_MOD2CPP_BINARY} -p 1 "${modfile_directory}"
+          ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}"
   WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin
   COMMENT "Running nrnivmodl-core with halfgap.mod")
 add_custom_target(nrniv-core ALL DEPENDS ${output_binaries})
diff --git a/coreneuron/io/core2nrn_data_return.cpp b/coreneuron/io/core2nrn_data_return.cpp
index 6a12c197f..87a549ac6 100644
--- a/coreneuron/io/core2nrn_data_return.cpp
+++ b/coreneuron/io/core2nrn_data_return.cpp
@@ -137,7 +137,7 @@ static void core2nrn_corepointer(int tid, NrnThreadMembList* tml) {
         d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout);
         pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);
         (*corenrn.get_bbcore_write()[type])(
-            nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0);
+            nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0);
     }
 
     std::unique_ptr<int[]> iArray;
@@ -159,8 +159,18 @@ static void core2nrn_corepointer(int tid, NrnThreadMembList* tml) {
         d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout);
         pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);
 
-        (*corenrn.get_bbcore_write()[type])(
-            dArray.get(), iArray.get(), &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0);
+        (*corenrn.get_bbcore_write()[type])(dArray.get(),
+                                            iArray.get(),
+                                            &dcnt,
+                                            &icnt,
+                                            0,
+                                            aln_cntml,
+                                            d,
+                                            pd,
+                                            ml->_thread,
+                                            &nt,
+                                            ml,
+                                            0.0);
     }
 
     (*core2nrn_corepointer_mech_)(tid, type, icnt, dcnt, iArray.get(), dArray.get());
diff --git a/coreneuron/io/nrn2core_data_init.cpp b/coreneuron/io/nrn2core_data_init.cpp
index e732dec11..ad7106f6e 100644
--- a/coreneuron/io/nrn2core_data_init.cpp
+++ b/coreneuron/io/nrn2core_data_init.cpp
@@ -407,6 +407,7 @@ extern void** pattern_stim_info_ref(int icnt,
                                     Datum* _ppvar,
                                     ThreadDatum* _thread,
                                     NrnThread* _nt,
+                                    Memb_list* ml,
                                     double v);
 
 extern "C" {
@@ -437,7 +438,7 @@ void nrn2core_patstim_share_info() {
             assert(0);
         }
 
-        void** info = pattern_stim_info_ref(_iml, _cntml, _p, _ppvar, nullptr, nt, 0.0);
+        void** info = pattern_stim_info_ref(_iml, _cntml, _p, _ppvar, nullptr, nt, ml, 0.0);
         (*nrn2core_patternstim_)(info);
     }
 }
diff --git a/coreneuron/io/nrn_checkpoint.cpp b/coreneuron/io/nrn_checkpoint.cpp
index 955848901..ecf432422 100644
--- a/coreneuron/io/nrn_checkpoint.cpp
+++ b/coreneuron/io/nrn_checkpoint.cpp
@@ -449,7 +449,7 @@ void CheckPoints::write_phase2(NrnThread& nt) const {
                 d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout);
                 pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);
                 (*corenrn.get_bbcore_write()[type])(
-                    nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0);
+                    nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0);
             }
             fh << icnt << "\n";
             fh << dcnt << "\n";
@@ -478,7 +478,7 @@ void CheckPoints::write_phase2(NrnThread& nt) const {
                 pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);
 
                 (*corenrn.get_bbcore_write()[type])(
-                    dArray, iArray, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0);
+                    dArray, iArray, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0);
             }
 
             if (icnt) {
@@ -592,6 +592,7 @@ bool CheckPoints::initialize() {
                                            ml->pdata,
                                            ml->_thread,
                                            nrn_threads,
+                                           ml,
                                            0.0);
             break;
         }
@@ -802,6 +803,7 @@ void CheckPoints::write_tqueue(NrnThread& nt, FileHandler& fh) const {
                 ml->pdata,
                 ml->_thread,
                 nrn_threads,
+                ml,
                 0.0);
             break;
         }
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index c22ffc0ce..361ccd185 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -754,6 +754,11 @@ void nrn_cleanup() {
                 ml->_thread = nullptr;
             }
 
+            if (ml->instance) {
+                free(ml->instance);
+                ml->instance = nullptr;
+            }
+
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
             if (nrb) {
                 if (nrb->_size) {
diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index a97b335bb..bb3f7f99f 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -867,6 +867,7 @@ void Phase2::get_info_from_bbcore(NrnThread& nt,
                                                pd,
                                                ml->_thread,
                                                &nt,
+                                               ml,
                                                0.0);
         }
         assert(dk == static_cast<int>(tmls[i].dArray.size()));
diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
index 85ed348f6..d18160f3a 100644
--- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp
+++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
@@ -16,14 +16,14 @@ namespace coreneuron {
 
 #define _STRIDE _cntml_padded + _iml
 
-#define _threadargscomma_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _v,
+#define _threadargscomma_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _ml, _v,
 #define _threadargsprotocomma_                                                                    \
     int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \
-        double _v,
-#define _threadargs_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _v
+        Memb_list *_ml, double _v,
+#define _threadargs_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _ml, _v
 #define _threadargsproto_                                                                         \
     int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \
-        double _v
+        Memb_list *_ml, double _v
 
 struct Elm {
     unsigned row;        /* Row location */
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index ab78ad502..71b8b0fc6 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -143,6 +143,7 @@ struct Memb_list {
     NetSendBuffer_t* _net_send_buffer = nullptr;
     int nodecount; /* actual node count */
     int _nodecount_padded;
-    void* instance = nullptr; /* mechanism instance */
+    void* instance = nullptr; /* mechanism instance struct from NMODL or global variables struct in
+                                 mod2c */
 };
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 2556f0f87..c7d58e82a 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -151,6 +151,7 @@ using bbcore_read_t = void (*)(double*,
                                Datum*,
                                ThreadDatum*,
                                NrnThread*,
+                               Memb_list*,
                                double);
 
 using bbcore_write_t = void (*)(double*,
@@ -163,6 +164,7 @@ using bbcore_write_t = void (*)(double*,
                                 Datum*,
                                 ThreadDatum*,
                                 NrnThread*,
+                                Memb_list*,
                                 double);
 
 extern int nrn_mech_depend(int type, int* dependencies);
diff --git a/coreneuron/mechanism/patternstim.cpp b/coreneuron/mechanism/patternstim.cpp
index e22b19e98..ca1159788 100644
--- a/coreneuron/mechanism/patternstim.cpp
+++ b/coreneuron/mechanism/patternstim.cpp
@@ -38,6 +38,7 @@ extern void pattern_stim_setup_helper(int size,
                                       Datum* _ppvar,
                                       ThreadDatum* _thread,
                                       NrnThread* _nt,
+                                      Memb_list* ml,
                                       double v);
 
 static size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double tstop);
@@ -93,7 +94,7 @@ void nrn_mkPatternStim(const char* fname, double tstop) {
     } else {
         assert(0);
     }
-    pattern_stim_setup_helper(size, tvec, gidvec, _iml, _cntml, _p, _ppvar, nullptr, nt, 0.0);
+    pattern_stim_setup_helper(size, tvec, gidvec, _iml, _cntml, _p, _ppvar, nullptr, nt, ml, 0.0);
 }
 
 size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double tstop) {
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index bdc9387f1..585cf3795 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -227,15 +227,13 @@ $(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR)
 coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  $(LDFLAGS) -L$(CORENRN_LIB_DIR) -lscopmath\
+	  $(LDFLAGS)\
 	  ${SONAME_OPTION} $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR);
 
 # build static library of mechanisms
 coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
-	mkdir -p $(MOD_OBJS_DIR)/scopmath; \
-	cd $(MOD_OBJS_DIR)/scopmath && ar -x $(CORENRN_LIB_DIR)/libscopmath.a && cd -;\
 	rm -f ${COREMECH_LIB_PATH}; \
-	ar cq ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS) $(MOD_OBJS_DIR)/scopmath/*.o;
+	ar cq ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS);
 
 # compile cpp files to .o
 $(MOD_OBJS_DIR)/%.o: $(MOD_TO_CPP_DIR)/%.cpp | $(MOD_OBJS_DIR)

From ccb8b6bf588dcc6b7a382a2dccba56672d7042d4 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Thu, 21 Apr 2022 01:54:38 +0200
Subject: [PATCH 003/128] Add link to libscopmath in neuron as well

---
 CMake/OpenAccHelper.cmake | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index e4e2f7f89..d64227154 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -66,12 +66,11 @@ if(CORENRN_ENABLE_GPU)
   # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
   # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
   # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
-  # code is compiled with the same CUDA version as the explicit CUDA code.
-  # TODO nordc option is added based on the recommendation from:
-  #        https://forums.developer.nvidia.com/t/separate-compilation-of-mixed-cuda-openacc-code/192701
-  #      but as discussed in
-  #        https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194
-  #      this is still not completely solving underlying link issue.
+  # code is compiled with the same CUDA version as the explicit CUDA code. TODO nordc option is
+  # added based on the recommendation from:
+  # https://forums.developer.nvidia.com/t/separate-compilation-of-mixed-cuda-openacc-code/192701 but
+  # as discussed in https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194 this
+  # is still not completely solving underlying link issue.
   set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
@@ -106,11 +105,11 @@ if(CORENRN_ENABLE_GPU)
     GLOBAL
     PROPERTY
       CORENEURON_LIB_LINK_FLAGS
-      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L$(libdir) -lcoreneuron -Wl,--no-whole-archive"
+      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -lscopmath -L$(libdir) -lcoreneuron -Wl,--no-whole-archive"
   )
 else()
   set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS
-                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech")
+                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -lscopmath")
 endif(CORENRN_ENABLE_GPU)
 
 if(CORENRN_HAVE_NVHPC_COMPILER)

From b2fcc727b7824ff6aa8f4ede0e0389f59c020bf5 Mon Sep 17 00:00:00 2001
From: Pramod S Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Thu, 21 Apr 2022 14:27:21 +0200
Subject: [PATCH 004/128] Memb_list for pattern.mod should be calloc'd for
 zero-initialisation

---
 coreneuron/mechanism/patternstim.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/coreneuron/mechanism/patternstim.cpp b/coreneuron/mechanism/patternstim.cpp
index ca1159788..4f5e4e4e6 100644
--- a/coreneuron/mechanism/patternstim.cpp
+++ b/coreneuron/mechanism/patternstim.cpp
@@ -138,7 +138,7 @@ size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double t
 
 // see nrn_setup.cpp:read_phase2 for how it creates NrnThreadMembList instances.
 static NrnThreadMembList* alloc_nrn_thread_memb(int type) {
-    NrnThreadMembList* tml = (NrnThreadMembList*) emalloc(sizeof(NrnThreadMembList));
+    NrnThreadMembList* tml = (NrnThreadMembList*) ecalloc(1, sizeof(NrnThreadMembList));
     tml->dependencies = nullptr;
     tml->ndependencies = 0;
     tml->index = type;
@@ -149,7 +149,7 @@ static NrnThreadMembList* alloc_nrn_thread_memb(int type) {
     int psize = corenrn.get_prop_param_size()[type];
     int dsize = corenrn.get_prop_dparam_size()[type];
     int layout = corenrn.get_mech_data_layout()[type];
-    tml->ml = (Memb_list*) emalloc(sizeof(Memb_list));
+    tml->ml = (Memb_list*) ecalloc(1, sizeof(Memb_list));
     tml->ml->nodecount = 1;
     tml->ml->_nodecount_padded = tml->ml->nodecount;
     tml->ml->nodeindices = nullptr;

From 59d6d96e6d767de2b5d693cc0ef08a8b7e8daa30 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Fri, 22 Apr 2022 23:22:37 +0200
Subject: [PATCH 005/128] Add global_variables per membrane list and cleanup
 for GPU   * update mod2c   * clean global_variables on cpu and gpu

---
 coreneuron/gpu/nrn_acc_manager.cpp | 5 +++++
 coreneuron/io/nrn_setup.cpp        | 5 +++++
 coreneuron/mechanism/mechanism.hpp | 5 +++--
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 098b943be..ed3bf659c 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -256,6 +256,11 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
         int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
         cnrn_target_delete(ml->pdata, pcnt);
     }
+    if (ml->global_variables) {
+        cnrn_target_delete(reinterpret_cast<char*>(ml->global_variables),
+                           ml->global_variables_size);
+    }
+
     cnrn_target_delete(ml->nodeindices, n);
     cnrn_target_delete(ml);
 }
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index 361ccd185..98382f9da 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -759,6 +759,11 @@ void nrn_cleanup() {
                 ml->instance = nullptr;
             }
 
+            if (ml->global_variables) {
+                free(ml->global_variables);
+                ml->global_variables = nullptr;
+            }
+
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
             if (nrb) {
                 if (nrb->_size) {
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index 71b8b0fc6..1c177976c 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -143,7 +143,8 @@ struct Memb_list {
     NetSendBuffer_t* _net_send_buffer = nullptr;
     int nodecount; /* actual node count */
     int _nodecount_padded;
-    void* instance = nullptr; /* mechanism instance struct from NMODL or global variables struct in
-                                 mod2c */
+    void* instance = nullptr;         /* mechanism instance struct from NMODL */
+    void* global_variables = nullptr; /* global variables struct for each mechanism */
+    int global_variables_size = 0;    /* size of global variables struct in bytes */
 };
 }  // namespace coreneuron

From 12dabf1d99bb9ca82a6805e15fda12cbf9aeec67 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Sat, 23 Apr 2022 11:46:41 +0200
Subject: [PATCH 006/128] redefine nrn_ghk with celsius as an argument  *
 remove duplicate prototypes from nrnoc_aux.hpp  * update mod2c

---
 coreneuron/mechanism/eion.cpp     | 2 +-
 coreneuron/mechanism/membfunc.hpp | 3 ++-
 coreneuron/utils/nrnoc_aux.hpp    | 4 ----
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 4bc077880..f61a13579 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -211,7 +211,7 @@ static double efun(double x) {
 
 nrn_pragma_omp(end declare target)
 
-double nrn_ghk(double v, double ci, double co, double z) {
+double nrn_ghk(double v, double ci, double co, double z, double celsius) {
     double temp = z * v / ktf;
     double eco = co * efun(temp);
     double eci = ci * efun(-temp);
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index c7d58e82a..ab07d7ea8 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -116,7 +116,8 @@ extern void nrn_wrote_conc(int, double*, int, int, double**, double, int);
 nrn_pragma_acc(routine seq)
 double nrn_nernst(double ci, double co, double z, double celsius);
 nrn_pragma_acc(routine seq)
-extern double nrn_ghk(double v, double ci, double co, double z);
+//TODO: check if this should be via overload
+extern double nrn_ghk(double v, double ci, double co, double z, double celsius);
 nrn_pragma_omp(end declare target)
 extern void hoc_register_prop_size(int, int, int);
 extern void hoc_register_dparam_semantics(int type, int, const char* name);
diff --git a/coreneuron/utils/nrnoc_aux.hpp b/coreneuron/utils/nrnoc_aux.hpp
index 3c2f23326..a67569d56 100644
--- a/coreneuron/utils/nrnoc_aux.hpp
+++ b/coreneuron/utils/nrnoc_aux.hpp
@@ -35,8 +35,4 @@ extern void hoc_warning(const char*, const char*);
 
 extern double hoc_Exp(double x);
 
-// defined in eion.cpp and this file included in translated mod files.
-extern double nrn_nernst(double ci, double co, double z, double celsius);
-extern double nrn_ghk(double v, double ci, double co, double z);
-
 }  // namespace coreneuron

From 1139a74e99524eb43660d0e44a4fbd011c8b3a0f Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 22 Apr 2022 14:15:49 +0200
Subject: [PATCH 007/128] Reorganise unit definitions, inline functions.

- nrn_ghk is now declared inline.
---
 coreneuron/CMakeLists.txt          | 11 ++------
 coreneuron/apps/main1.cpp          |  6 ++---
 coreneuron/io/global_vars.cpp      |  2 +-
 coreneuron/mechanism/eion.cpp      | 42 +++++-------------------------
 coreneuron/mechanism/membfunc.hpp  | 38 ++++++++++++++++++++-------
 coreneuron/nrnoc/nrnunits_modern.h | 36 -------------------------
 coreneuron/utils/nrnoc_aux.hpp     |  1 -
 coreneuron/utils/units.hpp         | 38 +++++++++++++++++++++++++++
 tests/integration/CMakeLists.txt   |  2 +-
 9 files changed, 80 insertions(+), 96 deletions(-)
 delete mode 100644 coreneuron/nrnoc/nrnunits_modern.h
 create mode 100644 coreneuron/utils/units.hpp

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 21861649e..665205802 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -120,16 +120,9 @@ if(CORENRN_ENABLE_GPU)
 endif()
 
 # =============================================================================
-# eion.cpp depends on CORENRN_USE_LEGACY_UNITS
+# CORENEURON_USE_LEGACY_UNITS is used in membfunc.hpp so define it everywhere
 # =============================================================================
-set(LegacyFR_FILES
-    ${CMAKE_CURRENT_SOURCE_DIR}/mechanism/eion.cpp ${CMAKE_CURRENT_SOURCE_DIR}/apps/main1.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/io/global_vars.cpp)
-
-set_property(
-  SOURCE ${LegacyFR_FILES}
-  APPEND
-  PROPERTY COMPILE_DEFINITIONS "CORENRN_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS}")
+add_compile_definitions(CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS})
 
 # =============================================================================
 # create libraries
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index b7139754d..8e05a5d69 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
@@ -51,9 +51,9 @@ const char* corenrn_version() {
     return coreneuron::bbcore_write_version;
 }
 
-// the CORENRN_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS
+// the CORENEURON_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS
 bool corenrn_units_use_legacy() {
-    return CORENRN_USE_LEGACY_UNITS;
+    return CORENEURON_USE_LEGACY_UNITS;
 }
 
 void (*nrn2core_part2_clean_)();
diff --git a/coreneuron/io/global_vars.cpp b/coreneuron/io/global_vars.cpp
index 128a1cdb9..815423ea9 100644
--- a/coreneuron/io/global_vars.cpp
+++ b/coreneuron/io/global_vars.cpp
@@ -142,7 +142,7 @@ void set_globals(const char* path, bool cli_global_seed, int cli_global_seed_val
                 } else if (strcmp(name, "Random123_globalindex") == 0) {
                     nrnran123_set_globalindex((uint32_t) n);
                 } else if (strcmp(name, "_nrnunit_use_legacy_") == 0) {
-                    if (n != CORENRN_USE_LEGACY_UNITS) {
+                    if (n != CORENEURON_USE_LEGACY_UNITS) {
                         hoc_execerror(
                             "CORENRN_ENABLE_LEGACY_UNITS not"
                             " consistent with NEURON value of"
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index f61a13579..1dbd0d2db 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
@@ -154,23 +154,9 @@ the USEION statement of any model using this ion\n",
     }
 }
 
-#ifndef CORENRN_USE_LEGACY_UNITS
-#define CORENRN_USE_LEGACY_UNITS 0
-#endif
-
-#if CORENRN_USE_LEGACY_UNITS == 1
-#define FARADAY     96485.309
-#define gasconstant 8.3134
-#else
-#include "coreneuron/nrnoc/nrnunits_modern.h"
-#define FARADAY     _faraday_codata2018
-#define gasconstant _gasconstant_codata2018
-#endif
-
-#define ktf (1000. * gasconstant * (celsius + 273.15) / FARADAY)
-
-double nrn_nernst(double ci, double co, double z, double celsius) {
-    /*printf("nrn_nernst %g %g %g\n", ci, co, z);*/
+// std::log isn't constexpr, but there are argument values for which nrn_nernst
+// is a constant expression
+constexpr double nrn_nernst(double ci, double co, double z, double celsius) {
     if (z == 0) {
         return 0.;
     }
@@ -179,7 +165,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
     } else if (co <= 0.) {
         return -1e6;
     } else {
-        return ktf / z * log(co / ci);
+        return ktf(celsius) / z * std::log(co / ci);
     }
 }
 
@@ -200,24 +186,8 @@ void nrn_wrote_conc(int type,
         pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
     }
 }
-
-static double efun(double x) {
-    if (fabs(x) < 1e-4) {
-        return 1. - x / 2.;
-    } else {
-        return x / (exp(x) - 1);
-    }
-}
-
 nrn_pragma_omp(end declare target)
 
-double nrn_ghk(double v, double ci, double co, double z, double celsius) {
-    double temp = z * v / ktf;
-    double eco = co * efun(temp);
-    double eci = ci * efun(-temp);
-    return (.001) * z * FARADAY * (eci - eco);
-}
-
 #if VECTORIZE
 #define erev   pd[0 * _STRIDE] /* From Eion */
 #define conci  pd[1 * _STRIDE]
@@ -257,7 +227,7 @@ ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit])
 
 double nrn_nernst_coef(int type) {
     /* for computing jacobian element dconc'/dconc */
-    return ktf / charge;
+    return ktf(celsius) / charge;
 }
 
 /* Must be called prior to any channels which update the currents */
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index ab07d7ea8..638b52d87 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -1,17 +1,19 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
-
 #pragma once
 
-#include <vector>
-
 #include "coreneuron/mechanism/mechanism.hpp"
 #include "coreneuron/utils/offload.hpp"
+#include "coreneuron/utils/units.hpp"
+
+#include <cmath>
+#include <vector>
+
 namespace coreneuron {
 
 using Pfrpdat = Datum* (*) (void);
@@ -113,12 +115,30 @@ extern void nrn_writes_conc(int, int);
 nrn_pragma_omp(declare target)
 nrn_pragma_acc(routine seq)
 extern void nrn_wrote_conc(int, double*, int, int, double**, double, int);
-nrn_pragma_acc(routine seq)
-double nrn_nernst(double ci, double co, double z, double celsius);
-nrn_pragma_acc(routine seq)
-//TODO: check if this should be via overload
-extern double nrn_ghk(double v, double ci, double co, double z, double celsius);
 nrn_pragma_omp(end declare target)
+constexpr double ktf(double celsius) {
+    return 1000. * units::gasconstant * (celsius + 273.15) / units::faraday;
+}
+inline double nrn_ghk(double v, double ci, double co, double z, double celsius) {
+    auto const efun = [](double x) {
+        if (std::abs(x) < 1e-4) {
+            return 1. - x / 2.;
+        } else {
+            return x / (std::exp(x) - 1.);
+        }
+    };
+    double const temp{z * v / ktf(celsius)};
+    double const eco{co * efun(+temp)};
+    double const eci{ci * efun(-temp)};
+    return .001 * z * units::faraday * (eci - eco);
+}
+/**
+ * This signature requires the use of the `celsius` global variable, which can
+ * cause problems when executing on GPU.
+ */
+[[deprecated]] inline double nrn_ghk(double v, double ci, double co, double z) {
+    return nrn_ghk(v, ci, co, z, celsius);
+}
 extern void hoc_register_prop_size(int, int, int);
 extern void hoc_register_dparam_semantics(int type, int, const char* name);
 extern void hoc_reg_ba(int, mod_f_t, int);
diff --git a/coreneuron/nrnoc/nrnunits_modern.h b/coreneuron/nrnoc/nrnunits_modern.h
deleted file mode 100644
index d93638841..000000000
--- a/coreneuron/nrnoc/nrnunits_modern.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-# =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
-#
-# See top-level LICENSE file for details.
-# =============================================================================
-*/
-
-#pragma once
-
-/**
- NMODL translated MOD files get unit constants typically from
- share/lib/nrnunits.lib.in. But there were other source files that
- hardcode some of the constants. Here we gather a few modern units into
- a single place (but, unfortunately, also in nrnunits.lib.in). Legacy units
- cannot be gathered here because they can differ slightly from place to place.
-
- These come from https://physics.nist.gov/cuu/Constants/index.html.
- Termed the "2018 CODATA recommended values", they became available
- on 20 May 2019 and replace the 2014 CODATA set.
-
- See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h
-**/
-
-
-#define _electron_charge_codata2018 1.602176634e-19 /* coulomb exact*/
-#define _avogadro_number_codata2018 6.02214076e+23  /* exact */
-#define _boltzmann_codata2018       1.380649e-23    /* joule/K exact */
-#define _faraday_codata2018 \
-    (_electron_charge_codata2018 * _avogadro_number_codata2018) /* 96485.33212... coulomb/mol */
-#define _gasconstant_codata2018 \
-    (_boltzmann_codata2018 * _avogadro_number_codata2018) /* 8.314462618... joule/mol-K */
-
-/* e/k in K/millivolt */
-#define _e_over_k_codata2018 \
-    (.001 * _electron_charge_codata2018 / _boltzmann_codata2018) /* 11.604518... K/mV */
diff --git a/coreneuron/utils/nrnoc_aux.hpp b/coreneuron/utils/nrnoc_aux.hpp
index a67569d56..10b5880ea 100644
--- a/coreneuron/utils/nrnoc_aux.hpp
+++ b/coreneuron/utils/nrnoc_aux.hpp
@@ -34,5 +34,4 @@ extern void hoc_execerror(const char*, const char*); /* print and abort */
 extern void hoc_warning(const char*, const char*);
 
 extern double hoc_Exp(double x);
-
 }  // namespace coreneuron
diff --git a/coreneuron/utils/units.hpp b/coreneuron/utils/units.hpp
new file mode 100644
index 000000000..de44343fe
--- /dev/null
+++ b/coreneuron/utils/units.hpp
@@ -0,0 +1,38 @@
+/*
+# =============================================================================
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
+#
+# See top-level LICENSE file for details.
+# =============================================================================
+*/
+#pragma once
+namespace coreneuron {
+namespace units {
+#if CORENEURON_USE_LEGACY_UNITS == 1
+constexpr double faraday{96485.309};
+constexpr double gasconstant{8.3134};
+#else
+/* NMODL translated MOD files get unit constants typically from
+ * share/lib/nrnunits.lib.in. But there were other source files that hardcode
+ * some of the constants. Here we gather a few modern units into a single place
+ * (but, unfortunately, also in nrnunits.lib.in). Legacy units cannot be
+ * gathered here because they can differ slightly from place to place.
+ *
+ * These come from https://physics.nist.gov/cuu/Constants/index.html.
+ * Termed the "2018 CODATA recommended values", they became available
+ * on 20 May 2019 and replace the 2014 CODATA set.
+ *
+ * See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h
+ */
+namespace detail {
+constexpr double electron_charge{1.602176634e-19};  // coulomb exact
+constexpr double avogadro_number{6.02214076e+23};   // exact
+constexpr double boltzmann{1.380649e-23};           // joule/K exact
+}  // namespace detail
+constexpr double faraday{detail::electron_charge * detail::avogadro_number};  // 96485.33212...
+                                                                              // coulomb/mol
+constexpr double gasconstant{detail::boltzmann * detail::avogadro_number};    // 8.314462618...
+                                                                              // joule/mol-K
+#endif
+}  // namespace units
+}  // namespace coreneuron
diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
index 7b7e1e1a5..4217af270 100644
--- a/tests/integration/CMakeLists.txt
+++ b/tests/integration/CMakeLists.txt
@@ -64,7 +64,7 @@ if(NOT CORENRN_ENABLE_REPORTING)
   list(
     APPEND
     TEST_CASES_WITH_ARGS
-    "ring_serial!--tstop 100. --celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial"
+    "ring_serial!${GPU_ARGS} --tstop 100. --celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial"
   )
 endif()
 

From e2585beb77035c3839f83f634c1405f3341186ad Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 22 Apr 2022 17:15:52 +0200
Subject: [PATCH 008/128] Build a shared library

---
 coreneuron/CMakeLists.txt                     |  8 ++++----
 coreneuron/mechanism/membfunc.hpp             |  2 +-
 .../mpi/core/{nrnmpi.cpp => resolve.cpp}      |  0
 coreneuron/permute/cellorder.cpp              |  2 +-
 extra/nrnivmodl_core_makefile.in              | 20 +++++++++++++------
 5 files changed, 20 insertions(+), 12 deletions(-)
 rename coreneuron/mpi/core/{nrnmpi.cpp => resolve.cpp} (100%)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 665205802..68b0f54ff 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -38,7 +38,7 @@ set(MPI_LIB_FILES "mpi/lib/mpispike.cpp" "mpi/lib/nrnmpi.cpp")
 set(MPI_CORE_FILES "mpi/core/nrnmpi_def_cinc.cpp")
 if(CORENRN_ENABLE_MPI)
   # Building these requires -ldl, which is only added if MPI is enabled.
-  list(APPEND MPI_CORE_FILES "mpi/core/nrnmpi.cpp" "mpi/core/nrnmpidec.cpp")
+  list(APPEND MPI_CORE_FILES "mpi/core/resolve.cpp" "mpi/core/nrnmpidec.cpp")
 endif()
 file(COPY ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include/Random123
      DESTINATION ${CMAKE_BINARY_DIR}/include)
@@ -108,14 +108,14 @@ if(CORENRN_ENABLE_GPU)
   set_source_files_properties(${OPENACC_EXCLUDED_FILES} PROPERTIES COMPILE_FLAGS
                                                                    "-DDISABLE_OPENACC")
   # Only compile the explicit CUDA implementation of the Hines solver in GPU builds.
-  list(APPEND CORENEURON_CODE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
+  # list(APPEND CORENEURON_CODE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
 
   # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
   # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with
   # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
   # these functions from CUDA kernels presents no issue ...
   if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
-    list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
+    # list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
   endif()
 endif()
 
@@ -287,7 +287,7 @@ if(CORENRN_ENABLE_SHARED)
       CACHE INTERNAL "coreneuron mechanism library")
 else()
   set(corenrn_mech_library
-      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech${CMAKE_SHARED_LIBRARY_SUFFIX}"
       CACHE INTERNAL "coreneuron mechanism library")
 endif()
 
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 638b52d87..1bda5aba4 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -114,7 +114,7 @@ extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int);
 extern void nrn_writes_conc(int, int);
 nrn_pragma_omp(declare target)
 nrn_pragma_acc(routine seq)
-extern void nrn_wrote_conc(int, double*, int, int, double**, double, int);
+void nrn_wrote_conc(int, double*, int, int, double**, double, int);
 nrn_pragma_omp(end declare target)
 constexpr double ktf(double celsius) {
     return 1000. * units::gasconstant * (celsius + 273.15) / units::faraday;
diff --git a/coreneuron/mpi/core/nrnmpi.cpp b/coreneuron/mpi/core/resolve.cpp
similarity index 100%
rename from coreneuron/mpi/core/nrnmpi.cpp
rename to coreneuron/mpi/core/resolve.cpp
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index c95fedcf2..54c2e9b91 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -576,7 +576,7 @@ void solve_interleaved2(int ith) {
     if (corenrn_param.gpu && corenrn_param.cuda_interface) {
         auto* d_nt = static_cast<NrnThread*>(acc_deviceptr(nt));
         auto* d_info = static_cast<InterleaveInfo*>(acc_deviceptr(interleave_info + ith));
-        solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
+        //solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
     } else {
 #endif
         int* ncycles = ii.cellsize;         // nwarp of these
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 585cf3795..9388059fe 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -38,7 +38,7 @@ MOD_OBJS_DIR = $(OUTPUT_DIR)/corenrn/build
 
 # Linked libraries gathered by CMake
 LDFLAGS = $(LINKFLAGS) @CORENRN_COMMON_LDFLAGS@
-CORENRNLIB_FLAGS = -L$(CORENRN_LIB_DIR) -lcoreneuron
+CORENRNLIB_FLAGS =
 CORENRNLIB_FLAGS += $(if @reportinglib_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@reportinglib_LIB_DIR@),)
 CORENRNLIB_FLAGS += $(if @sonatareport_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@sonatareport_LIB_DIR@),)
 CORENRNLIB_FLAGS += $(if @caliper_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@caliper_LIB_DIR@),)
@@ -114,8 +114,8 @@ ENGINEMECH_OBJ = $(MOD_OBJS_DIR)/enginemech.o
 
 # Depending on static/shared build, determine library name and it's suffix
 ifeq ($(TARGET_LIB_TYPE), STATIC)
-    LIB_SUFFIX = @CMAKE_STATIC_LIBRARY_SUFFIX@
-    corenrnmech_lib_target = coremech_lib_static
+    LIB_SUFFIX = @CMAKE_SHARED_LIBRARY_SUFFIX@
+    corenrnmech_lib_target = coremech_lib_shared
 else
     LIB_SUFFIX = @CMAKE_SHARED_LIBRARY_SUFFIX@
     corenrnmech_lib_target = coremech_lib_shared
@@ -209,7 +209,7 @@ endif
 # main target to build binary
 $(SPECIAL_EXE): coremech_lib_target
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
-	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
+	g++ -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(CORENRNLIB_FLAGS) $(LDFLAGS) \
 	  -L$(CORENRN_LIB_DIR) -lscopmath \
@@ -225,10 +225,18 @@ $(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR)
 
 # build shared library of mechanisms
 coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
+	# extract the object files from libcoreneuron.a
+	mkdir -p $(MOD_OBJS_DIR)/libcoreneuron
+	ar --output=$(MOD_OBJS_DIR)/libcoreneuron x $(CORENRN_LIB_DIR)/libcoreneuron.a
+	# extract the object files from libscopmath.a
+	mkdir -p $(MOD_OBJS_DIR)/libscopmath
+	ar --output=$(MOD_OBJS_DIR)/libscopmath x $(CORENRN_LIB_DIR)/libscopmath.a
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  $(LDFLAGS)\
-	  ${SONAME_OPTION} $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR);
+	  $(LDFLAGS) ${SONAME_OPTION} -Wl,--start-group \
+	  $(MOD_OBJS_DIR)/libcoreneuron/*.o \
+		-Wl,--end-group -Wl,--start-group $(MOD_OBJS_DIR)/libscopmath/*.o \
+		-Wl,--end-group $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR);
 
 # build static library of mechanisms
 coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always

From 11785bc72e509650b4ea82aac07957d522541fb9 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 22 Apr 2022 17:45:51 +0200
Subject: [PATCH 009/128] fudge

---
 coreneuron/gpu/nrn_acc_manager.cpp     | 3 +++
 coreneuron/gpu/nrn_acc_manager.hpp     | 2 +-
 coreneuron/utils/randoms/nrnran123.cpp | 5 +++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index ed3bf659c..8121b3b0b 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -10,6 +10,7 @@
 #include <utility>
 
 #include "coreneuron/apps/corenrn_parameters.hpp"
+#include "coreneuron/gpu/nrn_acc_manager.hpp"
 #include "coreneuron/sim/multicore.hpp"
 #include "coreneuron/network/netcon.hpp"
 #include "coreneuron/nrniv/nrniv_decl.h"
@@ -1287,6 +1288,8 @@ void init_gpu() {
         std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                   << " ranks per node\n";
     }
+
+    init_nrnran123();
 }
 
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp
index 72d222cdd..ee5ed2483 100644
--- a/coreneuron/gpu/nrn_acc_manager.hpp
+++ b/coreneuron/gpu/nrn_acc_manager.hpp
@@ -24,6 +24,6 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
 
 void update_weights_from_gpu(NrnThread* threads, int nthreads);
 void init_gpu();
-
+void init_nrnran123();
 }  // namespace coreneuron
 #endif  // _nrn_device_manager_
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 77ff88fb3..63c205f5b 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -5,6 +5,7 @@
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
+#include "coreneuron/gpu/nrn_acc_manager.hpp"
 #include "coreneuron/mpi/core/nrnmpi.hpp"
 #include "coreneuron/utils/memory.h"
 #include "coreneuron/utils/nrnmutdec.hpp"
@@ -96,6 +97,10 @@ CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_Sta
 }  // namespace
 
 namespace coreneuron {
+void init_nrnran123() {
+    nrn_pragma_acc(enter data copyin(g_k))
+}
+
 std::size_t nrnran123_instance_count() {
     return g_instance_count;
 }

From 49953ff03d135fa04be601ed892b3e04373a1afe Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 12:36:10 +0200
Subject: [PATCH 010/128] fudge

---
 coreneuron/gpu/nrn_acc_manager.cpp            |  8 +++++---
 coreneuron/io/nrn_setup.cpp                   |  3 ++-
 .../mechanism/mech/mod2c_core_thread.hpp      |  2 +-
 coreneuron/utils/memory.cpp                   |  2 +-
 coreneuron/utils/memory.h                     | 12 ++++++------
 coreneuron/utils/randoms/nrnran123.cpp        | 19 +++++++++----------
 6 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 8121b3b0b..d4db35a75 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -77,7 +77,7 @@ void cnrn_target_set_default_device(int device_num) {
 }
 
 #ifdef CORENEURON_ENABLE_GPU
-
+#ifndef CORENEURON_UNIFIED_MEMORY
 static Memb_list* copy_ml_to_device(const Memb_list* ml, int type, double* dml_data) {
     // As we never run code for artificial cell inside GPU we don't copy it.
     int is_art = corenrn.get_is_artificial()[type];
@@ -169,6 +169,7 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type, double* dml_d
 
     return d_ml;
 }
+#endif
 
 static void update_ml_on_host(const Memb_list* ml, int type) {
     int is_art = corenrn.get_is_artificial()[type];
@@ -258,6 +259,7 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
         cnrn_target_delete(ml->pdata, pcnt);
     }
     if (ml->global_variables) {
+        // std::byte* in C++17
         cnrn_target_delete(reinterpret_cast<char*>(ml->global_variables),
                            ml->global_variables_size);
     }
@@ -1121,7 +1123,7 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
 }
 
 void nrn_sparseobj_copyto_device(SparseObj* so) {
-#ifdef CORENEURON_ENABLE_GPU
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
     // FIXME this check needs to be tweaked if we ever want to run with a mix
     //       of CPU and GPU threads.
     if (nrn_threads[0].compute_gpu == 0) {
@@ -1204,7 +1206,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
 }
 
 void nrn_sparseobj_delete_from_device(SparseObj* so) {
-#ifdef CORENEURON_ENABLE_GPU
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
     // FIXME this check needs to be tweaked if we ever want to run with a mix
     //       of CPU and GPU threads.
     if (nrn_threads[0].compute_gpu == 0) {
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index 98382f9da..8f9461cb1 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -760,7 +760,8 @@ void nrn_cleanup() {
             }
 
             if (ml->global_variables) {
-                free(ml->global_variables);
+                std::cout << "Cannot generically free Memb_list::global_variables, leaking it" << std::endl;
+                // free(ml->global_variables);
                 ml->global_variables = nullptr;
             }
 
diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
index d18160f3a..4ec7b4ff6 100644
--- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp
+++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
@@ -44,7 +44,7 @@ struct Item {
 
 using List = Item; /* list of mixed items */
 
-struct SparseObj {            /* all the state information */
+struct SparseObj : public MemoryManaged {            /* all the state information */
     Elm** rowst{};            /* link to first element in row (solution order)*/
     Elm** diag{};             /* link to pivot element in row (solution order)*/
     void* elmpool{};          /* no interthread cache line sharing for elements */
diff --git a/coreneuron/utils/memory.cpp b/coreneuron/utils/memory.cpp
index 70d928b63..8f45487dc 100644
--- a/coreneuron/utils/memory.cpp
+++ b/coreneuron/utils/memory.cpp
@@ -15,7 +15,7 @@
 #include <cassert>
 
 namespace coreneuron {
-bool unified_memory_enabled() {
+bool gpu_enabled() {
 #ifdef CORENEURON_ENABLE_GPU
     return corenrn_param.gpu;
 #else
diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index 9a2e65645..254c21544 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -22,13 +22,13 @@
 #endif
 
 namespace coreneuron {
-/** @brief Check if allocate_unified will return a unified memory address.
- *
- *  If false, [de]allocate_unified simply forward to new/delete. It is
- *  convenient to include this method here to avoid having to access
- *  corenrn_param directly.
+/**
+ * @brief Check if GPU support is enabled.
+ *
+ * This returns true if GPU support was enabled at compile time and at runtime
+ * via coreneuron.gpu = True and/or --gpu, otherwise it returns false.
  */
-bool unified_memory_enabled();
+bool gpu_enabled();
 
 /** @brief Allocate unified memory in GPU builds iff GPU enabled, otherwise new
  */
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 63c205f5b..b550a460b 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -98,6 +98,7 @@ CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_Sta
 
 namespace coreneuron {
 void init_nrnran123() {
+    // TODO only do this if it isn't already present?
     nrn_pragma_acc(enter data copyin(g_k))
 }
 
@@ -160,20 +161,16 @@ double nrnran123_negexp(nrnran123_State* s) {
 
 /* at cost of a cached  value we could compute two at a time. */
 double nrnran123_normal(nrnran123_State* s) {
-    double w, x, y;
-    double u1, u2;
-
+    double w, u1;
     do {
         u1 = nrnran123_dblpick(s);
-        u2 = nrnran123_dblpick(s);
+        double u2{nrnran123_dblpick(s)};
         u1 = 2. * u1 - 1.;
         u2 = 2. * u2 - 1.;
         w = (u1 * u1) + (u2 * u2);
     } while (w > 1);
-
-    y = std::sqrt((-2. * log(w)) / w);
-    x = u1 * y;
-    return x;
+    double y{std::sqrt((-2. * std::log(w)) / w)};
+    return u1 * y;
 }
 
 double nrnran123_uint2dbl(uint32_t u) {
@@ -196,8 +193,10 @@ void nrnran123_set_globalindex(uint32_t gix) {
         }
     }
     g_k.v[0] = gix;
-    nrn_pragma_acc(update device(g_k))
-    nrn_pragma_omp(target update to(g_k))
+    if(coreneuron::gpu_enabled()) {
+        nrn_pragma_acc(update device(g_k))
+        nrn_pragma_omp(target update to(g_k))
+    }
 }
 
 /** @brief Allocate a new Random123 stream.

From edf36082ea067a00a546f57ea24e0745abbff3bc Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 16:36:24 +0200
Subject: [PATCH 011/128] scopmath and coreneuron are inside corenrnmech

---
 CMake/OpenAccHelper.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index d64227154..f1ff3a4c3 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -105,7 +105,7 @@ if(CORENRN_ENABLE_GPU)
     GLOBAL
     PROPERTY
       CORENEURON_LIB_LINK_FLAGS
-      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -lscopmath -L$(libdir) -lcoreneuron -Wl,--no-whole-archive"
+      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -Wl,--no-whole-archive"
   )
 else()
   set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS

From b79cab74dde67b6039271da7ac89578ba2ea3abf Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 16:36:40 +0200
Subject: [PATCH 012/128] fast_imem may be in unified memory

---
 coreneuron/sim/fast_imem.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp
index 1218b7967..b1665645d 100644
--- a/coreneuron/sim/fast_imem.cpp
+++ b/coreneuron/sim/fast_imem.cpp
@@ -21,9 +21,9 @@ bool nrn_use_fast_imem;
 void fast_imem_free() {
     for (auto nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) {
         if (nt->nrn_fast_imem) {
-            free(nt->nrn_fast_imem->nrn_sav_rhs);
-            free(nt->nrn_fast_imem->nrn_sav_d);
-            free(nt->nrn_fast_imem);
+            free_memory(nt->nrn_fast_imem->nrn_sav_rhs);
+            free_memory(nt->nrn_fast_imem->nrn_sav_d);
+            free_memory(nt->nrn_fast_imem);
             nt->nrn_fast_imem = nullptr;
         }
     }

From 5090beb91c97c0249c2602c5b864650769a4674e Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 16:36:52 +0200
Subject: [PATCH 013/128] cleanup

---
 coreneuron/mechanism/membfunc.hpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 1bda5aba4..a40d77438 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -132,13 +132,6 @@ inline double nrn_ghk(double v, double ci, double co, double z, double celsius)
     double const eci{ci * efun(-temp)};
     return .001 * z * units::faraday * (eci - eco);
 }
-/**
- * This signature requires the use of the `celsius` global variable, which can
- * cause problems when executing on GPU.
- */
-[[deprecated]] inline double nrn_ghk(double v, double ci, double co, double z) {
-    return nrn_ghk(v, ci, co, z, celsius);
-}
 extern void hoc_register_prop_size(int, int, int);
 extern void hoc_register_dparam_semantics(int type, int, const char* name);
 extern void hoc_reg_ba(int, mod_f_t, int);

From 5f86115ff939ff2828e58b12fa872df3abc0bbf9 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 16:37:15 +0200
Subject: [PATCH 014/128] don't cudaFree things allocated by NEURON

---
 coreneuron/io/phase2.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index bb3f7f99f..77cfaa609 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -337,7 +337,7 @@ void Phase2::read_direct(int thread_id, const NrnThread& nt) {
         offset += nrn_soa_padded_size(nodecounts[i], layout) * param_sizes[type];
         if (nodeindices_) {
             std::copy(nodeindices_, nodeindices_ + nodecounts[i], tml.nodeindices.data());
-            free_memory(nodeindices_);
+            free(nodeindices_); // not free_memory because this is allocated by NEURON?
         }
         if (corenrn.get_is_artificial()[type]) {
             assert(nodeindices_ == nullptr);

From 6acc288ec82c3c8927295e8a4fa27860e97df2d9 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 16:37:23 +0200
Subject: [PATCH 015/128] random123 hackery

---
 coreneuron/utils/memory.h              |  2 +-
 coreneuron/utils/randoms/nrnran123.cpp | 61 ++++++++++++++++++++------
 2 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index 254c21544..f1b7042c8 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -24,7 +24,7 @@
 namespace coreneuron {
 /**
  * @brief Check if GPU support is enabled.
- * 
+ *
  * This returns true if GPU support was enabled at compile time and at runtime
  * via coreneuron.gpu = True and/or --gpu, otherwise it returnss false.
  */
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index b550a460b..c815a8f24 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -21,10 +21,14 @@
 #include <unordered_map>
 #endif
 
+#include <nv/target>
+
 // Defining these attributes seems to help nvc++ in OpenMP target offload mode.
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP) && defined(__CUDACC__)
 #define CORENRN_HOST_DEVICE __host__ __device__
+#elif defined(__CUDACC__)
+#define CORENRN_HOST_DEVICE __host__ __device__
 #else
 #define CORENRN_HOST_DEVICE
 #endif
@@ -77,29 +81,45 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
  * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
  * been shut down then tools like cuda-memcheck reports errors.
  */
-nrn_pragma_omp(declare target)
-philox4x32_key_t g_k{};
-nrn_pragma_omp(end declare target)
-nrn_pragma_acc(declare create(g_k))
-
+// nrn_pragma_omp(declare target)
+// philox4x32_key_t g_k_real{};
+// nrn_pragma_omp(end declare target)
+// nrn_pragma_acc(declare create(g_k))
 OMP_Mutex g_instance_count_mutex;
-
 std::size_t g_instance_count{};
 
+// not sure quite how nvc++ handles these, not sure we actually need the 2
+// different names?
+philox4x32_key_t g_k{};
+__constant__ __device__ philox4x32_key_t g_k_dev{};
+// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
+__attribute__((noinline)) philox4x32_key_t& global_state() {
+    if target (nv::target::is_device) {
+        // printf("dev: &g_k=%p [seed %d]\n", &g_k_dev, g_k_dev.v[0]);
+        return g_k_dev;
+    } else {
+        // printf("host: &g_k=%p [seed %d]\n", &g_k, g_k.v[0]);
+        return g_k;
+    }
+}
+
 constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */
 
 /** @brief Provide a helper function in global namespace that is declared target for OpenMP
  * offloading to function correctly with NVHPC
  */
 CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_State* s) {
-    return philox4x32(s->c, g_k);
+    return philox4x32(s->c, global_state());
 }
 }  // namespace
 
 namespace coreneuron {
 void init_nrnran123() {
-    // TODO only do this if it isn't already present?
-    nrn_pragma_acc(enter data copyin(g_k))
+    // if(coreneuron::gpu_enabled()) {
+    //     // TODO only do this if it isn't already present?
+    //     auto& g_k = global_state();
+    //     nrn_pragma_acc(enter data copyin(g_k))
+    // }
 }
 
 std::size_t nrnran123_instance_count() {
@@ -108,7 +128,7 @@ std::size_t nrnran123_instance_count() {
 
 /* if one sets the global, one should reset all the stream sequences. */
 uint32_t nrnran123_get_globalindex() {
-    return g_k.v[0];
+    return global_state().v[0];
 }
 
 void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) {
@@ -182,6 +202,7 @@ double nrnran123_uint2dbl(uint32_t u) {
 /* nrn123 streams are created from cpu launcher routine */
 void nrnran123_set_globalindex(uint32_t gix) {
     // If the global seed is changing then we shouldn't have any active streams.
+    auto& g_k = global_state();
     {
         std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
         if (g_instance_count != 0 && nrnmpi_myid == 0) {
@@ -192,10 +213,22 @@ void nrnran123_set_globalindex(uint32_t gix) {
                 << g_k.v[0] << ')' << std::endl;
         }
     }
-    g_k.v[0] = gix;
-    if(coreneuron::gpu_enabled()) {
-        nrn_pragma_acc(update device(g_k))
-        nrn_pragma_omp(target update to(g_k))
+    if(g_k.v[0] != gix) {
+        g_k.v[0] = gix;
+        if(coreneuron::gpu_enabled()) {
+            {
+                auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
+                assert(code == cudaSuccess);
+            }
+            {
+                auto const code = cudaDeviceSynchronize();
+                assert(code == cudaSuccess);
+            }
+            std::cout << "trying to read g_k_dev from host..." << std::endl;
+            std::cout << g_k_dev.v[0] << std::endl;
+            //     nrn_pragma_acc(update device(g_k))
+            //     nrn_pragma_omp(target update to(g_k))
+        }
     }
 }
 

From 7ceaff6f48b89fb3ae59f04e01e88864f41cbf84 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 25 Apr 2022 21:33:31 +0200
Subject: [PATCH 016/128] homegrown present table to avoid dynamic loading +
 acc_deviceptr limitations

---
 coreneuron/gpu/nrn_acc_manager.cpp     | 64 +++++++++++++++++++++++---
 coreneuron/io/nrn_setup.cpp            |  5 +-
 coreneuron/io/phase2.cpp               |  2 +-
 coreneuron/network/partrans.cpp        | 19 +++-----
 coreneuron/permute/cellorder.cpp       |  2 +-
 coreneuron/sim/fast_imem.cpp           |  2 +-
 coreneuron/sim/multicore.hpp           |  2 +-
 coreneuron/utils/memory.h              |  2 +-
 coreneuron/utils/offload.hpp           | 24 ++++++++--
 coreneuron/utils/randoms/nrnran123.cpp |  4 +-
 10 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index d4db35a75..20ade530b 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -32,15 +32,68 @@
 #include <cuda_runtime_api.h>
 #endif
 
+#ifdef CORENEURON_ENABLE_PRESENT_TABLE
+#include <cassert>
+#include <iostream>
+#include <map>
+#include <mutex>
+namespace {
+enum class byte : unsigned char {};  // std::byte in C++17
+std::map<byte const*, std::pair<std::size_t, byte*>> present_table;
+std::mutex present_table_mutex;
+}  // namespace
+#endif
+
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
-void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
-void delete_ivoc_vect_from_device(IvocVect&);
 void nrn_ion_global_map_copyto_device();
 void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
 
+#ifdef CORENEURON_ENABLE_PRESENT_TABLE
+void* cnrn_target_deviceptr_impl(void const* h_ptr) {
+    if (!h_ptr) {
+        return nullptr;
+    }
+    // note no locking, undefined behaviour if you call this concurrently with
+    // the copyin/delete methods (which do lock)
+    assert(!present_table.empty());
+    // prev(first iterator greater than h_ptr or last if not found) gives the first iterator less
+    // than or equal to h_ptr
+    auto const iter = std::prev(std::upper_bound(
+        present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
+            return hp < entry.first;
+        }));
+    assert(iter != present_table.end());
+    byte const* const h_byte_ptr{static_cast<byte const*>(h_ptr)};
+    byte const* const h_start_of_block{iter->first};
+    std::size_t const block_size{iter->second.first};
+    byte* const d_start_of_block{iter->second.second};
+    assert(h_byte_ptr < h_start_of_block + block_size);
+    return d_start_of_block + (h_byte_ptr - h_start_of_block);
+}
+void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
+    if (!h_ptr) {
+        assert(!d_ptr);
+        return;
+    }
+    std::lock_guard<std::mutex> _{present_table_mutex};
+    auto const result = present_table.emplace(static_cast<byte const*>(h_ptr),
+                                              std::make_pair(len, static_cast<byte*>(d_ptr)));
+}
+void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
+    if (!h_ptr) {
+        return;
+    }
+    std::lock_guard<std::mutex> _{present_table_mutex};
+    auto const iter = present_table.find(static_cast<byte const*>(h_ptr));
+    assert(iter != present_table.end());
+    assert(iter->second.first == len);
+    present_table.erase(iter);
+}
+#endif
+
 int cnrn_target_get_num_devices() {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
@@ -260,8 +313,7 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
     }
     if (ml->global_variables) {
         // std::byte* in C++17
-        cnrn_target_delete(reinterpret_cast<char*>(ml->global_variables),
-                           ml->global_variables_size);
+        cnrn_target_delete(static_cast<char*>(ml->global_variables), ml->global_variables_size);
     }
 
     cnrn_target_delete(ml->nodeindices, n);
@@ -618,7 +670,7 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {
     if (n) {
         cnrn_target_delete(vec.data(), n);
     }
-    cnrn_target_delete(&vec);
+    // cnrn_target_delete(&vec);
 #else
     (void) vec;
 #endif
@@ -1329,7 +1381,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
 
 void nrn_VecPlay_delete_from_device(NrnThread* nt) {
     for (int i = 0; i < nt->n_vecplay; i++) {
-        auto* vecplay_instance = reinterpret_cast<VecPlayContinuous*>(nt->_vecplay[i]);
+        auto* vecplay_instance = static_cast<VecPlayContinuous*>(nt->_vecplay[i]);
         cnrn_target_delete(vecplay_instance->e_);
         if (vecplay_instance->discon_indices_) {
             delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_));
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index 8f9461cb1..f34a489c1 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -754,13 +754,16 @@ void nrn_cleanup() {
                 ml->_thread = nullptr;
             }
 
+            // Probably causes problems with NMODL, which allocates its instance
+            // in unified memory.
             if (ml->instance) {
                 free(ml->instance);
                 ml->instance = nullptr;
             }
 
             if (ml->global_variables) {
-                std::cout << "Cannot generically free Memb_list::global_variables, leaking it" << std::endl;
+                std::cout << "Cannot generically free Memb_list::global_variables, leaking it"
+                          << std::endl;
                 // free(ml->global_variables);
                 ml->global_variables = nullptr;
             }
diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index 77cfaa609..0b96e1956 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -337,7 +337,7 @@ void Phase2::read_direct(int thread_id, const NrnThread& nt) {
         offset += nrn_soa_padded_size(nodecounts[i], layout) * param_sizes[type];
         if (nodeindices_) {
             std::copy(nodeindices_, nodeindices_ + nodecounts[i], tml.nodeindices.data());
-            free(nodeindices_); // not free_memory because this is allocated by NEURON?
+            free(nodeindices_);  // not free_memory because this is allocated by NEURON?
         }
         if (corenrn.get_is_artificial()[type]) {
             assert(nodeindices_ == nullptr);
diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp
index ddfb49421..28fee5d86 100644
--- a/coreneuron/network/partrans.cpp
+++ b/coreneuron/network/partrans.cpp
@@ -133,12 +133,9 @@ void nrnthread_v_transfer(NrnThread* _nt) {
 void nrn_partrans::copy_gap_indices_to_device() {
     // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu.
     if (insrcdspl_) {
-        int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
-        static_cast<void>(n_insrc_buf);
-        nrn_pragma_acc(enter data create(insrc_buf_[:n_insrc_buf]))
-        // clang-format off
-        nrn_pragma_omp(target enter data map(alloc: insrc_buf_[:n_insrc_buf]))
-        // clang-format off
+        // TODO: we don't actually need to copy here, just allocate + associate
+        // storage on the device
+        cnrn_target_copyin(insrc_buf_, insrcdspl_[nrnmpi_numprocs]);
     }
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         const NrnThread* nt = nrn_threads + tid;
@@ -150,13 +147,9 @@ void nrn_partrans::copy_gap_indices_to_device() {
 
         if (!ttd.src_indices.empty()) {
             cnrn_target_copyin(ttd.src_indices.data(), ttd.src_indices.size());
-
-            size_t n_src_gather = ttd.src_gather.size();
-            const double* src_gather = ttd.src_gather.data();
-            static_cast<void>(n_src_gather);
-            static_cast<void>(src_gather);
-            nrn_pragma_acc(enter data create(src_gather[:n_src_gather]))
-            nrn_pragma_omp(target enter data map(alloc: src_gather[:n_src_gather]))
+            // TODO: we don't actually need to copy here, just allocate +
+            // associate storage on the device.
+            cnrn_target_copyin(ttd.src_gather.data(), ttd.src_gather.size());
         }
 
         if (ttd.insrc_indices.size()) {
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 54c2e9b91..2c2fca92e 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -576,7 +576,7 @@ void solve_interleaved2(int ith) {
     if (corenrn_param.gpu && corenrn_param.cuda_interface) {
         auto* d_nt = static_cast<NrnThread*>(acc_deviceptr(nt));
         auto* d_info = static_cast<InterleaveInfo*>(acc_deviceptr(interleave_info + ith));
-        //solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
+        // solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
     } else {
 #endif
         int* ncycles = ii.cellsize;         // nwarp of these
diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp
index b1665645d..d3b463a48 100644
--- a/coreneuron/sim/fast_imem.cpp
+++ b/coreneuron/sim/fast_imem.cpp
@@ -34,7 +34,7 @@ void nrn_fast_imem_alloc() {
         fast_imem_free();
         for (auto nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) {
             int n = nt->end;
-            nt->nrn_fast_imem = (NrnFastImem*) ecalloc(1, sizeof(NrnFastImem));
+            nt->nrn_fast_imem = (NrnFastImem*) ecalloc_align(1, sizeof(NrnFastImem));
             nt->nrn_fast_imem->nrn_sav_rhs = (double*) ecalloc_align(n, sizeof(double));
             nt->nrn_fast_imem->nrn_sav_d = (double*) ecalloc_align(n, sizeof(double));
         }
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index c9b3cb58e..18cd613f3 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -53,7 +53,7 @@ struct NrnFastImem {
     double* nrn_sav_d;
 };
 
-struct TrajectoryRequests {
+struct TrajectoryRequests: public MemoryManaged {
     void** vpr;       /* PlayRecord Objects known by NEURON */
     double** scatter; /* if bsize == 0, each time step */
     double** varrays; /* if bsize > 0, the Vector data pointers. */
diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index f1b7042c8..286cfa5f2 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index 078990107..1f068c4d7 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
@@ -25,9 +25,20 @@
 #include <cstddef>
 
 namespace coreneuron {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+    defined(_OPENACC)
+// Homegrown implementation for buggy NVHPC versions (<=22.3?)
+#define CORENEURON_ENABLE_PRESENT_TABLE
+void* cnrn_target_deviceptr_impl(void const* h_ptr);
+void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len);
+void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len);
+#endif
+
 template <typename T>
 T* cnrn_target_deviceptr(const T* h_ptr) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+#ifdef CORENEURON_ENABLE_PRESENT_TABLE
+    return static_cast<T*>(cnrn_target_deviceptr_impl(h_ptr));
+#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
     return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
@@ -48,7 +59,11 @@ template <typename T>
 T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
-    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
+    auto* d_ptr = static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
+#ifdef CORENEURON_ENABLE_PRESENT_TABLE
+    cnrn_target_copyin_update_present_table(h_ptr, d_ptr, len * sizeof(T));
+#endif
+    return d_ptr;
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP)
     nrn_pragma_omp(target enter data map(to : h_ptr[:len]))
@@ -63,6 +78,9 @@ template <typename T>
 void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
+#ifdef CORENEURON_ENABLE_PRESENT_TABLE
+    cnrn_target_delete_update_present_table(h_ptr, len * sizeof(T));
+#endif
     acc_delete(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP)
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index c815a8f24..6ea75a16b 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -213,9 +213,9 @@ void nrnran123_set_globalindex(uint32_t gix) {
                 << g_k.v[0] << ')' << std::endl;
         }
     }
-    if(g_k.v[0] != gix) {
+    if (g_k.v[0] != gix) {
         g_k.v[0] = gix;
-        if(coreneuron::gpu_enabled()) {
+        if (coreneuron::gpu_enabled()) {
             {
                 auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
                 assert(code == cudaSuccess);

From 8b2ffa7f16d07d047519c3a7e995a3c142c3e535 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Apr 2022 10:54:29 +0200
Subject: [PATCH 017/128] Cleanup

---
 coreneuron/gpu/nrn_acc_manager.cpp     |  5 +---
 coreneuron/gpu/nrn_acc_manager.hpp     | 13 ++++-------
 coreneuron/utils/randoms/nrnran123.cpp | 32 +++++++++++++-------------
 3 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 20ade530b..d0862b31e 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -670,9 +670,8 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {
     if (n) {
         cnrn_target_delete(vec.data(), n);
     }
-    // cnrn_target_delete(&vec);
 #else
-    (void) vec;
+    static_cast<void>(vec);
 #endif
 }
 
@@ -1342,8 +1341,6 @@ void init_gpu() {
         std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                   << " ranks per node\n";
     }
-
-    init_nrnran123();
 }
 
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp
index ee5ed2483..5a2a6f544 100644
--- a/coreneuron/gpu/nrn_acc_manager.hpp
+++ b/coreneuron/gpu/nrn_acc_manager.hpp
@@ -1,17 +1,16 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
 */
-
-#ifndef _nrn_device_manager_
-#define _nrn_device_manager_
-
-#include "coreneuron/sim/multicore.hpp"
+#pragma once
 
 namespace coreneuron {
+struct Memb_list;
+struct NrnThread;
+struct NetSendBuffer_t;
 void setup_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void delete_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void update_nrnthreads_on_host(NrnThread* threads, int nthreads);
@@ -24,6 +23,4 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
 
 void update_weights_from_gpu(NrnThread* threads, int nthreads);
 void init_gpu();
-void init_nrnran123();
 }  // namespace coreneuron
-#endif  // _nrn_device_manager_
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 6ea75a16b..f5258968e 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -21,13 +21,16 @@
 #include <unordered_map>
 #endif
 
+#ifdef __CUDACC__
 #include <nv/target>
+#endif
 
 // Defining these attributes seems to help nvc++ in OpenMP target offload mode.
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP) && defined(__CUDACC__)
 #define CORENRN_HOST_DEVICE __host__ __device__
 #elif defined(__CUDACC__)
+// This is necessary to make the new CUDA-syntax-in-.cpp version compile
 #define CORENRN_HOST_DEVICE __host__ __device__
 #else
 #define CORENRN_HOST_DEVICE
@@ -88,20 +91,24 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
 OMP_Mutex g_instance_count_mutex;
 std::size_t g_instance_count{};
 
-// not sure quite how nvc++ handles these, not sure we actually need the 2
-// different names?
 philox4x32_key_t g_k{};
+#ifdef __CUDACC__
+// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
+// but it's clearer and the overhead cannot be high (if it exists).
 __constant__ __device__ philox4x32_key_t g_k_dev{};
 // noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
 __attribute__((noinline)) philox4x32_key_t& global_state() {
     if target (nv::target::is_device) {
-        // printf("dev: &g_k=%p [seed %d]\n", &g_k_dev, g_k_dev.v[0]);
         return g_k_dev;
     } else {
-        // printf("host: &g_k=%p [seed %d]\n", &g_k, g_k.v[0]);
         return g_k;
     }
 }
+#else
+philox4x32_key_t& global_state() {
+    return g_k;
+}
+#endif
 
 constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */
 
@@ -114,14 +121,6 @@ CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_Sta
 }  // namespace
 
 namespace coreneuron {
-void init_nrnran123() {
-    // if(coreneuron::gpu_enabled()) {
-    //     // TODO only do this if it isn't already present?
-    //     auto& g_k = global_state();
-    //     nrn_pragma_acc(enter data copyin(g_k))
-    // }
-}
-
 std::size_t nrnran123_instance_count() {
     return g_instance_count;
 }
@@ -216,6 +215,7 @@ void nrnran123_set_globalindex(uint32_t gix) {
     if (g_k.v[0] != gix) {
         g_k.v[0] = gix;
         if (coreneuron::gpu_enabled()) {
+#ifdef __CUDACC__
             {
                 auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
                 assert(code == cudaSuccess);
@@ -224,10 +224,10 @@ void nrnran123_set_globalindex(uint32_t gix) {
                 auto const code = cudaDeviceSynchronize();
                 assert(code == cudaSuccess);
             }
-            std::cout << "trying to read g_k_dev from host..." << std::endl;
-            std::cout << g_k_dev.v[0] << std::endl;
-            //     nrn_pragma_acc(update device(g_k))
-            //     nrn_pragma_omp(target update to(g_k))
+#else
+            nrn_pragma_acc(update device(g_k))
+            nrn_pragma_omp(target update to(g_k))
+#endif
         }
     }
 }

From bf3a0bc802f18df2a993d61010f0bd27c7ad1a0d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 6 Jul 2022 16:56:29 +0200
Subject: [PATCH 018/128] generate some more ringtests

---
 tests/integration/CMakeLists.txt | 49 +++++++++++++++-----------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
index 4217af270..891f1443e 100644
--- a/tests/integration/CMakeLists.txt
+++ b/tests/integration/CMakeLists.txt
@@ -31,6 +31,9 @@ set(PERMUTE2_ARGS "--cell-permute 2")
 set(CUDA_INTERFACE "--cuda-interface")
 if(CORENRN_ENABLE_GPU)
   set(GPU_ARGS "--gpu")
+  set(permutation_modes 1 2)
+else()
+  set(permutation_modes 0 1)
 endif()
 
 # List of tests with arguments
@@ -39,32 +42,37 @@ set(TEST_CASES_WITH_ARGS
     "ring_binqueue!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_binqueue --binqueue"
     "ring_multisend!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_multisend --multisend"
     "ring_spike_buffer!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_spike_buffer --spikebuf 1"
-    "ring_permute1!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute1 ${PERMUTE1_ARGS}"
-    "ring_permute2!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2 ${PERMUTE2_ARGS}"
     "ring_gap!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap"
     "ring_gap_binqueue!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_binqueue --binqueue"
     "ring_gap_multisend!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_multisend --multisend"
-    "ring_gap_permute1!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute1 ${PERMUTE1_ARGS}"
-    "ring_gap_permute2!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2 ${PERMUTE2_ARGS}"
 )
-
-if(CORENRN_ENABLE_GPU)
+set(test_suffixes "" "_binqueue" "_multisend")
+foreach(cell_permute ${permutation_modes})
+  list(APPEND test_suffixes "_permute${cell_permute}")
   list(
     APPEND
     TEST_CASES_WITH_ARGS
-    "ring_permute2_cudaInterface!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}"
-    "ring_gap_permute2_cudaInterface!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}"
+    "ring_permute${cell_permute}!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute${cell_permute} --cell-permute=${cell_permute}"
+    "ring_gap_permute${cell_permute}!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute${cell_permute} --cell-permute=${cell_permute}"
   )
-endif()
+  # As reports require MPI, do not add test if report is enabled.
+  if(NOT CORENRN_ENABLE_REPORTING)
+    list(APPEND test_suffixes "_serial_permute${cell_permute}")
+    list(
+      APPEND
+      TEST_CASES_WITH_ARGS
+      "ring_serial_permute${cell_permute}!${GPU_ARGS} --cell-permute=${cell_permute} --tstop 100. --celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial_permute${cell_permute}"
+    )
+  endif()
+endforeach()
 
-# ~~~
-# As reports require MPI, do not add test if report is enabled.
-# ~~~
-if(NOT CORENRN_ENABLE_REPORTING)
+if(CORENRN_ENABLE_GPU)
+  list(APPEND test_suffixes "_permute2_cudaInterface")
   list(
     APPEND
     TEST_CASES_WITH_ARGS
-    "ring_serial!${GPU_ARGS} --tstop 100. --celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial"
+    "ring_permute2_cudaInterface!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}"
+    "ring_gap_permute2_cudaInterface!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}"
   )
 endif()
 
@@ -73,18 +81,7 @@ endif()
 # create them and copy reference spikes
 # ~~~
 foreach(data_dir "ring" "ring_gap")
-  foreach(
-    test_suffix
-    ""
-    "_serial"
-    "_multisend"
-    "_binqueue"
-    "_savestate_permute0"
-    "_savestate_permute1"
-    "_savestate_permute2"
-    "_permute1"
-    "_permute2"
-    "_permute2_cudaInterface")
+  foreach(test_suffix ${test_suffixes})
     file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/${data_dir}/out.dat.ref"
          DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/${data_dir}${test_suffix}/")
   endforeach()

From d933c04cea0f8a4422f979e66a98e2c332c4d3c4 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 12 Jul 2022 11:50:41 +0200
Subject: [PATCH 019/128] clang-format

---
 .../mechanism/mech/mod2c_core_thread.hpp      | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
index 4ec7b4ff6..e224137e0 100644
--- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp
+++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
@@ -44,16 +44,16 @@ struct Item {
 
 using List = Item; /* list of mixed items */
 
-struct SparseObj : public MemoryManaged {            /* all the state information */
-    Elm** rowst{};            /* link to first element in row (solution order)*/
-    Elm** diag{};             /* link to pivot element in row (solution order)*/
-    void* elmpool{};          /* no interthread cache line sharing for elements */
-    unsigned neqn{};          /* number of equations */
-    unsigned _cntml_padded{}; /* number of instances */
-    unsigned* varord{};       /* row and column order for pivots */
-    double* rhs{};            /* initially- right hand side        finally - answer */
-    unsigned* ngetcall{};     /* per instance counter for number of calls to _getelm */
-    int phase{};              /* 0-solution phase; 1-count phase; 2-build list phase */
+struct SparseObj: public MemoryManaged { /* all the state information */
+    Elm** rowst{};                       /* link to first element in row (solution order)*/
+    Elm** diag{};                        /* link to pivot element in row (solution order)*/
+    void* elmpool{};                     /* no interthread cache line sharing for elements */
+    unsigned neqn{};                     /* number of equations */
+    unsigned _cntml_padded{};            /* number of instances */
+    unsigned* varord{};                  /* row and column order for pivots */
+    double* rhs{};                       /* initially- right hand side        finally - answer */
+    unsigned* ngetcall{};                /* per instance counter for number of calls to _getelm */
+    int phase{};                         /* 0-solution phase; 1-count phase; 2-build list phase */
     int numop{};
     unsigned coef_list_size{};
     double** coef_list{}; /* pointer to (first instance) value in _getelm order */

From d0e7b2cbac084d09cbdc5ed85ea61a549a0830ee Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 12 Jul 2022 14:21:35 +0200
Subject: [PATCH 020/128] cmake-format

---
 coreneuron/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 68b0f54ff..ef5f577bd 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -107,8 +107,8 @@ if(CORENRN_ENABLE_GPU)
 
   set_source_files_properties(${OPENACC_EXCLUDED_FILES} PROPERTIES COMPILE_FLAGS
                                                                    "-DDISABLE_OPENACC")
-  # Only compile the explicit CUDA implementation of the Hines solver in GPU builds.
-  # list(APPEND CORENEURON_CODE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
+  # Only compile the explicit CUDA implementation of the Hines solver in GPU builds. list(APPEND
+  # CORENEURON_CODE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
 
   # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
   # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with

From 52951c7253457ce9e2117be20408a4d2fe190a13 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 12 Jul 2022 14:44:09 +0200
Subject: [PATCH 021/128] Don't delete threads from the GPU if we didn't copy
 them there. Disable the CUDA backend while it is disabled in Coreneuron

---
 tests/unit/solver/test_solver.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/unit/solver/test_solver.cpp b/tests/unit/solver/test_solver.cpp
index 6511f03e1..b797ac711 100644
--- a/tests/unit/solver/test_solver.cpp
+++ b/tests/unit/solver/test_solver.cpp
@@ -196,7 +196,9 @@ struct SetupThreads {
     }
 
     ~SetupThreads() {
-        delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
+        if (corenrn_param.gpu) {
+            delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
+        }
         for (auto& nt: *this) {
             free_memory(std::exchange(nt._data, nullptr));
             delete[] std::exchange(nt._permute, nullptr);
@@ -273,7 +275,7 @@ auto active_implementations() {
     ret.push_back(SolverImplementation::CellPermute0_GPU);
     ret.push_back(SolverImplementation::CellPermute1_GPU);
     ret.push_back(SolverImplementation::CellPermute2_GPU);
-    ret.push_back(SolverImplementation::CellPermute2_CUDA);
+    // ret.push_back(SolverImplementation::CellPermute2_CUDA);
 #endif
     return ret;
 }

From 90dd8234f1a6ea22c2347ea679ac382a211c9a6e Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 14 Jul 2022 09:01:01 +0200
Subject: [PATCH 022/128] use -gpu=nordc and make #pragma acc routine seq
 functions inline

---
 CMake/OpenAccHelper.cmake              |   2 +-
 coreneuron/mechanism/eion.cpp          |  34 -------
 coreneuron/mechanism/membfunc.hpp      |  39 ++++++--
 coreneuron/network/cvodestb.cpp        |  12 ---
 coreneuron/sim/multicore.hpp           |  11 +-
 coreneuron/utils/randoms/nrnran123.cpp | 110 +-------------------
 coreneuron/utils/randoms/nrnran123.h   | 133 +++++++++++++++++++------
 7 files changed, 145 insertions(+), 196 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index f1ff3a4c3..10c942f75 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -71,7 +71,7 @@ if(CORENRN_ENABLE_GPU)
   # https://forums.developer.nvidia.com/t/separate-compilation-of-mixed-cuda-openacc-code/192701 but
   # as discussed in https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194 this
   # is still not completely solving underlying link issue.
-  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo,nordc")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 1dbd0d2db..deab46627 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -154,40 +154,6 @@ the USEION statement of any model using this ion\n",
     }
 }
 
-// std::log isn't constexpr, but there are argument values for which nrn_nernst
-// is a constant expression
-constexpr double nrn_nernst(double ci, double co, double z, double celsius) {
-    if (z == 0) {
-        return 0.;
-    }
-    if (ci <= 0.) {
-        return 1e6;
-    } else if (co <= 0.) {
-        return -1e6;
-    } else {
-        return ktf(celsius) / z * std::log(co / ci);
-    }
-}
-
-nrn_pragma_omp(declare target)
-void nrn_wrote_conc(int type,
-                    double* p1,
-                    int p2,
-                    int it,
-                    double** gimap,
-                    double celsius,
-                    int _cntml_padded) {
-    if (it & 040) {
-        int _iml = 0;
-        /* passing _nt to this function causes cray compiler to segfault during compilation
-         * hence passing _cntml_padded
-         */
-        double* pe = p1 - p2 * _STRIDE;
-        pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
-    }
-}
-nrn_pragma_omp(end declare target)
-
 #if VECTORIZE
 #define erev   pd[0 * _STRIDE] /* From Eion */
 #define conci  pd[1 * _STRIDE]
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index a40d77438..3d7e9e239 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -112,13 +112,40 @@ extern void hoc_register_watch_check(nrn_watch_check_t, int);
 
 extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int);
 extern void nrn_writes_conc(int, int);
-nrn_pragma_omp(declare target)
-nrn_pragma_acc(routine seq)
-void nrn_wrote_conc(int, double*, int, int, double**, double, int);
-nrn_pragma_omp(end declare target)
 constexpr double ktf(double celsius) {
     return 1000. * units::gasconstant * (celsius + 273.15) / units::faraday;
 }
+// std::log isn't constexpr, but there are argument values for which nrn_nernst
+// is a constant expression
+constexpr double nrn_nernst(double ci, double co, double z, double celsius) {
+    if (z == 0) {
+        return 0.;
+    }
+    if (ci <= 0.) {
+        return 1e6;
+    } else if (co <= 0.) {
+        return -1e6;
+    } else {
+        return ktf(celsius) / z * std::log(co / ci);
+    }
+}
+constexpr void nrn_wrote_conc(int type,
+                    double* p1,
+                    int p2,
+                    int it,
+                    double** gimap,
+                    double celsius,
+                    int _cntml_padded) {
+    if (it & 040) {
+        constexpr int _iml = 0;
+        int const STRIDE{_cntml_padded + _iml};
+        /* passing _nt to this function causes cray compiler to segfault during compilation
+         * hence passing _cntml_padded
+         */
+        double* pe = p1 - p2 * STRIDE;
+        pe[0] = nrn_nernst(pe[1 * STRIDE], pe[2 * STRIDE], gimap[type][2], celsius);
+    }
+}
 inline double nrn_ghk(double v, double ci, double co, double z, double celsius) {
     auto const efun = [](double x) {
         if (std::abs(x) < 1e-4) {
@@ -195,10 +222,6 @@ extern void artcell_net_move(void**, Point_process*, double);
 extern void nrn2ncs_outputevent(int netcon_output_index, double firetime);
 extern bool nrn_use_localgid_;
 extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double);
-nrn_pragma_acc(routine seq)
-nrn_pragma_omp(declare target)
-extern int at_time(NrnThread*, double);
-nrn_pragma_omp(end declare target)
 
 // _OPENACC and/or NET_RECEIVE_BUFFERING
 extern void net_sem_from_gpu(int, int, int, int, int, double, double);
diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp
index 31c18807e..bd3de5f4c 100644
--- a/coreneuron/network/cvodestb.cpp
+++ b/coreneuron/network/cvodestb.cpp
@@ -84,16 +84,4 @@ void fixed_play_continuous(NrnThread* nt) {
     }
 }
 
-// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
-// for the ISPC backend. If changes are required, make sure to change ISPC as well.
-nrn_pragma_omp(declare target)
-int at_time(NrnThread* nt, double te) {
-    double x = te - 1e-11;
-    if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
-        return 1;
-    }
-    return 0;
-}
-nrn_pragma_omp(end declare target)
-
 }  // namespace coreneuron
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index 18cd613f3..391b5dcaa 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -192,6 +192,13 @@ extern void direct_mode_initialize();
 extern void nrn_mk_table_check(void);
 extern void nonvint(NrnThread* _nt);
 extern void update(NrnThread*);
-
-
+// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
+// for the ISPC backend. If changes are required, make sure to change ISPC as well.
+constexpr int at_time(NrnThread* nt, double te) {
+    double x = te - 1e-11;
+    if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
+        return 1;
+    }
+    return 0;
+}
 }  // namespace coreneuron
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index f5258968e..7e2538f3d 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -21,10 +21,6 @@
 #include <unordered_map>
 #endif
 
-#ifdef __CUDACC__
-#include <nv/target>
-#endif
-
 // Defining these attributes seems to help nvc++ in OpenMP target offload mode.
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP) && defined(__CUDACC__)
@@ -84,40 +80,8 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
  * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
  * been shut down then tools like cuda-memcheck reports errors.
  */
-// nrn_pragma_omp(declare target)
-// philox4x32_key_t g_k_real{};
-// nrn_pragma_omp(end declare target)
-// nrn_pragma_acc(declare create(g_k))
 OMP_Mutex g_instance_count_mutex;
 std::size_t g_instance_count{};
-
-philox4x32_key_t g_k{};
-#ifdef __CUDACC__
-// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
-// but it's clearer and the overhead cannot be high (if it exists).
-__constant__ __device__ philox4x32_key_t g_k_dev{};
-// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
-__attribute__((noinline)) philox4x32_key_t& global_state() {
-    if target (nv::target::is_device) {
-        return g_k_dev;
-    } else {
-        return g_k;
-    }
-}
-#else
-philox4x32_key_t& global_state() {
-    return g_k;
-}
-#endif
-
-constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */
-
-/** @brief Provide a helper function in global namespace that is declared target for OpenMP
- * offloading to function correctly with NVHPC
- */
-CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_State* s) {
-    return philox4x32(s->c, global_state());
-}
 }  // namespace
 
 namespace coreneuron {
@@ -127,81 +91,13 @@ std::size_t nrnran123_instance_count() {
 
 /* if one sets the global, one should reset all the stream sequences. */
 uint32_t nrnran123_get_globalindex() {
-    return global_state().v[0];
-}
-
-void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) {
-    *seq = s->c.v[0];
-    *which = s->which_;
-}
-
-void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
-    if (which > 3) {
-        s->which_ = 0;
-    } else {
-        s->which_ = which;
-    }
-    s->c.v[0] = seq;
-    s->r = philox4x32_helper(s);
-}
-
-void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) {
-    *id1 = s->c.v[2];
-    *id2 = s->c.v[3];
-}
-
-void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) {
-    *id3 = s->c.v[1];
-    *id1 = s->c.v[2];
-    *id2 = s->c.v[3];
-}
-
-uint32_t nrnran123_ipick(nrnran123_State* s) {
-    uint32_t rval;
-    char which = s->which_;
-    rval = s->r.v[int{which++}];
-    if (which > 3) {
-        which = 0;
-        s->c.v[0]++;
-        s->r = philox4x32_helper(s);
-    }
-    s->which_ = which;
-    return rval;
-}
-
-double nrnran123_dblpick(nrnran123_State* s) {
-    return nrnran123_uint2dbl(nrnran123_ipick(s));
-}
-
-double nrnran123_negexp(nrnran123_State* s) {
-    /* min 2.3283064e-10 to max 22.18071 */
-    return -std::log(nrnran123_dblpick(s));
-}
-
-/* at cost of a cached  value we could compute two at a time. */
-double nrnran123_normal(nrnran123_State* s) {
-    double w, u1;
-    do {
-        u1 = nrnran123_dblpick(s);
-        double u2{nrnran123_dblpick(s)};
-        u1 = 2. * u1 - 1.;
-        u2 = 2. * u2 - 1.;
-        w = (u1 * u1) + (u2 * u2);
-    } while (w > 1);
-    double y{std::sqrt((-2. * std::log(w)) / w)};
-    return u1 * y;
-}
-
-double nrnran123_uint2dbl(uint32_t u) {
-    /* 0 to 2^32-1 transforms to double value in open (0,1) interval */
-    /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */
-    return ((double) u + 1.0) * SHIFT32;
+    return random123::detail::global_state().v[0];
 }
 
 /* nrn123 streams are created from cpu launcher routine */
 void nrnran123_set_globalindex(uint32_t gix) {
     // If the global seed is changing then we shouldn't have any active streams.
-    auto& g_k = global_state();
+    auto& g_k = random123::detail::global_state();
     {
         std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
         if (g_instance_count != 0 && nrnmpi_myid == 0) {
@@ -217,7 +113,7 @@ void nrnran123_set_globalindex(uint32_t gix) {
         if (coreneuron::gpu_enabled()) {
 #ifdef __CUDACC__
             {
-                auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
+                auto const code = cudaMemcpyToSymbol(random123::detail::g_k_dev, &g_k, sizeof(g_k));
                 assert(code == cudaSuccess);
             }
             {
diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index ccd3fa5db..8290749c7 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -39,6 +39,10 @@ of the full distribution available from
 #include <Random123/philox.h>
 #include <inttypes.h>
 
+#ifdef __CUDACC__
+#include <nv/target>
+#endif
+
 // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU
 // support at all. In these two cases, request that the random123 state is
 // allocated using new/delete instead of CUDA unified memory.
@@ -86,38 +90,103 @@ void nrnran123_deletestream(nrnran123_State* s,
                             bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY);
 
 /* minimal data stream */
-nrn_pragma_omp(declare target)
-nrn_pragma_acc(routine seq)
-void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which);
-nrn_pragma_acc(routine seq)
-void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2);
-nrn_pragma_acc(routine seq)
-void nrnran123_getids3(nrnran123_State*, uint32_t* id1, uint32_t* id2, uint32_t* id3);
-nrn_pragma_acc(routine seq)
-uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */
-
-/* this could be called from openacc parallel construct */
-nrn_pragma_acc(routine seq)
-double nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/
-/* nrnran123_dblpick minimum value is 2.3283064e-10 and max value is 1-min */
+constexpr void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) {
+    *seq = s->c.v[0];
+    *which = s->which_;
+}
+constexpr void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) {
+    *id1 = s->c.v[2];
+    *id2 = s->c.v[3];
+}
+constexpr void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) {
+    *id3 = s->c.v[1];
+    *id1 = s->c.v[2];
+    *id2 = s->c.v[3];
+}
+
+namespace random123::detail {
+inline philox4x32_key_t g_k{};
+#ifdef __CUDACC__
+// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
+// but it's clearer and the overhead cannot be high (if it exists).
+__constant__ __device__ inline philox4x32_key_t g_k_dev{};
+// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
+__attribute__((noinline)) inline philox4x32_key_t& global_state() {
+    if target (nv::target::is_device) {
+        return g_k_dev;
+    } else {
+        return g_k;
+    }
+}
+#else
+inline philox4x32_key_t& global_state() {
+    return g_k;
+}
+#endif
+
+/** @brief Provide a helper function in global namespace that is declared target for OpenMP
+ * offloading to function correctly with NVHPC
+ */
+inline philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_State* s) {
+    return philox4x32(s->c, global_state());
+}
+}  // namespace random123::detail
+
+// Uniform 0 to 2*32-1
+inline uint32_t nrnran123_ipick(nrnran123_State* s) {
+    uint32_t rval;
+    char which = s->which_;
+    rval = s->r.v[int{which++}];
+    if (which > 3) {
+        which = 0;
+        s->c.v[0]++;
+        s->r = random123::detail::philox4x32_helper(s);
+    }
+    s->which_ = which;
+    return rval;
+}
+
+constexpr double nrnran123_uint2dbl(uint32_t u) {
+    constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */
+    /* 0 to 2^32-1 transforms to double value in open (0,1) interval */
+    /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */
+    return (static_cast<double>(u) + 1.0) * SHIFT32;
+}
+
+// Uniform open interval (0,1), minimum value is 2.3283064e-10 and max value is 1-min
+inline double nrnran123_dblpick(nrnran123_State* s) {
+    return nrnran123_uint2dbl(nrnran123_ipick(s));
+}
 
 /* this could be called from openacc parallel construct (in INITIAL block) */
-nrn_pragma_acc(routine seq)
-void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which);
-nrn_pragma_acc(routine seq)
-double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */
-/* nrnran123_negexp min value is 2.3283064e-10, max is 22.18071 */
-
-/* missing declaration in coreneuron */
-nrn_pragma_acc(routine seq)
-double nrnran123_normal(nrnran123_State*);
-nrn_pragma_acc(routine seq)
-double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */
-
-/* more fundamental (stateless) (though the global index is still used) */
-nrn_pragma_acc(routine seq)
-nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2);
-nrn_pragma_acc(routine seq)
-double nrnran123_uint2dbl(uint32_t);
-nrn_pragma_omp(end declare target)
+inline void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
+    if (which > 3) {
+        s->which_ = 0;
+    } else {
+        s->which_ = which;
+    }
+    s->c.v[0] = seq;
+    s->r = random123::detail::philox4x32_helper(s);
+}
+
+// nrnran123_negexp min value is 2.3283064e-10, max is 22.18071, mean 1.0
+inline double nrnran123_negexp(nrnran123_State* s) {
+    return -std::log(nrnran123_dblpick(s));
+}
+
+/* at cost of a cached  value we could compute two at a time. */
+inline double nrnran123_normal(nrnran123_State* s) {
+    double w, u1;
+    do {
+        u1 = nrnran123_dblpick(s);
+        double u2{nrnran123_dblpick(s)};
+        u1 = 2. * u1 - 1.;
+        u2 = 2. * u2 - 1.;
+        w = (u1 * u1) + (u2 * u2);
+    } while (w > 1);
+    double y{std::sqrt((-2. * std::log(w)) / w)};
+    return u1 * y;
+}
+
+// nrnran123_gauss, nrnran123_iran were declared but not defined in CoreNEURON
 }  // namespace coreneuron

From 6cca56f19ab9f4ab1c78690f56de862aad3a25e4 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 14 Jul 2022 09:01:28 +0200
Subject: [PATCH 023/128] fix handling of empty suffix

---
 tests/integration/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
index 891f1443e..fa488028e 100644
--- a/tests/integration/CMakeLists.txt
+++ b/tests/integration/CMakeLists.txt
@@ -81,7 +81,12 @@ endif()
 # create them and copy reference spikes
 # ~~~
 foreach(data_dir "ring" "ring_gap")
-  foreach(test_suffix ${test_suffixes})
+  # Naïve foreach(test_suffix ${test_suffixes}) does not seem to handle empty suffixes correctly.
+  list(LENGTH test_suffixes num_suffixes)
+  math(EXPR num_suffixes_m1 "${num_suffixes} - 1")
+  foreach(suffix_index RANGE 0 ${num_suffixes_m1})
+    list(GET test_suffixes ${suffix_index} test_suffix)
+    message(STATUS "test_suffix=${test_suffix}")
     file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/${data_dir}/out.dat.ref"
          DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/${data_dir}${test_suffix}/")
   endforeach()

From 258ef9425a7f4f1a3b6d1e6dccd0a95237deed11 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 10:25:26 +0200
Subject: [PATCH 024/128] revert hack to compile/link main() with g++

---
 extra/nrnivmodl_core_makefile.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 9388059fe..7f233b9aa 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -209,7 +209,7 @@ endif
 # main target to build binary
 $(SPECIAL_EXE): coremech_lib_target
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
-	g++ -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
+	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(CORENRNLIB_FLAGS) $(LDFLAGS) \
 	  -L$(CORENRN_LIB_DIR) -lscopmath \

From 708864c3b8cb5c286cc67778787aefd59a10cf4a Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 10:50:01 +0200
Subject: [PATCH 025/128] revert to -gpu=rdc and put the random123 global state
 back in .cpp because inline device variables don't seem to work

---
 CMake/OpenAccHelper.cmake              |  2 +-
 coreneuron/utils/randoms/nrnran123.cpp | 29 +++++++++++--
 coreneuron/utils/randoms/nrnran123.h   | 56 +++++++++-----------------
 3 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 10c942f75..bc21e051f 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -71,7 +71,7 @@ if(CORENRN_ENABLE_GPU)
   # https://forums.developer.nvidia.com/t/separate-compilation-of-mixed-cuda-openacc-code/192701 but
   # as discussed in https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194 this
   # is still not completely solving underlying link issue.
-  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo,nordc")
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo,rdc")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 7e2538f3d..b23c01485 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -82,8 +82,31 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
  */
 OMP_Mutex g_instance_count_mutex;
 std::size_t g_instance_count{};
+
+philox4x32_key_t g_k{};
+#ifdef __CUDACC__
+// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
+// but it's clearer and the overhead cannot be high (if it exists).
+__constant__ __device__ philox4x32_key_t g_k_dev{};
+// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
+__attribute__((noinline)) philox4x32_key_t& global_state() {
+    if target (nv::target::is_device) {
+        return g_k_dev;
+    } else {
+        return g_k;
+    }
+}
+#else
+philox4x32_key_t& global_state() {
+    return g_k;
+}
+#endif
 }  // namespace
 
+philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
+    return philox4x32(s->c, global_state());
+}
+
 namespace coreneuron {
 std::size_t nrnran123_instance_count() {
     return g_instance_count;
@@ -91,13 +114,13 @@ std::size_t nrnran123_instance_count() {
 
 /* if one sets the global, one should reset all the stream sequences. */
 uint32_t nrnran123_get_globalindex() {
-    return random123::detail::global_state().v[0];
+    return global_state().v[0];
 }
 
 /* nrn123 streams are created from cpu launcher routine */
 void nrnran123_set_globalindex(uint32_t gix) {
     // If the global seed is changing then we shouldn't have any active streams.
-    auto& g_k = random123::detail::global_state();
+    auto& g_k = global_state();
     {
         std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
         if (g_instance_count != 0 && nrnmpi_myid == 0) {
@@ -113,7 +136,7 @@ void nrnran123_set_globalindex(uint32_t gix) {
         if (coreneuron::gpu_enabled()) {
 #ifdef __CUDACC__
             {
-                auto const code = cudaMemcpyToSymbol(random123::detail::g_k_dev, &g_k, sizeof(g_k));
+                auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
                 assert(code == cudaSuccess);
             }
             {
diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index 8290749c7..c68cc9017 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -60,9 +60,17 @@ struct nrnran123_State {
     char which_;
 };
 
-struct nrnran123_array4x32 {
-    uint32_t v[4];
-};
+}
+
+/** @brief Provide a helper function in global namespace that is declared target for OpenMP
+ * offloading to function correctly with NVHPC
+ */
+nrn_pragma_acc(routine seq)
+nrn_pragma_omp(declare target)
+philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s);
+nrn_pragma_omp(end declare target)
+
+namespace coreneuron {
 
 /* global index. eg. run number */
 /* all generator instances share this global index */
@@ -104,43 +112,14 @@ constexpr void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id
     *id2 = s->c.v[3];
 }
 
-namespace random123::detail {
-inline philox4x32_key_t g_k{};
-#ifdef __CUDACC__
-// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
-// but it's clearer and the overhead cannot be high (if it exists).
-__constant__ __device__ inline philox4x32_key_t g_k_dev{};
-// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
-__attribute__((noinline)) inline philox4x32_key_t& global_state() {
-    if target (nv::target::is_device) {
-        return g_k_dev;
-    } else {
-        return g_k;
-    }
-}
-#else
-inline philox4x32_key_t& global_state() {
-    return g_k;
-}
-#endif
-
-/** @brief Provide a helper function in global namespace that is declared target for OpenMP
- * offloading to function correctly with NVHPC
- */
-inline philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_State* s) {
-    return philox4x32(s->c, global_state());
-}
-}  // namespace random123::detail
-
 // Uniform 0 to 2*32-1
-inline uint32_t nrnran123_ipick(nrnran123_State* s) {
-    uint32_t rval;
+constexpr uint32_t nrnran123_ipick(nrnran123_State* s) {
     char which = s->which_;
-    rval = s->r.v[int{which++}];
+    uint32_t rval{s->r.v[int{which++}]};
     if (which > 3) {
         which = 0;
         s->c.v[0]++;
-        s->r = random123::detail::philox4x32_helper(s);
+        s->r = coreneuron_random123_philox4x32_helper(s);
     }
     s->which_ = which;
     return rval;
@@ -154,19 +133,19 @@ constexpr double nrnran123_uint2dbl(uint32_t u) {
 }
 
 // Uniform open interval (0,1), minimum value is 2.3283064e-10 and max value is 1-min
-inline double nrnran123_dblpick(nrnran123_State* s) {
+constexpr double nrnran123_dblpick(nrnran123_State* s) {
     return nrnran123_uint2dbl(nrnran123_ipick(s));
 }
 
 /* this could be called from openacc parallel construct (in INITIAL block) */
-inline void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
+constexpr void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
     if (which > 3) {
         s->which_ = 0;
     } else {
         s->which_ = which;
     }
     s->c.v[0] = seq;
-    s->r = random123::detail::philox4x32_helper(s);
+    s->r = coreneuron_random123_philox4x32_helper(s);
 }
 
 // nrnran123_negexp min value is 2.3283064e-10, max is 22.18071, mean 1.0
@@ -189,4 +168,5 @@ inline double nrnran123_normal(nrnran123_State* s) {
 }
 
 // nrnran123_gauss, nrnran123_iran were declared but not defined in CoreNEURON
+// nrnran123_array4x32 was declared but not used in CoreNEURON
 }  // namespace coreneuron

From 6a74cf940482e1232b6e9de99272b87df8cd323d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 16:03:33 +0200
Subject: [PATCH 026/128] promise we will never try and allocate from the
 device

---
 coreneuron/sim/scopmath/sparse_thread.hpp | 32 ++++++++++++++++++-----
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/coreneuron/sim/scopmath/sparse_thread.hpp b/coreneuron/sim/scopmath/sparse_thread.hpp
index 6614a0a1b..fc2cc89c7 100644
--- a/coreneuron/sim/scopmath/sparse_thread.hpp
+++ b/coreneuron/sim/scopmath/sparse_thread.hpp
@@ -13,6 +13,10 @@
 #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
 #include "coreneuron/sim/scopmath/errcodes.h"
 
+#ifdef __CUDACC__
+#include <nv/target>
+#endif
+
 namespace coreneuron {
 namespace scopmath {
 namespace sparse {
@@ -90,10 +94,17 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
         }
         /* insert below el */
         if (!new_elem) {
-            new_elem = new Elm{};
-            // Using array-new here causes problems in GPU compilation.
-            new_elem->value = static_cast<double*>(std::malloc(so->_cntml_padded * sizeof(double)));
-            increase_order(so, row);
+#ifdef __CUDACC__
+            if target (nv::target::is_device) {
+                assert(false);
+            } else
+#endif
+            {
+                new_elem = new Elm{};
+                // Using array-new here causes problems in GPU compilation.
+                new_elem->value = static_cast<double*>(std::malloc(so->_cntml_padded * sizeof(double)));
+                increase_order(so, row);
+            }
         }
         new_elem->r_down = el->r_down;
         el->r_down = new_elem;
@@ -133,9 +144,16 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
         }
         /* insert above el */
         if (!new_elem) {
-            new_elem = new Elm{};
-            new_elem->value = static_cast<double*>(std::malloc(so->_cntml_padded * sizeof(double)));
-            increase_order(so, row);
+#ifdef __CUDACC__
+            if target (nv::target::is_device) {
+                assert(false);
+            } else
+#endif
+            {
+                new_elem = new Elm{};
+                new_elem->value = static_cast<double*>(std::malloc(so->_cntml_padded * sizeof(double)));
+                increase_order(so, row);
+            }
         }
         new_elem->r_up = el->r_up;
         el->r_up = new_elem;

From 5dabb3663f350b2c866599425de272bff49613cf Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 16:03:54 +0200
Subject: [PATCH 027/128] drop -lscopmath as it's folded in elsewhere

---
 CMake/OpenAccHelper.cmake        | 2 +-
 extra/nrnivmodl_core_makefile.in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index bc21e051f..2d84efb70 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -109,7 +109,7 @@ if(CORENRN_ENABLE_GPU)
   )
 else()
   set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS
-                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -lscopmath")
+                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech")
 endif(CORENRN_ENABLE_GPU)
 
 if(CORENRN_HAVE_NVHPC_COMPILER)
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 7f233b9aa..c26066e4c 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -212,7 +212,7 @@ $(SPECIAL_EXE): coremech_lib_target
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(CORENRNLIB_FLAGS) $(LDFLAGS) \
-	  -L$(CORENRN_LIB_DIR) -lscopmath \
+	  -L$(CORENRN_LIB_DIR) \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 
 coremech_lib_target: $(corenrnmech_lib_target)

From 532b8adfb062b92287e6520e1390828757f9bb3f Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 16:04:16 +0200
Subject: [PATCH 028/128] random123 header reorganisation

---
 coreneuron/utils/randoms/nrnran123.cpp | 14 +++++++++-----
 coreneuron/utils/randoms/nrnran123.h   |  4 ----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index b23c01485..f2dd2dee2 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -11,16 +11,20 @@
 #include "coreneuron/utils/nrnmutdec.hpp"
 #include "coreneuron/utils/randoms/nrnran123.h"
 
-#include <cmath>
-#include <iostream>
-#include <memory>
-#include <mutex>
-
 #ifdef CORENEURON_USE_BOOST_POOL
 #include <boost/pool/pool_alloc.hpp>
 #include <unordered_map>
 #endif
 
+#ifdef __CUDACC__
+#include <nv/target>
+#endif
+
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <mutex>
+
 // Defining these attributes seems to help nvc++ in OpenMP target offload mode.
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP) && defined(__CUDACC__)
diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index c68cc9017..6c8e97cf9 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -39,10 +39,6 @@ of the full distribution available from
 #include <Random123/philox.h>
 #include <inttypes.h>
 
-#ifdef __CUDACC__
-#include <nv/target>
-#endif
-
 // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU
 // support at all. In these two cases, request that the random123 state is
 // allocated using new/delete instead of CUDA unified memory.

From e193d3113b46742ece4481d6311f145d772bad37 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 16:27:44 +0200
Subject: [PATCH 029/128] revert a different allocation workaround.

---
 coreneuron/sim/scopmath/sparse_thread.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/coreneuron/sim/scopmath/sparse_thread.hpp b/coreneuron/sim/scopmath/sparse_thread.hpp
index fc2cc89c7..85580011e 100644
--- a/coreneuron/sim/scopmath/sparse_thread.hpp
+++ b/coreneuron/sim/scopmath/sparse_thread.hpp
@@ -101,8 +101,7 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
 #endif
             {
                 new_elem = new Elm{};
-                // Using array-new here causes problems in GPU compilation.
-                new_elem->value = static_cast<double*>(std::malloc(so->_cntml_padded * sizeof(double)));
+                new_elem->value = new double[so->_cntml_padded];
                 increase_order(so, row);
             }
         }
@@ -151,7 +150,7 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
 #endif
             {
                 new_elem = new Elm{};
-                new_elem->value = static_cast<double*>(std::malloc(so->_cntml_padded * sizeof(double)));
+                new_elem->value = new double[so->_cntml_padded];
                 increase_order(so, row);
             }
         }

From 986e82e5f814c1ce2094a24cfcbecd8f928c1fd3 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 15 Jul 2022 16:27:56 +0200
Subject: [PATCH 030/128] clang-format

---
 coreneuron/mechanism/membfunc.hpp    | 12 ++++++------
 coreneuron/utils/randoms/nrnran123.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 3d7e9e239..8fe04a06c 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -130,12 +130,12 @@ constexpr double nrn_nernst(double ci, double co, double z, double celsius) {
     }
 }
 constexpr void nrn_wrote_conc(int type,
-                    double* p1,
-                    int p2,
-                    int it,
-                    double** gimap,
-                    double celsius,
-                    int _cntml_padded) {
+                              double* p1,
+                              int p2,
+                              int it,
+                              double** gimap,
+                              double celsius,
+                              int _cntml_padded) {
     if (it & 040) {
         constexpr int _iml = 0;
         int const STRIDE{_cntml_padded + _iml};
diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index 6c8e97cf9..e75ec3f69 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -56,7 +56,7 @@ struct nrnran123_State {
     char which_;
 };
 
-}
+}  // namespace coreneuron
 
 /** @brief Provide a helper function in global namespace that is declared target for OpenMP
  * offloading to function correctly with NVHPC

From f03cdfde5f9d4b857b2b627b5b9df9b1fe001707 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 10:34:29 +0200
Subject: [PATCH 031/128] build system tmpcommit

---
 CMake/OpenAccHelper.cmake                   |   5 +-
 CMakeLists.txt                              |   8 +-
 coreneuron/CMakeLists.txt                   | 128 ++++++++++++--------
 coreneuron/permute/cellorder.cpp            |   2 +-
 extra/nrnivmodl_core_makefile.in            |  25 ++--
 tests/unit/cmdline_interface/CMakeLists.txt |   7 +-
 tests/unit/interleave_info/CMakeLists.txt   |   9 +-
 tests/unit/lfp/CMakeLists.txt               |  14 +--
 tests/unit/queueing/CMakeLists.txt          |   6 +-
 tests/unit/solver/CMakeLists.txt            |   6 +-
 tests/unit/solver/test_solver.cpp           |   2 +-
 11 files changed, 112 insertions(+), 100 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 2d84efb70..5be6af61b 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -100,16 +100,17 @@ endif()
 # =============================================================================
 # Set global property that will be used by NEURON to link with CoreNEURON
 # =============================================================================
+# TODO this should be derived from what we use internally to link special-core?
 if(CORENRN_ENABLE_GPU)
   set_property(
     GLOBAL
     PROPERTY
       CORENEURON_LIB_LINK_FLAGS
-      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -Wl,--no-whole-archive"
+      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcoreneuron -lcoreneuron-cuda -Wl,--no-whole-archive"
   )
 else()
   set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS
-                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech")
+                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcoreneuron")
 endif(CORENRN_ENABLE_GPU)
 
 if(CORENRN_HAVE_NVHPC_COMPILER)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4366cfee1..7ef147a41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -303,10 +303,6 @@ if(CORENRN_HAVE_NVHPC_COMPILER)
   endif()
 endif()
 
-if(CORENRN_ENABLE_GPU)
-  set(CORENRN_ENABLE_SHARED OFF)
-endif()
-
 if(CORENRN_ENABLE_SHARED)
   set(COMPILE_LIBRARY_TYPE "SHARED")
 else()
@@ -489,7 +485,7 @@ endif()
 add_subdirectory(coreneuron)
 
 if(CORENRN_ENABLE_GPU)
-  get_target_property(CORENRN_LINK_LIBRARIES coreneuron INTERFACE_LINK_LIBRARIES)
+  get_target_property(CORENRN_LINK_LIBRARIES coreneuron-core INTERFACE_LINK_LIBRARIES)
   if(CORENRN_LINK_LIBRARIES)
     foreach(LIB ${CORENRN_LINK_LIBRARIES})
       get_filename_component(dir_path ${LIB} DIRECTORY)
@@ -499,7 +495,7 @@ if(CORENRN_ENABLE_GPU)
         # https://github.com/BlueBrain/CoreNeuron/blob/856cea4aa647c8f2b0d5bda6d0fc32144c5942e3/CMakeLists.txt#L411-L412
         message(
           NOTICE
-          "Ignoring dependency '${LIB}' of 'coreneuron' and assuming relevant flags have already been added to CORENEURON_LIB_LINK_FLAGS."
+          "Ignoring dependency '${LIB}' of 'coreneuron-core' and assuming relevant flags have already been added to CORENEURON_LIB_LINK_FLAGS."
         )
       elseif(NOT dir_path)
         # In case LIB is not a target but is just the name of a library, e.g. "dl"
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index ef5f577bd..28411fd53 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -16,8 +16,6 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 # =============================================================================
 # gather various source files
 # =============================================================================
-file(GLOB_RECURSE CORENEURON_HEADER_FILES "*.h*")
-file(GLOB_RECURSE CORENEURON_TEMPLATE_FILES "*.ipp")
 file(
   GLOB
   CORENEURON_CODE_FILES
@@ -27,18 +25,19 @@ file(
   "io/*.cpp"
   "io/reports/*.cpp"
   "mechanism/*.cpp"
+  "mpi/core/nrnmpi_def_cinc.cpp"
   "network/*.cpp"
   "permute/*.cpp"
   "sim/*.cpp"
+  "sim/scopmath/abort.cpp"
+  "sim/scopmath/newton_thread.cpp"
   "utils/*.cpp"
   "utils/*/*.c"
   "utils/*/*.cpp")
-set(SCOPMATH_CODE_FILES "sim/scopmath/abort.cpp" "sim/scopmath/newton_thread.cpp")
 set(MPI_LIB_FILES "mpi/lib/mpispike.cpp" "mpi/lib/nrnmpi.cpp")
-set(MPI_CORE_FILES "mpi/core/nrnmpi_def_cinc.cpp")
 if(CORENRN_ENABLE_MPI)
   # Building these requires -ldl, which is only added if MPI is enabled.
-  list(APPEND MPI_CORE_FILES "mpi/core/resolve.cpp" "mpi/core/nrnmpidec.cpp")
+  list(APPEND CORENEURON_CODE_FILES "mpi/core/resolve.cpp" "mpi/core/nrnmpidec.cpp")
 endif()
 file(COPY ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include/Random123
      DESTINATION ${CMAKE_BINARY_DIR}/include)
@@ -107,15 +106,20 @@ if(CORENRN_ENABLE_GPU)
 
   set_source_files_properties(${OPENACC_EXCLUDED_FILES} PROPERTIES COMPILE_FLAGS
                                                                    "-DDISABLE_OPENACC")
-  # Only compile the explicit CUDA implementation of the Hines solver in GPU builds. list(APPEND
-  # CORENEURON_CODE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
+  # Only compile the explicit CUDA implementation of the Hines solver in GPU
+  # builds. Because of
+  # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
+  # this cannot be included in the same shared library as the rest of the
+  # OpenACC code.
+  set(CORENEURON_CUDA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
 
   # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
   # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with
   # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
   # these functions from CUDA kernels presents no issue ...
+  # TODO is it going to work to call these from libcoreneuron-cuda.so? probably not...
   if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
-    # list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
+    list(APPEND CORENEURON_CUDA_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
   endif()
 endif()
 
@@ -143,23 +147,42 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
   set(CORENRN_MPI_OBJ $<TARGET_OBJECTS:${CORENRN_MPI_LIB_NAME}>)
 endif()
 
-# main coreneuron library
+# Library containing the bulk of the non-mechanism CoreNEURON code. This is
+# always created and installed as a static library, and then the nrnivmodl-core
+# workflow extracts the object files from it and does one of the following:
+# ~~~
+# - shared build: creates libcoreneuron.so from these objects plus those from
+#   the translated MOD files
+# - static build: creates a (temporary) libcoreneuron.a from these objects plus
+#   those from the translated MOD files, then statically links that into
+#   special-core (nrniv-core)
+# ~~~
+# This scheme means that both core and mechanism .o files are linked in a single
+# step, which is important for GPU linking. It does, however, mean that in a
+# shared library CPU build then the core code is installed twice, once in
+# libcoreneuron-core.a and once in the libcoreneuron.so that contains the
+# default mechanisms for the installed nrniv-core binary. In a GPU build,
+# libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of
+# the Hines solver.
 add_library(
-  coreneuron
-  ${COMPILE_LIBRARY_TYPE}
-  ${CORENEURON_HEADER_FILES}
-  ${CORENEURON_TEMPLATE_FILES}
+  coreneuron-core
+  STATIC
   ${CORENEURON_CODE_FILES}
-  ${cudacorenrn_objs}
-  ${NMODL_INBUILT_MOD_OUTPUTS}
-  ${MPI_CORE_FILES}
   ${CORENRN_MPI_OBJ})
 
-target_include_directories(coreneuron PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}
-                                              ${CORENEURON_PROJECT_BINARY_DIR}/generated)
+if(CORENRN_ENABLE_GPU)
+  set(coreneuron_cuda_target coreneuron-cuda)
+  add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES})
+endif()
+
+foreach(target coreneuron-core ${coreneuron_cuda_target})
+  target_include_directories(${target} PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}
+                                               ${CORENEURON_PROJECT_BINARY_DIR}/generated)
+endforeach()
+
 # we can link to MPI libraries in non-dynamic-mpi build
 if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
-  target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES})
+  target_link_libraries(coreneuron-core ${MPI_CXX_LIBRARIES})
 endif()
 
 # this is where we handle dynamic mpi library build
@@ -168,7 +191,7 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
   # main coreneuron library needs to be linked to libdl.so and
   # and should be aware of shared library suffix on different platforms.
   # ~~~
-  target_link_libraries(coreneuron ${CMAKE_DL_LIBS})
+  target_link_libraries(coreneuron-core ${CMAKE_DL_LIBS})
 
   # store mpi library targets that will be built
   list(APPEND corenrn_mpi_targets "")
@@ -215,9 +238,9 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
       # when we will test coreneuron on windows.
       # ~~~
       if(MINGW) # type msmpi only
-        add_dependencies(core${libname}_lib coreneuron)
+        add_dependencies(core${libname}_lib coreneuron-core)
         target_link_libraries(core${libname}_lib ${MPI_C_LIBRARIES})
-        target_link_libraries(core${libname}_lib coreneuron)
+        target_link_libraries(core${libname}_lib coreneuron-core)
       endif()
       set_property(TARGET core${libname}_lib PROPERTY OUTPUT_NAME core${libname})
       list(APPEND corenrn_mpi_targets "core${libname}_lib")
@@ -234,28 +257,26 @@ endif()
 
 # Prevent CMake from running a device code link step when assembling libcoreneuron.a in GPU builds.
 # The device code linking needs to be deferred to the final step, where it is done by `nvc++ -cuda`.
-set_target_properties(coreneuron PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-# Suppress some compiler warnings. Note in GPU builds this library includes CUDA files.
-target_compile_options(coreneuron
+#set_target_properties(coreneuron-core PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF CUDA_SEPARABLE_COMPILATION ON)
+#if(TARGET coreneuron-cuda)
+  # set_target_properties(coreneuron-cuda PROPERTIES )
+#endif()
+# Suppress some compiler warnings. TODO no it doesn't: Note in GPU builds this library includes CUDA files.
+target_compile_options(coreneuron-core
                        PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENEURON_CXX_WARNING_SUPPRESSIONS}>)
-add_dependencies(coreneuron nrnivmodl-core)
+add_dependencies(coreneuron-core nrnivmodl-core) # TODO why?
 
-# scopmath is created separately for nrnivmodl-core workflow
-add_library(scopmath ${COMPILE_LIBRARY_TYPE} ${CORENEURON_HEADER_FILES} ${SCOPMATH_CODE_FILES})
-target_include_directories(scopmath PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}
-                                            ${CORENEURON_PROJECT_BINARY_DIR}/generated)
-
-target_link_libraries(coreneuron ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} ${CALIPER_LIB}
+target_link_libraries(coreneuron-core ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} ${CALIPER_LIB}
                       ${likwid_LIBRARIES})
 
-target_include_directories(coreneuron SYSTEM
+target_include_directories(coreneuron-core SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
-target_include_directories(coreneuron SYSTEM
+target_include_directories(coreneuron-core SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
 
 # See: https://en.cppreference.com/w/cpp/filesystem#Notes
 if(CMAKE_CXX_COMPILER_IS_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1)
-  target_link_libraries(coreneuron stdc++fs)
+  target_link_libraries(coreneuron-core stdc++fs)
 endif()
 
 if(CORENRN_ENABLE_GPU)
@@ -263,17 +284,17 @@ if(CORENRN_ENABLE_GPU)
   find_package(Boost QUIET)
   if(Boost_FOUND)
     message(STATUS "Boost found, enabling use of memory pools for Random123...")
-    target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
-    target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL)
+    target_include_directories(coreneuron-core SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
+    target_compile_definitions(coreneuron-core PRIVATE CORENEURON_USE_BOOST_POOL)
   endif()
 endif()
 
 set_target_properties(
-  coreneuron scopmath
+  coreneuron-core ${coreneuron_cuda_target}
   PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
              LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
              POSITION_INDEPENDENT_CODE ON)
-cpp_cc_configure_sanitizers(TARGET coreneuron scopmath ${corenrn_mpi_targets})
+cpp_cc_configure_sanitizers(TARGET coreneuron-core ${coreneuron_cuda_target} ${corenrn_mpi_targets})
 
 # =============================================================================
 # create special-core with halfgap.mod for tests
@@ -283,11 +304,11 @@ file(GLOB modfiles "${modfile_directory}/*.mod")
 
 if(CORENRN_ENABLE_SHARED)
   set(corenrn_mech_library
-      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech${CMAKE_SHARED_LIBRARY_SUFFIX}"
+      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcoreneuron${CMAKE_SHARED_LIBRARY_SUFFIX}"
       CACHE INTERNAL "coreneuron mechanism library")
 else()
   set(corenrn_mech_library
-      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech${CMAKE_SHARED_LIBRARY_SUFFIX}"
+      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcoreneuron${CMAKE_STATIC_LIBRARY_SUFFIX}"
       CACHE INTERNAL "coreneuron mechanism library")
 endif()
 
@@ -296,7 +317,7 @@ set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-c
 
 add_custom_command(
   OUTPUT ${output_binaries}
-  DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES}
+  DEPENDS coreneuron-core ${coreneuron_cuda_target} ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES}
   COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m
           ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}"
   WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin
@@ -305,14 +326,18 @@ add_custom_target(nrniv-core ALL DEPENDS ${output_binaries})
 
 if(CORENRN_ENABLE_GPU)
   separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}")
-  target_compile_options(coreneuron BEFORE PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
-  target_compile_options(scopmath BEFORE PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
+  target_compile_options(coreneuron-core BEFORE PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
 endif()
 
+# Create an extra target for internal use that unit tests and so on can depend
+# on
+add_library(coreneuron-all INTERFACE)
+target_link_libraries(coreneuron-all INTERFACE coreneuron-core ${coreneuron_cuda_target} "${corenrn_mech_library}")
+
 # =============================================================================
 # Extract link definitions to be used with nrnivmodl-core
 # =============================================================================
-get_target_property(CORENRN_LINK_LIBS coreneuron LINK_LIBRARIES)
+get_target_property(CORENRN_LINK_LIBS coreneuron-core LINK_LIBRARIES)
 if(NOT CORENRN_LINK_LIBS)
   set(CORENRN_LINK_LIBS "")
 endif()
@@ -346,18 +371,19 @@ file(COPY apps/coreneuron.cpp DESTINATION ${CMAKE_BINARY_DIR}/share/coreneuron)
 
 # coreneuron main libraries
 install(
-  TARGETS coreneuron
+  TARGETS coreneuron-core
   EXPORT coreneuron
   LIBRARY DESTINATION lib
   ARCHIVE DESTINATION lib
   INCLUDES
   DESTINATION $<INSTALL_INTERFACE:include>)
 
-# scopemath into share for nrnivmodl-core
-install(
-  TARGETS scopmath
-  EXPORT coreneuron
-  DESTINATION lib)
+if(TARGET coreneuron-cuda)
+  install(TARGETS coreneuron-cuda
+          EXPORT coreneuron
+          ARCHIVE DESTINATION lib
+          LIBRARY DESTINATION lib)
+endif()
 
 # headers and some standalone code files for nrnivmodl-core
 install(
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 2c2fca92e..c95fedcf2 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -576,7 +576,7 @@ void solve_interleaved2(int ith) {
     if (corenrn_param.gpu && corenrn_param.cuda_interface) {
         auto* d_nt = static_cast<NrnThread*>(acc_deviceptr(nt));
         auto* d_info = static_cast<InterleaveInfo*>(acc_deviceptr(interleave_info + ith));
-        // solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
+        solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
     } else {
 #endif
         int* ncycles = ii.cellsize;         // nwarp of these
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index c26066e4c..73ed76738 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -104,7 +104,7 @@ ifeq (@CORENRN_ENABLE_NMODL@, ON)
 endif
 
 # name of the mechanism library with suffix if provided
-COREMECH_LIB_NAME = corenrnmech$(if $(MECHLIB_SUFFIX),_$(MECHLIB_SUFFIX),)
+COREMECH_LIB_NAME = coreneuron$(if $(MECHLIB_SUFFIX),_$(MECHLIB_SUFFIX),)
 COREMECH_LIB_PATH = $(OUTPUT_DIR)/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX)
 
 # Various header and C++/Object file
@@ -114,8 +114,8 @@ ENGINEMECH_OBJ = $(MOD_OBJS_DIR)/enginemech.o
 
 # Depending on static/shared build, determine library name and it's suffix
 ifeq ($(TARGET_LIB_TYPE), STATIC)
-    LIB_SUFFIX = @CMAKE_SHARED_LIBRARY_SUFFIX@
-    corenrnmech_lib_target = coremech_lib_shared
+    LIB_SUFFIX = @CMAKE_STATIC_LIBRARY_SUFFIX@
+    corenrnmech_lib_target = coremech_lib_static
 else
     LIB_SUFFIX = @CMAKE_SHARED_LIBRARY_SUFFIX@
     corenrnmech_lib_target = coremech_lib_shared
@@ -211,7 +211,7 @@ $(SPECIAL_EXE): coremech_lib_target
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(CORENRNLIB_FLAGS) $(LDFLAGS) \
+	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) -lcoreneuron-cuda $(CORENRNLIB_FLAGS) $(LDFLAGS) \
 	  -L$(CORENRN_LIB_DIR) \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 
@@ -225,18 +225,17 @@ $(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR)
 
 # build shared library of mechanisms
 coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
-	# extract the object files from libcoreneuron.a
-	mkdir -p $(MOD_OBJS_DIR)/libcoreneuron
-	ar --output=$(MOD_OBJS_DIR)/libcoreneuron x $(CORENRN_LIB_DIR)/libcoreneuron.a
-	# extract the object files from libscopmath.a
-	mkdir -p $(MOD_OBJS_DIR)/libscopmath
-	ar --output=$(MOD_OBJS_DIR)/libscopmath x $(CORENRN_LIB_DIR)/libscopmath.a
+	# extract the object files from libcoreneuron-core.a
+	mkdir -p $(MOD_OBJS_DIR)/libcoreneuron-core
+	rm -f $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
+	ar --output=$(MOD_OBJS_DIR)/libcoreneuron-core x $(CORENRN_LIB_DIR)/libcoreneuron-core.a
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  $(LDFLAGS) ${SONAME_OPTION} -Wl,--start-group \
-	  $(MOD_OBJS_DIR)/libcoreneuron/*.o \
-		-Wl,--end-group -Wl,--start-group $(MOD_OBJS_DIR)/libscopmath/*.o \
-		-Wl,--end-group $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR);
+	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o \
+		-Wl,--end-group $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR)
+	# cleanup
+	rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
 
 # build static library of mechanisms
 coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
diff --git a/tests/unit/cmdline_interface/CMakeLists.txt b/tests/unit/cmdline_interface/CMakeLists.txt
index cd177c521..856ce0779 100644
--- a/tests/unit/cmdline_interface/CMakeLists.txt
+++ b/tests/unit/cmdline_interface/CMakeLists.txt
@@ -4,14 +4,13 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(cmd_interface_test_bin test_cmdline_interface.cpp)
-target_link_libraries(cmd_interface_test_bin ${MPI_CXX_LIBRARIES} coreneuron
-                      ${corenrn_mech_library} ${reportinglib_LIBRARY} ${sonatareport_LIBRARY})
+target_link_libraries(cmd_interface_test_bin coreneuron-all)
 target_include_directories(cmd_interface_test_bin SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
-add_dependencies(cmd_interface_test_bin nrniv-core)
+#add_dependencies(cmd_interface_test_bin nrniv-core)
 # Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
 # NVHPC C++ compiler handle this implicitly.
-set_target_properties(cmd_interface_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+#set_target_properties(cmd_interface_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
 target_compile_options(cmd_interface_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_test(NAME cmd_interface_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:cmd_interface_test_bin>)
 cpp_cc_configure_sanitizers(TARGET cmd_interface_test_bin TEST cmd_interface_test)
diff --git a/tests/unit/interleave_info/CMakeLists.txt b/tests/unit/interleave_info/CMakeLists.txt
index ce69b097e..153fc1f75 100644
--- a/tests/unit/interleave_info/CMakeLists.txt
+++ b/tests/unit/interleave_info/CMakeLists.txt
@@ -4,13 +4,12 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(interleave_info_bin check_constructors.cpp)
-target_link_libraries(interleave_info_bin ${MPI_CXX_LIBRARIES} coreneuron ${corenrn_mech_library}
-                      ${reportinglib_LIBRARY} ${sonatareport_LIBRARY})
-add_dependencies(interleave_info_bin nrniv-core)
+target_link_libraries(interleave_info_bin coreneuron-all)
 # Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
 # NVHPC C++ compiler handle this implicitly.
-set_target_properties(interleave_info_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-target_compile_options(interleave_info_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+#set_target_properties(interleave_info_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+#separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}")
+target_compile_options(interleave_info_bin PRIVATE ${CORENRN_ACC_FLAGS} ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_test(NAME interleave_info_constructor_test COMMAND ${TEST_EXEC_PREFIX}
                                                        $<TARGET_FILE:interleave_info_bin>)
 cpp_cc_configure_sanitizers(TARGET interleave_info_bin TEST interleave_info_constructor_test)
diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt
index 61d749aa9..fc716fbc0 100644
--- a/tests/unit/lfp/CMakeLists.txt
+++ b/tests/unit/lfp/CMakeLists.txt
@@ -3,18 +3,12 @@
 #
 # See top-level LICENSE file for details.
 # =============================================================================
-
-include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS})
-file(GLOB lfp_test_src "*.cpp")
-
-add_executable(lfp_test_bin ${lfp_test_src})
-target_link_libraries(lfp_test_bin ${MPI_CXX_LIBRARIES} coreneuron ${corenrn_mech_library}
-                      ${reportinglib_LIBRARY} ${sonatareport_LIBRARY})
+add_executable(lfp_test_bin lfp.cpp)
+target_link_libraries(lfp_test_bin coreneuron-all)
 # Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
 # NVHPC C++ compiler handle this implicitly.
-set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
-add_dependencies(lfp_test_bin nrniv-core)
+#set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+#add_dependencies(lfp_test_bin nrniv-core)
 add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:lfp_test_bin>)
 cpp_cc_configure_sanitizers(TARGET lfp_test_bin TEST lfp_test)
 set_property(
diff --git a/tests/unit/queueing/CMakeLists.txt b/tests/unit/queueing/CMakeLists.txt
index ba3725d32..b7d3f46a6 100644
--- a/tests/unit/queueing/CMakeLists.txt
+++ b/tests/unit/queueing/CMakeLists.txt
@@ -4,12 +4,10 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(queuing_test_bin test_queueing.cpp)
-target_link_libraries(queuing_test_bin ${Boost_SYSTEM_LIBRARY} coreneuron ${corenrn_mech_library}
-                      ${reportinglib_LIBRARY} ${sonatareport_LIBRARY})
-add_dependencies(queuing_test_bin nrniv-core)
+target_link_libraries(queuing_test_bin coreneuron-all ${Boost_SYSTEM_LIBRARY})
 # Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
 # NVHPC C++ compiler handle this implicitly.
-set_target_properties(queuing_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+#set_target_properties(queuing_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
 target_compile_options(queuing_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_test(NAME queuing_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:queuing_test_bin>)
 cpp_cc_configure_sanitizers(TARGET queuing_test_bin TEST queuing_test)
diff --git a/tests/unit/solver/CMakeLists.txt b/tests/unit/solver/CMakeLists.txt
index 1d01ea4b1..77d46a464 100644
--- a/tests/unit/solver/CMakeLists.txt
+++ b/tests/unit/solver/CMakeLists.txt
@@ -6,14 +6,14 @@
 
 include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS})
 add_executable(test-solver test_solver.cpp)
-target_link_libraries(test-solver coreneuron ${corenrn_mech_library})
+target_link_libraries(test-solver coreneuron-all)
 target_include_directories(test-solver SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
 
 # Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
 # NVHPC C++ compiler handle this implicitly.
-set_target_properties(test-solver PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+#set_target_properties(test-solver PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
 target_compile_options(test-solver PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
-add_dependencies(test-solver nrniv-core)
+#add_dependencies(test-solver nrniv-core)
 add_test(NAME test-solver COMMAND $<TARGET_FILE:test-solver>)
 cpp_cc_configure_sanitizers(TARGET test-solver TEST test-solver)
diff --git a/tests/unit/solver/test_solver.cpp b/tests/unit/solver/test_solver.cpp
index b797ac711..c1021bcb7 100644
--- a/tests/unit/solver/test_solver.cpp
+++ b/tests/unit/solver/test_solver.cpp
@@ -275,7 +275,7 @@ auto active_implementations() {
     ret.push_back(SolverImplementation::CellPermute0_GPU);
     ret.push_back(SolverImplementation::CellPermute1_GPU);
     ret.push_back(SolverImplementation::CellPermute2_GPU);
-    // ret.push_back(SolverImplementation::CellPermute2_CUDA);
+    ret.push_back(SolverImplementation::CellPermute2_CUDA);
 #endif
     return ret;
 }

From 6bb5fe93d2d1125214e5257bbb74d923e965b031 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 10:48:41 +0200
Subject: [PATCH 032/128] cleanup unit test cmake

---
 tests/CMakeLists.txt                        |  5 ++++-
 tests/unit/cmdline_interface/CMakeLists.txt |  7 ++-----
 tests/unit/interleave_info/CMakeLists.txt   |  7 +------
 tests/unit/lfp/CMakeLists.txt               |  6 +-----
 tests/unit/queueing/CMakeLists.txt          |  6 +-----
 tests/unit/solver/CMakeLists.txt            | 14 ++------------
 6 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e9cacd422..b3d8a30f1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -27,7 +27,10 @@ find_package(Boost 1.59 QUIET COMPONENTS filesystem system atomic unit_test_fram
 
 if(Boost_FOUND)
   if(CORENRN_ENABLE_UNIT_TESTS)
-    include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+    add_library(coreneuron-unit-test INTERFACE)
+    target_compile_options(coreneuron-unit-test INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+    target_include_directories(coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS} ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
+    target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
     add_subdirectory(unit/cmdline_interface)
     add_subdirectory(unit/interleave_info)
     add_subdirectory(unit/alignment)
diff --git a/tests/unit/cmdline_interface/CMakeLists.txt b/tests/unit/cmdline_interface/CMakeLists.txt
index 856ce0779..26f0b62c7 100644
--- a/tests/unit/cmdline_interface/CMakeLists.txt
+++ b/tests/unit/cmdline_interface/CMakeLists.txt
@@ -4,13 +4,10 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(cmd_interface_test_bin test_cmdline_interface.cpp)
-target_link_libraries(cmd_interface_test_bin coreneuron-all)
-target_include_directories(cmd_interface_test_bin SYSTEM
-                           PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
-#add_dependencies(cmd_interface_test_bin nrniv-core)
+target_link_libraries(cmd_interface_test_bin coreneuron-unit-test)
 # Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
 # NVHPC C++ compiler handle this implicitly.
 #set_target_properties(cmd_interface_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-target_compile_options(cmd_interface_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+#target_compile_options(cmd_interface_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_test(NAME cmd_interface_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:cmd_interface_test_bin>)
 cpp_cc_configure_sanitizers(TARGET cmd_interface_test_bin TEST cmd_interface_test)
diff --git a/tests/unit/interleave_info/CMakeLists.txt b/tests/unit/interleave_info/CMakeLists.txt
index 153fc1f75..948f32405 100644
--- a/tests/unit/interleave_info/CMakeLists.txt
+++ b/tests/unit/interleave_info/CMakeLists.txt
@@ -4,12 +4,7 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(interleave_info_bin check_constructors.cpp)
-target_link_libraries(interleave_info_bin coreneuron-all)
-# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
-# NVHPC C++ compiler handle this implicitly.
-#set_target_properties(interleave_info_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-#separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}")
-target_compile_options(interleave_info_bin PRIVATE ${CORENRN_ACC_FLAGS} ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+target_link_libraries(interleave_info_bin coreneuron-unit-test)
 add_test(NAME interleave_info_constructor_test COMMAND ${TEST_EXEC_PREFIX}
                                                        $<TARGET_FILE:interleave_info_bin>)
 cpp_cc_configure_sanitizers(TARGET interleave_info_bin TEST interleave_info_constructor_test)
diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt
index fc716fbc0..8b5b201c4 100644
--- a/tests/unit/lfp/CMakeLists.txt
+++ b/tests/unit/lfp/CMakeLists.txt
@@ -4,11 +4,7 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(lfp_test_bin lfp.cpp)
-target_link_libraries(lfp_test_bin coreneuron-all)
-# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
-# NVHPC C++ compiler handle this implicitly.
-#set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-#add_dependencies(lfp_test_bin nrniv-core)
+target_link_libraries(lfp_test_bin coreneuron-unit-test)
 add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:lfp_test_bin>)
 cpp_cc_configure_sanitizers(TARGET lfp_test_bin TEST lfp_test)
 set_property(
diff --git a/tests/unit/queueing/CMakeLists.txt b/tests/unit/queueing/CMakeLists.txt
index b7d3f46a6..fc653ea98 100644
--- a/tests/unit/queueing/CMakeLists.txt
+++ b/tests/unit/queueing/CMakeLists.txt
@@ -4,10 +4,6 @@
 # See top-level LICENSE file for details.
 # =============================================================================
 add_executable(queuing_test_bin test_queueing.cpp)
-target_link_libraries(queuing_test_bin coreneuron-all ${Boost_SYSTEM_LIBRARY})
-# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
-# NVHPC C++ compiler handle this implicitly.
-#set_target_properties(queuing_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-target_compile_options(queuing_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+target_link_libraries(queuing_test_bin coreneuron-unit-test)
 add_test(NAME queuing_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:queuing_test_bin>)
 cpp_cc_configure_sanitizers(TARGET queuing_test_bin TEST queuing_test)
diff --git a/tests/unit/solver/CMakeLists.txt b/tests/unit/solver/CMakeLists.txt
index 77d46a464..01e058525 100644
--- a/tests/unit/solver/CMakeLists.txt
+++ b/tests/unit/solver/CMakeLists.txt
@@ -1,19 +1,9 @@
 # =============================================================================
-# Copyright (C) 2022 Blue Brain Project
+# Copyright (c) 2022 Blue Brain Project
 #
 # See top-level LICENSE file for details.
 # =============================================================================
-
-include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS})
 add_executable(test-solver test_solver.cpp)
-target_link_libraries(test-solver coreneuron-all)
-target_include_directories(test-solver SYSTEM
-                           PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
-
-# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
-# NVHPC C++ compiler handle this implicitly.
-#set_target_properties(test-solver PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-target_compile_options(test-solver PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
-#add_dependencies(test-solver nrniv-core)
+target_link_libraries(test-solver coreneuron-unit-test)
 add_test(NAME test-solver COMMAND $<TARGET_FILE:test-solver>)
 cpp_cc_configure_sanitizers(TARGET test-solver TEST test-solver)

From bddef616f76d3d58f84eee5fa54498febb03a6ba Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 10:58:36 +0200
Subject: [PATCH 033/128] more cmake

---
 coreneuron/CMakeLists.txt                   | 105 ++++++++++----------
 tests/CMakeLists.txt                        |  10 +-
 tests/integration/CMakeLists.txt            |   1 -
 tests/unit/cmdline_interface/CMakeLists.txt |   4 -
 4 files changed, 57 insertions(+), 63 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 28411fd53..6a4a7136b 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -106,18 +106,16 @@ if(CORENRN_ENABLE_GPU)
 
   set_source_files_properties(${OPENACC_EXCLUDED_FILES} PROPERTIES COMPILE_FLAGS
                                                                    "-DDISABLE_OPENACC")
-  # Only compile the explicit CUDA implementation of the Hines solver in GPU
-  # builds. Because of
+  # Only compile the explicit CUDA implementation of the Hines solver in GPU builds. Because of
   # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
-  # this cannot be included in the same shared library as the rest of the
-  # OpenACC code.
+  # this cannot be included in the same shared library as the rest of the OpenACC code.
   set(CORENEURON_CUDA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
 
   # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
   # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with
   # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
-  # these functions from CUDA kernels presents no issue ...
-  # TODO is it going to work to call these from libcoreneuron-cuda.so? probably not...
+  # these functions from CUDA kernels presents no issue ... TODO is it going to work to call these
+  # from libcoreneuron-cuda.so? probably not...
   if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
     list(APPEND CORENEURON_CUDA_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
   endif()
@@ -147,29 +145,26 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
   set(CORENRN_MPI_OBJ $<TARGET_OBJECTS:${CORENRN_MPI_LIB_NAME}>)
 endif()
 
-# Library containing the bulk of the non-mechanism CoreNEURON code. This is
-# always created and installed as a static library, and then the nrnivmodl-core
-# workflow extracts the object files from it and does one of the following:
+# Library containing the bulk of the non-mechanism CoreNEURON code. This is always created and
+# installed as a static library, and then the nrnivmodl-core workflow extracts the object files from
+# it and does one of the following:
 # ~~~
 # - shared build: creates libcoreneuron.so from these objects plus those from
 #   the translated MOD files
-# - static build: creates a (temporary) libcoreneuron.a from these objects plus
-#   those from the translated MOD files, then statically links that into
-#   special-core (nrniv-core)
+# - static build: creates a (temporary, does not get installed) libcoreneuron.a
+#   from these objects plus those from the translated MOD files, then
+#   statically links that into special-core (nrniv-core)
 # ~~~
-# This scheme means that both core and mechanism .o files are linked in a single
-# step, which is important for GPU linking. It does, however, mean that in a
-# shared library CPU build then the core code is installed twice, once in
-# libcoreneuron-core.a and once in the libcoreneuron.so that contains the
-# default mechanisms for the installed nrniv-core binary. In a GPU build,
-# libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of
-# the Hines solver.
-add_library(
-  coreneuron-core
-  STATIC
-  ${CORENEURON_CODE_FILES}
-  ${CORENRN_MPI_OBJ})
-
+# This scheme means that both core and mechanism .o files are linked in a single step, which is
+# important for GPU linking. It does, however, mean that in a shared library CPU build then the core
+# code is installed twice, once in libcoreneuron-core.a and once in the libcoreneuron.so that
+# contains the default mechanisms for the installed nrniv-core binary. In a GPU build,
+# libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of the Hines solver.
+add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
+
+# Library containing explicit CUDA code, compiled by nvcc. This cannot be included in
+# coreneuron-core because of this issue:
+# https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
 if(CORENRN_ENABLE_GPU)
   set(coreneuron_cuda_target coreneuron-cuda)
   add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES})
@@ -257,17 +252,17 @@ endif()
 
 # Prevent CMake from running a device code link step when assembling libcoreneuron.a in GPU builds.
 # The device code linking needs to be deferred to the final step, where it is done by `nvc++ -cuda`.
-#set_target_properties(coreneuron-core PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF CUDA_SEPARABLE_COMPILATION ON)
-#if(TARGET coreneuron-cuda)
-  # set_target_properties(coreneuron-cuda PROPERTIES )
-#endif()
-# Suppress some compiler warnings. TODO no it doesn't: Note in GPU builds this library includes CUDA files.
+# set_target_properties(coreneuron-core PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF
+# CUDA_SEPARABLE_COMPILATION ON) if(TARGET coreneuron-cuda)
+# set_target_properties(coreneuron-cuda PROPERTIES )
+# endif() Suppress some compiler warnings. TODO no it doesn't: Note in GPU builds this library
+# includes CUDA files.
 target_compile_options(coreneuron-core
                        PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENEURON_CXX_WARNING_SUPPRESSIONS}>)
 add_dependencies(coreneuron-core nrnivmodl-core) # TODO why?
 
-target_link_libraries(coreneuron-core ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} ${CALIPER_LIB}
-                      ${likwid_LIBRARIES})
+target_link_libraries(coreneuron-core ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
+                      ${CALIPER_LIB} ${likwid_LIBRARIES})
 
 target_include_directories(coreneuron-core SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
@@ -302,22 +297,17 @@ cpp_cc_configure_sanitizers(TARGET coreneuron-core ${coreneuron_cuda_target} ${c
 set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod files")
 file(GLOB modfiles "${modfile_directory}/*.mod")
 
-if(CORENRN_ENABLE_SHARED)
-  set(corenrn_mech_library
-      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcoreneuron${CMAKE_SHARED_LIBRARY_SUFFIX}"
-      CACHE INTERNAL "coreneuron mechanism library")
-else()
-  set(corenrn_mech_library
-      "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcoreneuron${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      CACHE INTERNAL "coreneuron mechanism library")
-endif()
-
-set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-core"
-                    "${corenrn_mech_library}")
+# We have to link things like unit tests against this because some "core" .cpp files refer to
+# symbols in the translated versions of default .mod files
+set(nrniv_core_prefix "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}")
+set(corenrn_mech_library
+    "${nrniv_core_prefix}/libcoreneuron${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}")
+set(output_binaries "${nrniv_core_prefix}/special-core" "${corenrn_mech_library}")
 
 add_custom_command(
   OUTPUT ${output_binaries}
-  DEPENDS coreneuron-core ${coreneuron_cuda_target} ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES}
+  DEPENDS coreneuron-core ${coreneuron_cuda_target} ${NMODL_TARGET_TO_DEPEND} ${modfiles}
+          ${CORENEURON_BUILTIN_MODFILES}
   COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m
           ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}"
   WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin
@@ -326,13 +316,14 @@ add_custom_target(nrniv-core ALL DEPENDS ${output_binaries})
 
 if(CORENRN_ENABLE_GPU)
   separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}")
-  target_compile_options(coreneuron-core BEFORE PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
+  target_compile_options(coreneuron-core BEFORE
+                         PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
 endif()
 
-# Create an extra target for internal use that unit tests and so on can depend
-# on
+# Create an extra target for internal use that unit tests and so on can depend on
 add_library(coreneuron-all INTERFACE)
-target_link_libraries(coreneuron-all INTERFACE coreneuron-core ${coreneuron_cuda_target} "${corenrn_mech_library}")
+target_link_libraries(coreneuron-all INTERFACE coreneuron-core ${coreneuron_cuda_target}
+                                               "${corenrn_mech_library}")
 
 # =============================================================================
 # Extract link definitions to be used with nrnivmodl-core
@@ -379,10 +370,11 @@ install(
   DESTINATION $<INSTALL_INTERFACE:include>)
 
 if(TARGET coreneuron-cuda)
-  install(TARGETS coreneuron-cuda
-          EXPORT coreneuron
-          ARCHIVE DESTINATION lib
-          LIBRARY DESTINATION lib)
+  install(
+    TARGETS coreneuron-cuda
+    EXPORT coreneuron
+    ARCHIVE DESTINATION lib
+    LIBRARY DESTINATION lib)
 endif()
 
 # headers and some standalone code files for nrnivmodl-core
@@ -409,8 +401,11 @@ install(
   RENAME nrniv-core)
 install(FILES apps/coreneuron.cpp DESTINATION share/coreneuron)
 
-# install mechanism library
-install(FILES ${corenrn_mech_library} DESTINATION lib)
+# install mechanism library in shared library builds, if we're linking statically then there is no
+# need
+if(CORENRN_ENABLE_SHARED)
+  install(FILES ${corenrn_mech_library} DESTINATION lib)
+endif()
 
 # install random123 and nmodl headers
 install(DIRECTORY ${CMAKE_BINARY_DIR}/include/ DESTINATION include)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b3d8a30f1..ea8052d7b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -28,9 +28,13 @@ find_package(Boost 1.59 QUIET COMPONENTS filesystem system atomic unit_test_fram
 if(Boost_FOUND)
   if(CORENRN_ENABLE_UNIT_TESTS)
     add_library(coreneuron-unit-test INTERFACE)
-    target_compile_options(coreneuron-unit-test INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
-    target_include_directories(coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS} ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
-    target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
+    target_compile_options(coreneuron-unit-test
+                           INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+    target_include_directories(
+      coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS}
+                                            ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
+    target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all
+                                                         ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
     add_subdirectory(unit/cmdline_interface)
     add_subdirectory(unit/interleave_info)
     add_subdirectory(unit/alignment)
diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
index fa488028e..75ae106e1 100644
--- a/tests/integration/CMakeLists.txt
+++ b/tests/integration/CMakeLists.txt
@@ -86,7 +86,6 @@ foreach(data_dir "ring" "ring_gap")
   math(EXPR num_suffixes_m1 "${num_suffixes} - 1")
   foreach(suffix_index RANGE 0 ${num_suffixes_m1})
     list(GET test_suffixes ${suffix_index} test_suffix)
-    message(STATUS "test_suffix=${test_suffix}")
     file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/${data_dir}/out.dat.ref"
          DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/${data_dir}${test_suffix}/")
   endforeach()
diff --git a/tests/unit/cmdline_interface/CMakeLists.txt b/tests/unit/cmdline_interface/CMakeLists.txt
index 26f0b62c7..fadbe60a3 100644
--- a/tests/unit/cmdline_interface/CMakeLists.txt
+++ b/tests/unit/cmdline_interface/CMakeLists.txt
@@ -5,9 +5,5 @@
 # =============================================================================
 add_executable(cmd_interface_test_bin test_cmdline_interface.cpp)
 target_link_libraries(cmd_interface_test_bin coreneuron-unit-test)
-# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the
-# NVHPC C++ compiler handle this implicitly.
-#set_target_properties(cmd_interface_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-#target_compile_options(cmd_interface_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_test(NAME cmd_interface_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:cmd_interface_test_bin>)
 cpp_cc_configure_sanitizers(TARGET cmd_interface_test_bin TEST cmd_interface_test)

From 745676638d90d3e93806b9947e622e8f5bb6c71c Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 11:20:37 +0200
Subject: [PATCH 034/128] cleanup

---
 CMakeLists.txt                           |  3 ---
 coreneuron/CMakeLists.txt                | 21 +++++++--------------
 coreneuron/mechanism/mech/enginemech.cpp |  2 +-
 extra/nrnivmodl-core.in                  |  2 +-
 extra/nrnivmodl_core_makefile.in         |  2 +-
 5 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ef147a41..e128652a0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,9 +171,6 @@ set(CORENRN_ACCELERATOR_OFFLOAD "Disabled")
 if(CORENRN_ENABLE_GPU)
   # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after
   # https://github.com/BlueBrain/CoreNeuron/pull/609.
-  # https://cmake.org/cmake/help/latest/release/3.14.html#properties suggests there would be
-  # problems because of expressions like set_target_properties(lfp_test_bin PROPERTIES
-  # CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
 
   # Fail hard and early if we don't have the PGI/NVHPC compiler.
   if(NOT CORENRN_HAVE_NVHPC_COMPILER)
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 6a4a7136b..971f3fa41 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -250,18 +250,9 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
   install(TARGETS ${corenrn_mpi_targets} DESTINATION lib)
 endif()
 
-# Prevent CMake from running a device code link step when assembling libcoreneuron.a in GPU builds.
-# The device code linking needs to be deferred to the final step, where it is done by `nvc++ -cuda`.
-# set_target_properties(coreneuron-core PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF
-# CUDA_SEPARABLE_COMPILATION ON) if(TARGET coreneuron-cuda)
-# set_target_properties(coreneuron-cuda PROPERTIES )
-# endif() Suppress some compiler warnings. TODO no it doesn't: Note in GPU builds this library
-# includes CUDA files.
-target_compile_options(coreneuron-core
-                       PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENEURON_CXX_WARNING_SUPPRESSIONS}>)
-add_dependencies(coreneuron-core nrnivmodl-core) # TODO why?
-
-target_link_libraries(coreneuron-core ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
+# Suppress some compiler warnings.
+target_compile_options(coreneuron-core PRIVATE ${CORENEURON_CXX_WARNING_SUPPRESSIONS})
+target_link_libraries(coreneuron-core PUBLIC ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
                       ${CALIPER_LIB} ${likwid_LIBRARIES})
 
 target_include_directories(coreneuron-core SYSTEM
@@ -320,9 +311,11 @@ if(CORENRN_ENABLE_GPU)
                          PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
 endif()
 
-# Create an extra target for internal use that unit tests and so on can depend on
+# Create an extra target for internal use that unit tests and so on can depend
+# on. ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the
+# compiled default mechanisms and the content of libcoreneuron-core.a
 add_library(coreneuron-all INTERFACE)
-target_link_libraries(coreneuron-all INTERFACE coreneuron-core ${coreneuron_cuda_target}
+target_link_libraries(coreneuron-all INTERFACE ${coreneuron_cuda_target}
                                                "${corenrn_mech_library}")
 
 # =============================================================================
diff --git a/coreneuron/mechanism/mech/enginemech.cpp b/coreneuron/mechanism/mech/enginemech.cpp
index 2c20d1293..ee9cc9e28 100644
--- a/coreneuron/mechanism/mech/enginemech.cpp
+++ b/coreneuron/mechanism/mech/enginemech.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
diff --git a/extra/nrnivmodl-core.in b/extra/nrnivmodl-core.in
index 742409d88..08804b159 100755
--- a/extra/nrnivmodl-core.in
+++ b/extra/nrnivmodl-core.in
@@ -83,7 +83,7 @@ while getopts "n:m:a:d:i:l:Vp:r:b:h" OPT; do
         echo "  -r <0|1>                  Enable NRN_PRCELLSTATE mechanism. Default: @CORENRN_NRN_PRCELLSTATE@."
         echo "  -V                        Verbose: show commands executed by make"
         echo "  -p <n_procs>              Number of parallel builds (Default: $PARALLEL_BUILDS)"
-        echo "  -b <STATIC|SHARED>        libcorenrnmech library type"
+        echo "  -b <STATIC|SHARED>        libcoreneuron library type"
         exit 0;;
     ?)
         exit 1;;
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 73ed76738..749de3ace 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -233,7 +233,7 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  $(LDFLAGS) ${SONAME_OPTION} -Wl,--start-group \
 	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o \
-		-Wl,--end-group $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR)
+		-Wl,--end-group $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR) -lcoreneuron-cuda
 	# cleanup
 	rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
 

From 955719396a8a54bc470585b14ca76d162a7b02d7 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 12:43:26 +0200
Subject: [PATCH 035/128] drop .libs stuff

---
 CMake/coreneuron-config.cmake.in | 1 +
 extra/nrnivmodl_core_makefile.in | 9 ++-------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/CMake/coreneuron-config.cmake.in b/CMake/coreneuron-config.cmake.in
index 29f67c92f..c5f8573d0 100644
--- a/CMake/coreneuron-config.cmake.in
+++ b/CMake/coreneuron-config.cmake.in
@@ -14,6 +14,7 @@ set(CORENRN_VERSION_PATCH @PROJECT_VERSION_PATCH@)
 set(CORENRN_ENABLE_GPU @CORENRN_ENABLE_GPU@)
 set(CORENRN_ENABLE_NMODL @CORENRN_ENABLE_NMODL@)
 set(CORENRN_ENABLE_REPORTING @CORENRN_ENABLE_REPORTING@)
+set(CORENRN_ENABLE_SHARED @CORENRN_ENABLE_SHARED@)
 set(CORENEURON_LIB_LINK_FLAGS "@CORENEURON_LIB_LINK_FLAGS@")
 
 find_path(CORENEURON_INCLUDE_DIR "coreneuron/coreneuron.h" HINTS "${CONFIG_PATH}/../../include")
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 749de3ace..1c29e7499 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -207,7 +207,7 @@ endif
 
 
 # main target to build binary
-$(SPECIAL_EXE): coremech_lib_target
+$(SPECIAL_EXE): $(corenrnmech_lib_target)
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
@@ -215,11 +215,6 @@ $(SPECIAL_EXE): coremech_lib_target
 	  -L$(CORENRN_LIB_DIR) \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 
-coremech_lib_target: $(corenrnmech_lib_target)
-	rm -rf $(OUTPUT_DIR)/.libs/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX); \
-	mkdir -p $(OUTPUT_DIR)/.libs; \
-	ln -s ../lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX) $(OUTPUT_DIR)/.libs/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX)
-
 $(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR)
 	$(CXX_COMPILE_CMD) -c -DADDITIONAL_MECHS $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp -o $(ENGINEMECH_OBJ)
 
@@ -279,7 +274,7 @@ $(MOD_OBJS_DIR):
 	mkdir -p $(MOD_OBJS_DIR)
 
 # install binary and libraries
-install: $(SPECIAL_EXE) coremech_lib_target
+install: $(SPECIAL_EXE)
 	install -d $(DESTDIR)/bin $(DESTDIR)/lib
 	install ${COREMECH_LIB_PATH} $(DESTDIR)/lib
 	install $(SPECIAL_EXE) $(DESTDIR)/bin

From e5b1240793325e4a4eabc5b75115b2403899403d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 14:07:57 +0200
Subject: [PATCH 036/128] format

---
 coreneuron/CMakeLists.txt | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 971f3fa41..9091787dd 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -253,7 +253,7 @@ endif()
 # Suppress some compiler warnings.
 target_compile_options(coreneuron-core PRIVATE ${CORENEURON_CXX_WARNING_SUPPRESSIONS})
 target_link_libraries(coreneuron-core PUBLIC ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
-                      ${CALIPER_LIB} ${likwid_LIBRARIES})
+                                             ${CALIPER_LIB} ${likwid_LIBRARIES})
 
 target_include_directories(coreneuron-core SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
@@ -311,12 +311,11 @@ if(CORENRN_ENABLE_GPU)
                          PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
 endif()
 
-# Create an extra target for internal use that unit tests and so on can depend
-# on. ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the
-# compiled default mechanisms and the content of libcoreneuron-core.a
+# Create an extra target for internal use that unit tests and so on can depend on.
+# ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the compiled default
+# mechanisms and the content of libcoreneuron-core.a
 add_library(coreneuron-all INTERFACE)
-target_link_libraries(coreneuron-all INTERFACE ${coreneuron_cuda_target}
-                                               "${corenrn_mech_library}")
+target_link_libraries(coreneuron-all INTERFACE ${coreneuron_cuda_target} "${corenrn_mech_library}")
 
 # =============================================================================
 # Extract link definitions to be used with nrnivmodl-core

From 2e40b11cbb701833a72c5782855179f26ad00c34 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 14:13:16 +0200
Subject: [PATCH 037/128] Fixes nvhpc didn't care about.

---
 coreneuron/utils/randoms/nrnran123.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index e75ec3f69..c3aa4d7ef 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -39,6 +39,8 @@ of the full distribution available from
 #include <Random123/philox.h>
 #include <inttypes.h>
 
+#include <cmath>
+
 // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU
 // support at all. In these two cases, request that the random123 state is
 // allocated using new/delete instead of CUDA unified memory.
@@ -109,7 +111,7 @@ constexpr void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id
 }
 
 // Uniform 0 to 2*32-1
-constexpr uint32_t nrnran123_ipick(nrnran123_State* s) {
+inline uint32_t nrnran123_ipick(nrnran123_State* s) {
     char which = s->which_;
     uint32_t rval{s->r.v[int{which++}]};
     if (which > 3) {
@@ -134,7 +136,7 @@ constexpr double nrnran123_dblpick(nrnran123_State* s) {
 }
 
 /* this could be called from openacc parallel construct (in INITIAL block) */
-constexpr void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
+inline void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
     if (which > 3) {
         s->which_ = 0;
     } else {

From 0b17c683799555a24162be1578607bc9720c8533 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 14:16:52 +0200
Subject: [PATCH 038/128] more fixes from CI errors.

---
 coreneuron/CMakeLists.txt            | 9 ++++-----
 coreneuron/utils/randoms/nrnran123.h | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 9091787dd..0dbf8c8a7 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -177,7 +177,7 @@ endforeach()
 
 # we can link to MPI libraries in non-dynamic-mpi build
 if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
-  target_link_libraries(coreneuron-core ${MPI_CXX_LIBRARIES})
+  target_link_libraries(coreneuron-core PUBLIC ${MPI_CXX_LIBRARIES})
 endif()
 
 # this is where we handle dynamic mpi library build
@@ -186,7 +186,7 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
   # main coreneuron library needs to be linked to libdl.so and
   # and should be aware of shared library suffix on different platforms.
   # ~~~
-  target_link_libraries(coreneuron-core ${CMAKE_DL_LIBS})
+  target_link_libraries(coreneuron-core PUBLIC ${CMAKE_DL_LIBS})
 
   # store mpi library targets that will be built
   list(APPEND corenrn_mpi_targets "")
@@ -234,8 +234,7 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
       # ~~~
       if(MINGW) # type msmpi only
         add_dependencies(core${libname}_lib coreneuron-core)
-        target_link_libraries(core${libname}_lib ${MPI_C_LIBRARIES})
-        target_link_libraries(core${libname}_lib coreneuron-core)
+        target_link_libraries(core${libname}_lib ${MPI_C_LIBRARIES} coreneuron-core)
       endif()
       set_property(TARGET core${libname}_lib PROPERTY OUTPUT_NAME core${libname})
       list(APPEND corenrn_mpi_targets "core${libname}_lib")
@@ -262,7 +261,7 @@ target_include_directories(coreneuron-core SYSTEM
 
 # See: https://en.cppreference.com/w/cpp/filesystem#Notes
 if(CMAKE_CXX_COMPILER_IS_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1)
-  target_link_libraries(coreneuron-core stdc++fs)
+  target_link_libraries(coreneuron-core PUBLIC stdc++fs)
 endif()
 
 if(CORENRN_ENABLE_GPU)
diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index c3aa4d7ef..12484d3d4 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -131,7 +131,7 @@ constexpr double nrnran123_uint2dbl(uint32_t u) {
 }
 
 // Uniform open interval (0,1), minimum value is 2.3283064e-10 and max value is 1-min
-constexpr double nrnran123_dblpick(nrnran123_State* s) {
+inline double nrnran123_dblpick(nrnran123_State* s) {
     return nrnran123_uint2dbl(nrnran123_ipick(s));
 }
 

From 71e1895d05a2c3e79e89c00957fc1fd0d5e4df25 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 14:19:30 +0200
Subject: [PATCH 039/128] OpenACC + shared tests in GitLab CI.

---
 .gitlab-ci.yml | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 737e867f0..c91eea6ae 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -86,7 +86,14 @@ build:coreneuron:mod2c:nvhpc:acc:
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +gpu+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
+
+build:coreneuron:mod2c:nvhpc:acc:shared:
+  extends: [.build, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE: coreneuron
+    # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
+    SPACK_PACKAGE_SPEC: +gpu+openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
 
 # Build CoreNEURON with Unified Memory on GPU
 build:coreneuron:mod2c:nvhpc:acc:unified:
@@ -94,7 +101,7 @@ build:coreneuron:mod2c:nvhpc:acc:unified:
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +gpu+unified+openmp+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
 .build_coreneuron_nmodl:
   extends: [.build]
@@ -114,7 +121,7 @@ build:coreneuron:nmodl:nvhpc:omp:
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~sympy build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:nmodl:nvhpc:acc:
@@ -123,7 +130,7 @@ build:coreneuron:nmodl:nvhpc:acc:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     # Sympy + OpenMP target offload does not currently work with NVHPC
-    SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit+sympy build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:mod2c:intel:
@@ -144,6 +151,10 @@ build:neuron:mod2c:nvhpc:acc:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:mod2c:nvhpc:acc"]
 
+build:neuron:mod2c:nvhpc:acc:shared:
+  extends: [.build_neuron, .spack_nvhpc]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
+
 build:neuron:nmodl:nvhpc:omp:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:nmodl:nvhpc:omp"]
@@ -165,6 +176,10 @@ test:coreneuron:mod2c:nvhpc:acc:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc"]
 
+test:coreneuron:mod2c:nvhpc:acc:shared:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
+
 test:coreneuron:mod2c:nvhpc:acc:unified:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:unified"]
@@ -190,6 +205,10 @@ test:neuron:mod2c:nvhpc:acc:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:mod2c:nvhpc:acc"]
 
+test:neuron:mod2c:nvhpc:acc:shared:
+  extends: [.test_neuron, .gpu_node]
+  needs: ["build:neuron:mod2c:nvhpc:acc:shared"]
+
 test:neuron:nmodl:nvhpc:omp:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:nmodl:nvhpc:omp"]

From 3492fe462e09876f15a541301da3636154999f14 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 16:17:37 +0200
Subject: [PATCH 040/128] Try and cleanup CLI11 handling.

---
 coreneuron/CMakeLists.txt                     | 28 ++++++++-------
 coreneuron/apps/corenrn_parameters.cpp        | 21 +++++++----
 coreneuron/apps/corenrn_parameters.hpp        | 36 +++++++++++++------
 coreneuron/apps/main1.cpp                     |  2 +-
 tests/CMakeLists.txt                          |  4 +--
 .../test_cmdline_interface.cpp                |  2 +-
 6 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 0dbf8c8a7..e0ed1b71f 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -148,13 +148,13 @@ endif()
 # Library containing the bulk of the non-mechanism CoreNEURON code. This is always created and
 # installed as a static library, and then the nrnivmodl-core workflow extracts the object files from
 # it and does one of the following:
-# ~~~
-# - shared build: creates libcoreneuron.so from these objects plus those from
-#   the translated MOD files
-# - static build: creates a (temporary, does not get installed) libcoreneuron.a
-#   from these objects plus those from the translated MOD files, then
-#   statically links that into special-core (nrniv-core)
-# ~~~
+#
+# * shared build: creates libcoreneuron.so from these objects plus those from the translated MOD
+#   files
+# * static build: creates a (temporary, does not get installed) libcoreneuron.a from these objects
+#   plus those from the translated MOD files, then statically links that into special-core
+#   (nrniv-core)
+#
 # This scheme means that both core and mechanism .o files are linked in a single step, which is
 # important for GPU linking. It does, however, mean that in a shared library CPU build then the core
 # code is installed twice, once in libcoreneuron-core.a and once in the libcoreneuron.so that
@@ -254,10 +254,14 @@ target_compile_options(coreneuron-core PRIVATE ${CORENEURON_CXX_WARNING_SUPPRESS
 target_link_libraries(coreneuron-core PUBLIC ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
                                              ${CALIPER_LIB} ${likwid_LIBRARIES})
 
-target_include_directories(coreneuron-core SYSTEM
-                           PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
-target_include_directories(coreneuron-core SYSTEM
-                           PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
+# TODO: fix adding a dependency of coreneuron-core on CLI11::CLI11 when CLI11 is a submodule. Right
+# now this doesn't work because the CLI11 targets are not exported/installed, but coreneuron-core
+# is.
+get_target_property(CLI11_HEADER_DIRECTORY CLI11::CLI11 INTERFACE_INCLUDE_DIRECTORIES)
+message(STATUS "CLI11_HEADER_DIRECTORY=${CLI11_HEADER_DIRECTORY}")
+target_include_directories(
+  coreneuron-core SYSTEM PRIVATE ${CLI11_HEADER_DIRECTORY}
+                                 ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
 
 # See: https://en.cppreference.com/w/cpp/filesystem#Notes
 if(CMAKE_CXX_COMPILER_IS_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1)
@@ -265,7 +269,7 @@ if(CMAKE_CXX_COMPILER_IS_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1)
 endif()
 
 if(CORENRN_ENABLE_GPU)
-  # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available.
+  # nrnran123.cpp uses Boost.Pool in GPU builds if it's available.
   find_package(Boost QUIET)
   if(Boost_FOUND)
     message(STATUS "Boost found, enabling use of memory pools for Random123...")
diff --git a/coreneuron/apps/corenrn_parameters.cpp b/coreneuron/apps/corenrn_parameters.cpp
index 40c322b18..6ee920d1f 100644
--- a/coreneuron/apps/corenrn_parameters.cpp
+++ b/coreneuron/apps/corenrn_parameters.cpp
@@ -5,15 +5,17 @@
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
-
 #include "coreneuron/apps/corenrn_parameters.hpp"
 
+#include <CLI/CLI.hpp>
 
 namespace coreneuron {
 
 extern std::string cnrn_version();
 
-corenrn_parameters::corenrn_parameters() {
+corenrn_parameters::corenrn_parameters()
+    : m_app{std::make_unique<CLI::App>("CoreNeuron - Optimised Simulator Engine for NEURON.")} {
+    auto& app = *m_app;
     app.set_config("--read-config", "", "Read parameters from ini file", false)
         ->check(CLI::ExistingFile);
     app.add_option("--write-config",
@@ -167,14 +169,21 @@ corenrn_parameters::corenrn_parameters() {
     CLI::retire_option(app, "--show");
 }
 
+// Implementation in .cpp file where CLI types are complete.
+corenrn_parameters::~corenrn_parameters() = default;
+
+std::string corenrn_parameters::config_to_str(bool default_also, bool write_description) const {
+    return m_app->config_to_str(default_also, write_description);
+}
+
 void corenrn_parameters::reset() {
     static_cast<corenrn_parameters_data&>(*this) = corenrn_parameters_data{};
-    app.clear();
+    m_app->clear();
 }
 
 void corenrn_parameters::parse(int argc, char** argv) {
     try {
-        app.parse(argc, argv);
+        m_app->parse(argc, argv);
         if (verbose == verbose_level::NONE) {
             nrn_nobanner_ = 1;
         }
@@ -182,11 +191,11 @@ void corenrn_parameters::parse(int argc, char** argv) {
         // in case of parsing errors, show message with exception
         std::cerr << "CLI parsing error, see nrniv-core --help for more information. \n"
                   << std::endl;
-        app.exit(e);
+        m_app->exit(e);
         throw e;
     } catch (const CLI::ParseError& e) {
         // use --help is also ParseError; in this case exit by showing all options
-        app.exit(e);
+        m_app->exit(e);
         exit(0);
     }
 
diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp
index bfe646622..8db8ce06c 100644
--- a/coreneuron/apps/corenrn_parameters.hpp
+++ b/coreneuron/apps/corenrn_parameters.hpp
@@ -1,18 +1,14 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
-
 #pragma once
-
+#include <memory>
+#include <ostream>
 #include <string>
-#include <fstream>
-#include <iostream>
-#include <iomanip>
-#include <CLI/CLI.hpp>
 
 /**
  * \class corenrn_parameters
@@ -32,6 +28,10 @@
  * Also single dash long options are not supported anymore (-mpi -> --mpi).
  */
 
+namespace CLI {
+class App;  // must use CLI11's own class-key (avoids -Wmismatched-tags / MSVC C4099)
+}
+
 namespace coreneuron {
 
 struct corenrn_parameters_data {
@@ -94,10 +94,8 @@ struct corenrn_parameters_data {
 };
 
 struct corenrn_parameters: corenrn_parameters_data {
-    CLI::App app{"CoreNeuron - Optimised Simulator Engine for NEURON."};  /// CLI app that performs
-                                                                          /// CLI parsing
-
-    corenrn_parameters();  /// Constructor that initializes the CLI11 app.
+    corenrn_parameters();   /// Constructor that initializes the CLI11 app.
+    ~corenrn_parameters();  /// Destructor defined in .cpp where CLI11 types are complete.
 
     void parse(int argc, char* argv[]);  /// Runs the CLI11_PARSE macro.
 
@@ -111,6 +109,22 @@ struct corenrn_parameters: corenrn_parameters_data {
     inline bool is_quiet() {
         return verbose == verbose_level::NONE;
     }
+
+    /** @brief Return a string summarising the current parameter values.
+     *
+     * This forwards to the CLI11 method of the same name. Returns a string that
+     * could be read in as a config of the current values of the App.
+     *
+     * @param default_also Include any defaulted arguments.
+     * @param write_description Include option descriptions and the App description.
+     */
+    std::string config_to_str(bool default_also = false, bool write_description = false) const;
+
+  private:
+    // CLI app that performs CLI parsing. std::unique_ptr avoids having to
+    // include CLI11 headers from CoreNEURON headers, and therefore avoids
+    // CoreNEURON having to install CLI11 when using it from a submodule.
+    std::unique_ptr<CLI::App> m_app;
 };
 
 std::ostream& operator<<(std::ostream& os,
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index 8e05a5d69..86275fff0 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -511,7 +511,7 @@ extern "C" void mk_mech_init(int argc, char** argv) {
 
     if (!corenrn_param.writeParametersFilepath.empty()) {
         std::ofstream out(corenrn_param.writeParametersFilepath, std::ios::trunc);
-        out << corenrn_param.app.config_to_str(false, false);
+        out << corenrn_param.config_to_str(false, false);
         out.close();
     }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ea8052d7b..7ef3d9647 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -30,9 +30,7 @@ if(Boost_FOUND)
     add_library(coreneuron-unit-test INTERFACE)
     target_compile_options(coreneuron-unit-test
                            INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
-    target_include_directories(
-      coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS}
-                                            ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
+    target_include_directories(coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS})
     target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all
                                                          ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
     add_subdirectory(unit/cmdline_interface)
diff --git a/tests/unit/cmdline_interface/test_cmdline_interface.cpp b/tests/unit/cmdline_interface/test_cmdline_interface.cpp
index caef6ca14..ccd9e1f66 100644
--- a/tests/unit/cmdline_interface/test_cmdline_interface.cpp
+++ b/tests/unit/cmdline_interface/test_cmdline_interface.cpp
@@ -130,5 +130,5 @@ BOOST_AUTO_TEST_CASE(cmdline_interface) {
 
     // Everything has its default value, and the first `false` says not to
     // include default values in the output, so this should be empty
-    BOOST_CHECK(corenrn_param_test.app.config_to_str(false, false).empty());
+    BOOST_CHECK(corenrn_param_test.config_to_str(false, false).empty());
 }

From 61e3fad4e5a3a54a3c48b37ba27285eea8d48193 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 17:57:41 +0200
Subject: [PATCH 041/128] try and consolidate build logic more

---
 .gitlab-ci.yml                   | 14 ++---
 CMake/MakefileBuildOptions.cmake | 93 +++++++++++++++++++-------------
 CMake/OpenAccHelper.cmake        | 28 +++++-----
 CMakeLists.txt                   | 83 +++++++++++-----------------
 coreneuron/CMakeLists.txt        | 54 +++++--------------
 coreneuron/apps/main1.cpp        |  4 +-
 coreneuron/permute/cellorder.cu  |  2 +-
 extra/nrnivmodl_core_makefile.in | 11 ++--
 8 files changed, 125 insertions(+), 164 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c91eea6ae..6a9abb44b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -86,14 +86,14 @@ build:coreneuron:mod2c:nvhpc:acc:
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +gpu+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
 build:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.build, .spack_nvhpc]
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +gpu+openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
 
 # Build CoreNEURON with Unified Memory on GPU
 build:coreneuron:mod2c:nvhpc:acc:unified:
@@ -101,7 +101,7 @@ build:coreneuron:mod2c:nvhpc:acc:unified:
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: ~caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
 .build_coreneuron_nmodl:
   extends: [.build]
@@ -121,7 +121,7 @@ build:coreneuron:nmodl:nvhpc:omp:
   variables:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:nmodl:nvhpc:acc:
@@ -130,20 +130,20 @@ build:coreneuron:nmodl:nvhpc:acc:
     SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     # Sympy + OpenMP target offload does not currently work with NVHPC
-    SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: ~caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:mod2c:intel:
   extends: [.build, .spack_intel]
   variables:
     SPACK_PACKAGE: coreneuron
-    SPACK_PACKAGE_SPEC: +tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+tests~legacy-unit build_type=Debug
 
 build:coreneuron:nmodl:intel:
   extends: [.build_coreneuron_nmodl, .spack_intel]
   variables:
     SPACK_PACKAGE: coreneuron
-    SPACK_PACKAGE_SPEC: +nmodl+tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: ~caliper+nmodl+tests~legacy-unit build_type=Debug
   needs: ["build:nmodl"]
 
 # Build NEURON
diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 7aef0c549..710b8ad4e 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -21,7 +21,7 @@ set(CMAKE_ISPC_FLAGS "${CMAKE_ISPC_FLAGS} --pic")
 set(NMODL_COMMON_ARGS "passes --inline")
 
 if(NOT "${CORENRN_NMODL_FLAGS}" STREQUAL "")
-  set(NMODL_COMMON_ARGS "${NMODL_COMMON_ARGS} ${CORENRN_NMODL_FLAGS}")
+  string(APPEND NMODL_COMMON_ARGS " ${CORENRN_NMODL_FLAGS}")
 endif()
 
 set(NMODL_CPU_BACKEND_ARGS "host --c")
@@ -29,62 +29,81 @@ set(NMODL_ISPC_BACKEND_ARGS "host --ispc")
 set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc")
 
 # =============================================================================
-# Extract Compile definitions : common to all backend
+# Construct the linker arguments that are used inside nrnivmodl-core (to build
+# libcoreneuron from libcoreneuron-core, libcoreneuron-cuda and mechanism object
+# files) and inside nrnivmodl (to link NEURON's special against CoreNEURON's
+# libcoreneuron).
 # =============================================================================
-get_directory_property(COMPILE_DEFS COMPILE_DEFINITIONS)
-if(COMPILE_DEFS)
-  set(CORENRN_COMMON_COMPILE_DEFS "")
-  foreach(flag ${COMPILE_DEFS})
-    set(CORENRN_COMMON_COMPILE_DEFS "${CORENRN_COMMON_COMPILE_DEFS} -D${flag}")
-  endforeach()
-endif()
+# coreneuron_process_target(<target>): recursively unpack the CMake link
+# dependencies of <target> into plain linker arguments, appended to the global
+# property CORENEURON_LIB_LINK_FLAGS for use in the NEURON/CoreNEURON Makefiles.
+function(coreneuron_process_target target)
+  if(TARGET ${target})
+    if(NOT target STREQUAL "coreneuron-core")
+      # Every CMake target except coreneuron-core contributes -l<target>. The
+      # exception exists because libcoreneuron-core.a is manually unpacked into .o
+      # files by the nrnivmodl-core Makefile, so -lcoreneuron-core is not wanted.
+      # TODO: probably need to extract an -L and RPATH path and include that here?
+      set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${target}")
+    endif()
+    get_target_property(target_libraries ${target} LINK_LIBRARIES)
+    if(target_libraries)
+      foreach(child_target ${target_libraries})
+        coreneuron_process_target(${child_target})
+      endforeach()  
+    endif()
+    return()
+  endif()
+  get_filename_component(target_dir "${target}" DIRECTORY)
+  message(STATUS "target=${target} target_dir=${target_dir}")
+  if(NOT target_dir)
+    # No directory part: a bare library name such as "dl", so emit -l<name>
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${target}")
+  elseif("${target_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
+    # System library directory, e.g. /usr/lib64/libpthread.so -> -lpthread
+    get_filename_component(libname ${target} NAME_WE)
+    string(REGEX REPLACE "^lib" "" libname ${libname})
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${libname}")
+  else()
+    # Any other directory: pass the full library path through unchanged
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " ${target}")
+  endif()
+endfunction()
+coreneuron_process_target(coreneuron-core)
+get_property(CORENEURON_LIB_LINK_FLAGS GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS)
+message(STATUS "CORENEURON_LIB_LINK_FLAGS=${CORENEURON_LIB_LINK_FLAGS}")
+
+# Things that used to be in CORENEURON_LIB_LINK_FLAGS: -rdynamic -lrt
+# -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -Wl,--no-whole-archive
+# -L${caliper_LIB_DIR} -l${CALIPER_LIB}
 
 # =============================================================================
-# link flags : common to all backend
+# Turn CORENRN_COMPILE_DEFS into a list of -DFOO[=BAR] options.
 # =============================================================================
-# ~~~
-# find_cuda uses FindThreads that adds below imported target we
-# shouldn't add imported target to link line
-# ~~~
-list(REMOVE_ITEM CORENRN_LINK_LIBS "Threads::Threads")
+list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS)
 
-string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_EXTRA_LINK_FLAGS})
+# =============================================================================
+# Extra link flags that we need to include when linking libcoreneuron.{a,so} in
+# CoreNEURON but that do not need to be passed to NEURON to use when linking
+# nrniv/special (why?)
+# =============================================================================
+string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENEURON_LIB_LINK_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
 if(CORENRN_SANITIZER_LIBRARY_DIR)
   string(APPEND CORENRN_COMMON_LDFLAGS " -Wl,-rpath,${CORENRN_SANITIZER_LIBRARY_DIR}")
 endif()
 string(JOIN " " CORENRN_SANITIZER_ENABLE_ENVIRONMENT_STRING ${CORENRN_SANITIZER_ENABLE_ENVIRONMENT})
 
-# replicate CMake magic to transform system libs to -l<libname>
-foreach(link_lib ${CORENRN_LINK_LIBS})
-  if(${link_lib} MATCHES "\-l.*")
-    string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}")
-    continue()
-  endif()
-  get_filename_component(path ${link_lib} DIRECTORY)
-  if(NOT path)
-    string(APPEND CORENRN_COMMON_LDFLAGS " -l${link_lib}")
-  elseif("${path}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
-    get_filename_component(libname ${link_lib} NAME_WE)
-    string(REGEX REPLACE "^lib" "" libname ${libname})
-    string(APPEND CORENRN_COMMON_LDFLAGS " -l${libname}")
-  else()
-    string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}")
-  endif()
-endforeach()
-
 # =============================================================================
 # compile flags : common to all backend
 # =============================================================================
-string(JOIN " " CMAKE_CXX17_STANDARD_COMPILE_OPTION_STRING ${CMAKE_CXX17_STANDARD_COMPILE_OPTION})
 string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE)
-list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS)
 string(
   JOIN
   " "
   CORENRN_CXX_FLAGS
   ${CMAKE_CXX_FLAGS}
   ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}}
-  ${CMAKE_CXX17_STANDARD_COMPILE_OPTION_STRING}
+  ${CMAKE_CXX17_STANDARD_COMPILE_OPTION}
   ${NVHPC_ACC_COMP_FLAGS}
   ${NVHPC_CXX_INLINE_FLAGS}
   ${CORENRN_COMPILE_DEF_FLAGS}
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 5be6af61b..d2eed3d1d 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -34,13 +34,13 @@ if(CORENRN_ENABLE_GPU)
   cnrn_parse_version(${CMAKE_CXX_COMPILER_VERSION} OUTPUT_MAJOR_MINOR
                      CORENRN_NVHPC_MAJOR_MINOR_VERSION)
   # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs
-  add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU)
+  list(APPEND CORENRN_COMPILE_DEFS CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU)
   # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and
   # stopping profiling. This makes sure those headers can be found.
   include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
   # cuda unified memory support
   if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY)
-    add_compile_definitions(CORENEURON_UNIFIED_MEMORY)
+    list(APPEND CORENRN_COMPILE_DEFS CORENEURON_UNIFIED_MEMORY)
   endif()
   if(${CMAKE_VERSION} VERSION_LESS 3.17)
     # Hopefully we can drop this soon. Parse ${CMAKE_CUDA_COMPILER_VERSION} into a shorter X.Y
@@ -81,7 +81,7 @@ if(CORENRN_ENABLE_GPU)
   if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
     # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
     # for a region then prefer OpenMP.
-    add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
+    list(APPEND CORENRN_COMPILE_DEFS CORENEURON_PREFER_OPENMP_OFFLOAD)
     string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
   elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
     # Only enable OpenACC offload for GPU
@@ -98,20 +98,16 @@ if(CORENRN_ENABLE_GPU)
 endif()
 
 # =============================================================================
-# Set global property that will be used by NEURON to link with CoreNEURON
+# Initialise global property that will be used by NEURON to link with CoreNEURON
 # =============================================================================
-# TODO this should be derived from what we use internally to link special-core?
-if(CORENRN_ENABLE_GPU)
-  set_property(
-    GLOBAL
-    PROPERTY
-      CORENEURON_LIB_LINK_FLAGS
-      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcoreneuron -lcoreneuron-cuda -Wl,--no-whole-archive"
-  )
-else()
-  set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS
-                               "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcoreneuron")
-endif(CORENRN_ENABLE_GPU)
+if(CORENRN_ENABLE_GPU AND CORENRN_ENABLE_SHARED)
+  # Because of
+  # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
+  # we have to tell NEURON to pass OpenACC flags when linking special, otherwise
+  # we end up with an `nrniv` binary that cannot dynamically load CoreNEURON in
+  # shared-library builds
+  set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
+endif()
 
 if(CORENRN_HAVE_NVHPC_COMPILER)
   if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 20.7)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e128652a0..ccf5f0aa0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -271,12 +271,11 @@ find_package(Perl REQUIRED)
 # Common build options
 # =============================================================================
 # build mod files for coreneuron
-add_definitions(-DCORENEURON_BUILD)
-
+list(APPEND CORENRN_COMPILE_DEFS CORENEURON_BUILD)
 set(CMAKE_REQUIRED_QUIET TRUE)
 check_include_files(malloc.h have_malloc_h)
 if(have_malloc_h)
-  add_definitions("-DHAVE_MALLOC_H")
+  list(APPEND CORENRN_COMPILE_DEFS HAVE_MALLOC_H)
 endif()
 
 # =============================================================================
@@ -313,14 +312,14 @@ endif()
 
 if(CORENRN_ENABLE_MPI)
   find_package(MPI REQUIRED)
-  add_definitions("-DNRNMPI=1")
+  list(APPEND CORENRN_COMPILE_DEFS NRNMPI=1)
   # avoid linking to C++ bindings
-  add_definitions("-DMPI_NO_CPPBIND=1")
-  add_definitions("-DOMPI_SKIP_MPICXX=1")
-  add_definitions("-DMPICH_SKIP_MPICXX=1")
+  list(APPEND CORENRN_COMPILE_DEFS MPI_NO_CPPBIND=1)
+  list(APPEND CORENRN_COMPILE_DEFS OMPI_SKIP_MPICXX=1)
+  list(APPEND CORENRN_COMPILE_DEFS MPICH_SKIP_MPICXX=1)
 else()
-  add_definitions("-DNRNMPI=0")
-  add_definitions("-DNRN_MULTISEND=0")
+  list(APPEND CORENRN_COMPILE_DEFS NRNMPI=0)
+  list(APPEND CORENRN_COMPILE_DEFS NRN_MULTISEND=0)
 endif()
 
 if(CORENRN_ENABLE_OPENMP)
@@ -331,23 +330,23 @@ if(CORENRN_ENABLE_OPENMP)
   endif()
 endif()
 
-add_definitions("-DLAYOUT=0")
+list(APPEND CORENRN_COMPILE_DEFS LAYOUT=0)
 
 if(NOT CORENRN_ENABLE_HOC_EXP)
-  add_definitions("-DDISABLE_HOC_EXP")
+  list(APPEND CORENRN_COMPILE_DEFS DISABLE_HOC_EXP)
 endif()
 
 # splay tree required for net_move
 if(CORENRN_ENABLE_SPLAYTREE_QUEUING)
-  add_definitions("-DENABLE_SPLAYTREE_QUEUING")
+  list(APPEND CORENRN_COMPILE_DEFS ENABLE_SPLAYTREE_QUEUING)
 endif()
 
 if(NOT CORENRN_ENABLE_NET_RECEIVE_BUFFER)
-  add_definitions("-DNET_RECEIVE_BUFFERING=0")
+  list(APPEND CORENRN_COMPILE_DEFS NET_RECEIVE_BUFFERING=0)
 endif()
 
 if(NOT CORENRN_ENABLE_TIMEOUT)
-  add_definitions("-DDISABLE_TIMEOUT")
+  list(APPEND CORENRN_COMPILE_DEFS DISABLE_TIMEOUT)
 endif()
 
 if(CORENRN_ENABLE_REPORTING)
@@ -356,7 +355,7 @@ if(CORENRN_ENABLE_REPORTING)
   find_program(H5DUMP_EXECUTABLE h5dump)
 
   if(reportinglib_FOUND)
-    add_definitions("-DENABLE_BIN_REPORTS")
+    list(APPEND CORENRN_COMPILE_DEFS ENABLE_BIN_REPORTS)
     set(ENABLE_BIN_REPORTS_TESTS ON)
   else()
     set(reportinglib_INCLUDE_DIR "")
@@ -364,7 +363,7 @@ if(CORENRN_ENABLE_REPORTING)
   endif()
   if(sonata_FOUND)
     if(TARGET sonata::sonata_report)
-      add_definitions("-DENABLE_SONATA_REPORTS")
+      list(APPEND CORENRN_COMPILE_DEFS ENABLE_SONATA_REPORTS)
       set(ENABLE_SONATA_REPORTS_TESTS ON)
     else()
       message(SEND_ERROR "SONATA library was found but without reporting support")
@@ -384,6 +383,7 @@ if(CORENRN_ENABLE_LEGACY_UNITS)
 else()
   set(CORENRN_USE_LEGACY_UNITS 0)
 endif()
+list(APPEND CORENRN_COMPILE_DEFS CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS})
 # Propagate Legacy Units flag to backends.
 set(MOD2C_ENABLE_LEGACY_UNITS
     ${CORENRN_ENABLE_LEGACY_UNITS}
@@ -396,7 +396,7 @@ if(CORENRN_ENABLE_MPI_DYNAMIC)
   if(NOT CORENRN_ENABLE_MPI)
     message(FATAL_ERROR "Cannot enable dynamic mpi without mpi")
   endif()
-  add_compile_definitions(CORENRN_ENABLE_MPI_DYNAMIC)
+  list(APPEND CORENRN_COMPILE_DEFS CORENEURON_ENABLE_MPI_DYNAMIC)
 endif()
 
 if(CORENRN_ENABLE_PRCELLSTATE)
@@ -405,7 +405,7 @@ else()
   set(CORENRN_NRN_PRCELLSTATE 0)
 endif()
 if(MINGW)
-  add_definitions("-DMINGW")
+  list(APPEND CORENRN_COMPILE_DEFS MINGW)
 endif()
 
 # =============================================================================
@@ -448,22 +448,20 @@ endif()
 # =============================================================================
 if(CORENRN_ENABLE_CALIPER_PROFILING)
   find_package(caliper REQUIRED)
-  include_directories(${caliper_INCLUDE_DIR})
-  add_definitions("-DCORENEURON_CALIPER")
-  set(CALIPER_LIB "caliper")
-  set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS
-                                             " -L${caliper_LIB_DIR} -l${CALIPER_LIB}")
+  list(APPEND CORENRN_COMPILE_DEFS CORENEURON_CALIPER)
+  set(CORENRN_CALIPER_LIB caliper)
 endif()
 
 if(CORENRN_ENABLE_LIKWID_PROFILING)
   find_package(likwid REQUIRED)
+  list(APPEND CORENRN_COMPILE_DEFS LIKWID_PERFMON)
+  # TODO: avoid this part, probably by using some likwid CMake target
   include_directories(${likwid_INCLUDE_DIRS})
-  add_definitions("-DLIKWID_PERFMON")
 endif()
 
 # enable debugging code with extra logs to stdout
 if(CORENRN_ENABLE_DEBUG_CODE)
-  add_definitions(-DCORENRN_DEBUG -DCHKPNTDEBUG -DCORENRN_DEBUG_QUEUE -DINTERLEAVE_DEBUG)
+  list(APPEND CORENRN_COMPILE_DEFS CORENRN_DEBUG CHKPNTDEBUG CORENRN_DEBUG_QUEUE INTERLEAVE_DEBUG)
 endif()
 
 # =============================================================================
@@ -473,38 +471,19 @@ endif()
 # compiler will be invoked with these flags, so we have to use flags that are as generic as
 # possible.
 if(NOT DEFINED NRN_WHEEL_BUILD OR NOT NRN_WHEEL_BUILD)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}")
+  list(APPEND CORENRN_EXTRA_CXX_FLAGS "${IGNORE_UNKNOWN_PRAGMA_FLAGS}")
 endif()
 
-# =============================================================================
-# Add main directories
-# =============================================================================
+# Add the main source directory
 add_subdirectory(coreneuron)
 
-if(CORENRN_ENABLE_GPU)
-  get_target_property(CORENRN_LINK_LIBRARIES coreneuron-core INTERFACE_LINK_LIBRARIES)
-  if(CORENRN_LINK_LIBRARIES)
-    foreach(LIB ${CORENRN_LINK_LIBRARIES})
-      get_filename_component(dir_path ${LIB} DIRECTORY)
-      if(TARGET ${LIB})
-        # See, for example, caliper where the coreneuron target depends on the caliper target (so we
-        # get LIB=caliper in this loop), but -l and -L are already added manually here:
-        # https://github.com/BlueBrain/CoreNeuron/blob/856cea4aa647c8f2b0d5bda6d0fc32144c5942e3/CMakeLists.txt#L411-L412
-        message(
-          NOTICE
-          "Ignoring dependency '${LIB}' of 'coreneuron-core' and assuming relevant flags have already been added to CORENEURON_LIB_LINK_FLAGS."
-        )
-      elseif(NOT dir_path)
-        # In case LIB is not a target but is just the name of a library, e.g. "dl"
-        set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${LIB}")
-      else()
-        set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " ${LIB}")
-      endif()
-    endforeach()
-  endif()
-endif()
-
+# Extract the various compiler option strings to use inside nrnivmodl-core. Sets
+# the global property CORENEURON_LIB_LINK_FLAGS, which contains the arguments
+# that must be added to the link line for `special` to link against
+# `libcoreneuron.{a,so}`
 include(MakefileBuildOptions)
+
+# Generate the nrnivmodl-core script and makefile using the options from MakefileBuildOptions
 add_subdirectory(extra)
 
 if(CORENRN_ENABLE_UNIT_TESTS)
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index e0ed1b71f..befb2f8dc 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -21,7 +21,7 @@ file(
   CORENEURON_CODE_FILES
   "apps/main1.cpp"
   "apps/corenrn_parameters.cpp"
-  "gpu/*.cpp"
+  "gpu/nrn_acc_manager.cpp"
   "io/*.cpp"
   "io/reports/*.cpp"
   "mechanism/*.cpp"
@@ -121,11 +121,6 @@ if(CORENRN_ENABLE_GPU)
   endif()
 endif()
 
-# =============================================================================
-# CORENEURON_USE_LEGACY_UNITS is used in membfunc.hpp so define it everywhere
-# =============================================================================
-add_compile_definitions(CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS})
-
 # =============================================================================
 # create libraries
 # =============================================================================
@@ -156,18 +151,16 @@ endif()
 #   (nrniv-core)
 #
 # This scheme means that both core and mechanism .o files are linked in a single step, which is
-# important for GPU linking. It does, however, mean that in a shared library CPU build then the core
-# code is installed twice, once in libcoreneuron-core.a and once in the libcoreneuron.so that
-# contains the default mechanisms for the installed nrniv-core binary. In a GPU build,
+# important for GPU linking. It does, however, mean that the core code is installed twice, once in
+# libcoreneuron-core.a and once in libcoreneuron.so (shared) or nrniv-core (static). In a GPU build,
 # libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of the Hines solver.
-add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
-
-# Library containing explicit CUDA code, compiled by nvcc. This cannot be included in
-# coreneuron-core because of this issue:
+# This cannot be included in coreneuron-core because of this issue:
 # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
+add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
 if(CORENRN_ENABLE_GPU)
   set(coreneuron_cuda_target coreneuron-cuda)
   add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES})
+  target_link_libraries(coreneuron-core PUBLIC coreneuron-cuda)
 endif()
 
 foreach(target coreneuron-core ${coreneuron_cuda_target})
@@ -252,13 +245,11 @@ endif()
 # Suppress some compiler warnings.
 target_compile_options(coreneuron-core PRIVATE ${CORENEURON_CXX_WARNING_SUPPRESSIONS})
 target_link_libraries(coreneuron-core PUBLIC ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
-                                             ${CALIPER_LIB} ${likwid_LIBRARIES})
+                                             ${CORENRN_CALIPER_LIB} ${likwid_LIBRARIES})
 
 # TODO: fix adding a dependency of coreneuron-core on CLI11::CLI11 when CLI11 is a submodule. Right
-# now this doesn't work because the CLI11 targets are not exported/installed, but coreneuron-core
-# is.
+# now this doesn't work because the CLI11 targets are not exported/installed but coreneuron-core is.
 get_target_property(CLI11_HEADER_DIRECTORY CLI11::CLI11 INTERFACE_INCLUDE_DIRECTORIES)
-message(STATUS "CLI11_HEADER_DIRECTORY=${CLI11_HEADER_DIRECTORY}")
 target_include_directories(
   coreneuron-core SYSTEM PRIVATE ${CLI11_HEADER_DIRECTORY}
                                  ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
@@ -282,7 +273,7 @@ set_target_properties(
   coreneuron-core ${coreneuron_cuda_target}
   PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
              LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
-             POSITION_INDEPENDENT_CODE ON)
+             POSITION_INDEPENDENT_CODE ${CORENRN_ENABLE_SHARED})
 cpp_cc_configure_sanitizers(TARGET coreneuron-core ${coreneuron_cuda_target} ${corenrn_mpi_targets})
 
 # =============================================================================
@@ -295,12 +286,12 @@ file(GLOB modfiles "${modfile_directory}/*.mod")
 # symbols in the translated versions of default .mod files
 set(nrniv_core_prefix "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}")
 set(corenrn_mech_library
-    "${nrniv_core_prefix}/libcoreneuron${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}")
+    "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}coreneuron${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}")
 set(output_binaries "${nrniv_core_prefix}/special-core" "${corenrn_mech_library}")
 
 add_custom_command(
   OUTPUT ${output_binaries}
-  DEPENDS coreneuron-core ${coreneuron_cuda_target} ${NMODL_TARGET_TO_DEPEND} ${modfiles}
+  DEPENDS coreneuron-core ${NMODL_TARGET_TO_DEPEND} ${modfiles}
           ${CORENEURON_BUILTIN_MODFILES}
   COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m
           ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}"
@@ -318,18 +309,7 @@ endif()
 # ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the compiled default
 # mechanisms and the content of libcoreneuron-core.a
 add_library(coreneuron-all INTERFACE)
-target_link_libraries(coreneuron-all INTERFACE ${coreneuron_cuda_target} "${corenrn_mech_library}")
-
-# =============================================================================
-# Extract link definitions to be used with nrnivmodl-core
-# =============================================================================
-get_target_property(CORENRN_LINK_LIBS coreneuron-core LINK_LIBRARIES)
-if(NOT CORENRN_LINK_LIBS)
-  set(CORENRN_LINK_LIBS "")
-endif()
-set(CORENRN_LINK_LIBS
-    "${CORENRN_LINK_LIBS}"
-    PARENT_SCOPE)
+target_link_libraries(coreneuron-all INTERFACE "${corenrn_mech_library}")
 
 # Make headers avail to build tree
 configure_file(engine.h.in ${CMAKE_BINARY_DIR}/include/coreneuron/engine.h @ONLY)
@@ -357,21 +337,13 @@ file(COPY apps/coreneuron.cpp DESTINATION ${CMAKE_BINARY_DIR}/share/coreneuron)
 
 # coreneuron main libraries
 install(
-  TARGETS coreneuron-core
+  TARGETS coreneuron-core ${coreneuron_cuda_target}
   EXPORT coreneuron
   LIBRARY DESTINATION lib
   ARCHIVE DESTINATION lib
   INCLUDES
   DESTINATION $<INSTALL_INTERFACE:include>)
 
-if(TARGET coreneuron-cuda)
-  install(
-    TARGETS coreneuron-cuda
-    EXPORT coreneuron
-    ARCHIVE DESTINATION lib
-    LIBRARY DESTINATION lib)
-endif()
-
 # headers and some standalone code files for nrnivmodl-core
 install(
   DIRECTORY ${CMAKE_BINARY_DIR}/include/coreneuron
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index 86275fff0..b019748fd 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -456,7 +456,7 @@ std::unique_ptr<ReportHandler> create_report_handler(ReportConfiguration& config
 
 using namespace coreneuron;
 
-#if NRNMPI && defined CORENRN_ENABLE_MPI_DYNAMIC
+#if NRNMPI && defined(CORENEURON_ENABLE_MPI_DYNAMIC)
 static void* load_dynamic_mpi(const std::string& libname) {
     dlerror();
     void* handle = dlopen(libname.c_str(), RTLD_NOW | RTLD_GLOBAL);
@@ -478,7 +478,7 @@ extern "C" void mk_mech_init(int argc, char** argv) {
 
 #if NRNMPI
     if (corenrn_param.mpi_enable) {
-#ifdef CORENRN_ENABLE_MPI_DYNAMIC
+#ifdef CORENEURON_ENABLE_MPI_DYNAMIC
         // coreneuron rely on neuron to detect mpi library distribution and
         // the name of the library itself. Make sure the library name is specified
         // via CLI option.
diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu
index 1f1bdff94..ed8975148 100644
--- a/coreneuron/permute/cellorder.cu
+++ b/coreneuron/permute/cellorder.cu
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 1c29e7499..24e630f92 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -38,11 +38,6 @@ MOD_OBJS_DIR = $(OUTPUT_DIR)/corenrn/build
 
 # Linked libraries gathered by CMake
 LDFLAGS = $(LINKFLAGS) @CORENRN_COMMON_LDFLAGS@
-CORENRNLIB_FLAGS =
-CORENRNLIB_FLAGS += $(if @reportinglib_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@reportinglib_LIB_DIR@),)
-CORENRNLIB_FLAGS += $(if @sonatareport_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@sonatareport_LIB_DIR@),)
-CORENRNLIB_FLAGS += $(if @caliper_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@caliper_LIB_DIR@),)
-CORENRNLIB_FLAGS += $(if @caliper_LIB_DIR@,-L@caliper_LIB_DIR@,)
 
 # Includes paths gathered by CMake
 # coreneuron/utils/randoms goes first because it needs to override the NEURON
@@ -82,7 +77,7 @@ ifeq ($(wildcard $(CORENRN_PERLEXE)),)
 endif
 
 CXXFLAGS = @CORENRN_CXX_FLAGS@
-CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES)
+CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ $(INCLUDES)
 CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@
 CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@
 
@@ -211,7 +206,7 @@ $(SPECIAL_EXE): $(corenrnmech_lib_target)
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) -lcoreneuron-cuda $(CORENRNLIB_FLAGS) $(LDFLAGS) \
+	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(LDFLAGS) \
 	  -L$(CORENRN_LIB_DIR) \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 
@@ -228,7 +223,7 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  $(LDFLAGS) ${SONAME_OPTION} -Wl,--start-group \
 	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o \
-		-Wl,--end-group $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR) -lcoreneuron-cuda
+		-Wl,--end-group -Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR)
 	# cleanup
 	rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
 

From a05830cc0bc54243a5c41d4b4eab26e89402c7db Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 17:58:32 +0200
Subject: [PATCH 042/128] format

---
 CMake/MakefileBuildOptions.cmake | 33 ++++++++++++++------------------
 CMake/OpenAccHelper.cmake        |  5 ++---
 CMakeLists.txt                   |  7 +++----
 coreneuron/CMakeLists.txt        |  6 +++---
 4 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 710b8ad4e..29edf2bd1 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -29,28 +29,25 @@ set(NMODL_ISPC_BACKEND_ARGS "host --ispc")
 set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc")
 
 # =============================================================================
-# Construct the linker arguments that are used inside nrnivmodl-core (to build
-# libcoreneuron from libcoreneuron-core, libcoreneuron-cuda and mechanism object
-# files) and inside nrnivmodl (to link NEURON's special against CoreNEURON's
-# libcoreneuron).
-# =============================================================================
-# Essentially we "just" want to unpack the CMake dependencies of the
-# `coreneuron-core` target into a plain string that we can bake into the
-# Makefiles in both NEURON and CoreNEURON.
+# Construct the linker arguments that are used inside nrnivmodl-core (to build libcoreneuron from
+# libcoreneuron-core, libcoreneuron-cuda and mechanism object files) and inside nrnivmodl (to link
+# NEURON's special against CoreNEURON's libcoreneuron).
+# =============================================================================
+# Essentially we "just" want to unpack the CMake dependencies of the `coreneuron-core` target into a
+# plain string that we can bake into the Makefiles in both NEURON and CoreNEURON.
 function(coreneuron_process_target target)
   if(TARGET ${target})
     if(NOT target STREQUAL "coreneuron-core")
-      # This is a special case: libcoreneuron-core.a is manually unpacked into .o
-      # files by the nrnivmodl-core Makefile, so we do not want to also emit an
-      # -lcoreneuron-core argument.
-      # TODO: probably need to extract an -L and RPATH path and include that here?
+      # This is a special case: libcoreneuron-core.a is manually unpacked into .o files by the
+      # nrnivmodl-core Makefile, so we do not want to also emit an -lcoreneuron-core argument. TODO:
+      # probably need to extract an -L and RPATH path and include that here?
       set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${target}")
     endif()
     get_target_property(target_libraries ${target} LINK_LIBRARIES)
     if(target_libraries)
       foreach(child_target ${target_libraries})
         coreneuron_process_target(${child_target})
-      endforeach()  
+      endforeach()
     endif()
     return()
   endif()
@@ -73,9 +70,8 @@ coreneuron_process_target(coreneuron-core)
 get_property(CORENEURON_LIB_LINK_FLAGS GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS)
 message(STATUS "CORENEURON_LIB_LINK_FLAGS=${CORENEURON_LIB_LINK_FLAGS}")
 
-# Things that used to be in CORENEURON_LIB_LINK_FLAGS: -rdynamic -lrt
-# -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -Wl,--no-whole-archive
-# -L${caliper_LIB_DIR} -l${CALIPER_LIB}
+# Things that used to be in CORENEURON_LIB_LINK_FLAGS: -rdynamic -lrt -Wl,--whole-archive
+# -L${CMAKE_HOST_SYSTEM_PROCESSOR} -Wl,--no-whole-archive -L${caliper_LIB_DIR} -l${CALIPER_LIB}
 
 # =============================================================================
 # Turn CORENRN_COMPILE_DEFS into a list of -DFOO[=BAR] options.
@@ -83,9 +79,8 @@ message(STATUS "CORENEURON_LIB_LINK_FLAGS=${CORENEURON_LIB_LINK_FLAGS}")
 list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS)
 
 # =============================================================================
-# Extra link flags that we need to include when linking libcoreneuron.{a,so} in
-# CoreNEURON but that do not need to be passed to NEURON to use when linking
-# nrniv/special (why?)
+# Extra link flags that we need to include when linking libcoreneuron.{a,so} in CoreNEURON but that
+# do not need to be passed to NEURON to use when linking nrniv/special (why?)
 # =============================================================================
 string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENEURON_LIB_LINK_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
 if(CORENRN_SANITIZER_LIBRARY_DIR)
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index d2eed3d1d..8ba850456 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -103,9 +103,8 @@ endif()
 if(CORENRN_ENABLE_GPU AND CORENRN_ENABLE_SHARED)
   # Because of
   # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
-  # we have to tell NEURON to pass OpenACC flags when linking special, otherwise
-  # we end up with an `nrniv` binary that cannot dynamically load CoreNEURON in
-  # shared-library builds
+  # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an
+  # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds
   set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
 endif()
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccf5f0aa0..00e74896e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -477,10 +477,9 @@ endif()
 # Add the main source directory
 add_subdirectory(coreneuron)
 
-# Extract the various compiler option strings to use inside nrnivmodl-core. Sets
-# the global property CORENEURON_LIB_LINK_FLAGS, which contains the arguments
-# that must be added to the link line for `special` to link against
-# `libcoreneuron.{a,so}`
+# Extract the various compiler option strings to use inside nrnivmodl-core. Sets the global property
+# CORENEURON_LIB_LINK_FLAGS, which contains the arguments that must be added to the link line for
+# `special` to link against `libcoreneuron.{a,so}`
 include(MakefileBuildOptions)
 
 # Generate the nrnivmodl-core script and makefile using the options from MakefileBuildOptions
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index befb2f8dc..e1754cc66 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -286,13 +286,13 @@ file(GLOB modfiles "${modfile_directory}/*.mod")
 # symbols in the translated versions of default .mod files
 set(nrniv_core_prefix "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}")
 set(corenrn_mech_library
-    "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}coreneuron${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}")
+    "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}coreneuron${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}"
+)
 set(output_binaries "${nrniv_core_prefix}/special-core" "${corenrn_mech_library}")
 
 add_custom_command(
   OUTPUT ${output_binaries}
-  DEPENDS coreneuron-core ${NMODL_TARGET_TO_DEPEND} ${modfiles}
-          ${CORENEURON_BUILTIN_MODFILES}
+  DEPENDS coreneuron-core ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES}
   COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m
           ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}"
   WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin

From c504a872986a2563cd57623c79d096862dca03a6 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 18:14:55 +0200
Subject: [PATCH 043/128] fix static linking

---
 extra/nrnivmodl_core_makefile.in | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 24e630f92..6c7bd2cb2 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -229,8 +229,10 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 
 # build static library of mechanisms
 coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
-	rm -f ${COREMECH_LIB_PATH}; \
-	ar cq ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS);
+	# make a libcoreneuron.a by copying libcoreneuron-core.a and then appending
+	# the newly compiled objects
+	cp $(CORENRN_LIB_DIR)/libcoreneuron-core.a ${COREMECH_LIB_PATH}
+	ar r ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS)
 
 # compile cpp files to .o
 $(MOD_OBJS_DIR)/%.o: $(MOD_TO_CPP_DIR)/%.cpp | $(MOD_OBJS_DIR)

From 5d61ca26479058ea0e8d474e889349c888d49251 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 18:23:40 +0200
Subject: [PATCH 044/128] linking fixups

---
 coreneuron/CMakeLists.txt | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index e1754cc66..cae2d8f66 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -307,9 +307,16 @@ endif()
 
 # Create an extra target for internal use that unit tests and so on can depend on.
 # ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the compiled default
-# mechanisms and the content of libcoreneuron-core.a
+# mechanisms and the content of libcoreneuron-core.a. Also copy the dependencies of
+# libcoreneuron-core as interface dependencies of this new target (example: ${corenrn_mech_library}
+# will probably depend on MPI, so when the unit tests link against ${corenrn_mech_library} they need
+# to know about MPI too).
 add_library(coreneuron-all INTERFACE)
 target_link_libraries(coreneuron-all INTERFACE "${corenrn_mech_library}")
+get_target_property(coreneuron_core_deps coreneuron-core LINK_LIBRARIES)
+foreach(dep ${coreneuron_core_deps})
+  target_link_libraries(coreneuron-all INTERFACE ${dep})
+endforeach()
 
 # Make headers avail to build tree
 configure_file(engine.h.in ${CMAKE_BINARY_DIR}/include/coreneuron/engine.h @ONLY)

From c147425c52cb55951a4222d6a78031d41e855507 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 18:27:51 +0200
Subject: [PATCH 045/128] minor tweaks

---
 coreneuron/CMakeLists.txt | 5 ++---
 coreneuron/mpi/nrnmpi.h   | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index cae2d8f66..da473fdac 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -301,8 +301,7 @@ add_custom_target(nrniv-core ALL DEPENDS ${output_binaries})
 
 if(CORENRN_ENABLE_GPU)
   separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}")
-  target_compile_options(coreneuron-core BEFORE
-                         PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${CORENRN_ACC_FLAGS}>)
+  target_compile_options(coreneuron-core PRIVATE ${CORENRN_ACC_FLAGS})
 endif()
 
 # Create an extra target for internal use that unit tests and so on can depend on.
@@ -310,7 +309,7 @@ endif()
 # mechanisms and the content of libcoreneuron-core.a. Also copy the dependencies of
 # libcoreneuron-core as interface dependencies of this new target (example: ${corenrn_mech_library}
 # will probably depend on MPI, so when the unit tests link against ${corenrn_mech_library} they need
-# to know about MPI too).
+# to know to link against MPI too).
 add_library(coreneuron-all INTERFACE)
 target_link_libraries(coreneuron-all INTERFACE "${corenrn_mech_library}")
 get_target_property(coreneuron_core_deps coreneuron-core LINK_LIBRARIES)
diff --git a/coreneuron/mpi/nrnmpi.h b/coreneuron/mpi/nrnmpi.h
index 04df699ff..03a1d2461 100644
--- a/coreneuron/mpi/nrnmpi.h
+++ b/coreneuron/mpi/nrnmpi.h
@@ -81,7 +81,7 @@ struct mpi_function<std::integral_constant<function_ptr, fptr>>: mpi_function_ba
     using mpi_function_base::mpi_function_base;
     template <typename... Args>  // in principle deducible from `function_ptr`
     auto operator()(Args&&... args) const {
-#ifdef CORENRN_ENABLE_MPI_DYNAMIC
+#ifdef CORENEURON_ENABLE_MPI_DYNAMIC
         // Dynamic MPI, m_fptr should have been initialised via dlsym.
         assert(m_fptr);
         return (*reinterpret_cast<decltype(fptr)>(m_fptr))(std::forward<Args>(args)...);

From 6a7c824c2514e105b62f66a29fe9512481a2a643 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 19 Jul 2022 18:47:17 +0200
Subject: [PATCH 046/128] One more tweak

---
 coreneuron/CMakeLists.txt | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index da473fdac..24facf06a 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -306,16 +306,18 @@ endif()
 
 # Create an extra target for internal use that unit tests and so on can depend on.
 # ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the compiled default
-# mechanisms and the content of libcoreneuron-core.a. Also copy the dependencies of
-# libcoreneuron-core as interface dependencies of this new target (example: ${corenrn_mech_library}
-# will probably depend on MPI, so when the unit tests link against ${corenrn_mech_library} they need
-# to know to link against MPI too).
+# mechanisms and the content of libcoreneuron-core.a.
 add_library(coreneuron-all INTERFACE)
 target_link_libraries(coreneuron-all INTERFACE "${corenrn_mech_library}")
+# Also copy the dependencies of libcoreneuron-core as interface dependencies of this new target
+# (example: ${corenrn_mech_library} will probably depend on MPI, so when the unit tests link against
+# ${corenrn_mech_library} they need to know to link against MPI too).
 get_target_property(coreneuron_core_deps coreneuron-core LINK_LIBRARIES)
-foreach(dep ${coreneuron_core_deps})
-  target_link_libraries(coreneuron-all INTERFACE ${dep})
-endforeach()
+if(coreneuron_core_deps)
+  foreach(dep ${coreneuron_core_deps})
+    target_link_libraries(coreneuron-all INTERFACE ${dep})
+  endforeach()
+endif()
 
 # Make headers avail to build tree
 configure_file(engine.h.in ${CMAKE_BINARY_DIR}/include/coreneuron/engine.h @ONLY)

From fc2b5722113e26e31e6182b98807613f5148ece3 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 12:24:26 +0200
Subject: [PATCH 047/128] ar: avoid --output

---
 extra/nrnivmodl_core_makefile.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 6c7bd2cb2..47681fb5a 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -218,7 +218,8 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	# extract the object files from libcoreneuron-core.a
 	mkdir -p $(MOD_OBJS_DIR)/libcoreneuron-core
 	rm -f $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
-	ar --output=$(MOD_OBJS_DIR)/libcoreneuron-core x $(CORENRN_LIB_DIR)/libcoreneuron-core.a
+	# --output is only supported by modern versions of ar
+	(cd $(MOD_OBJS_DIR)/libcoreneuron-core && ar x $(CORENRN_LIB_DIR)/libcoreneuron-core.a)
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
 	  $(LDFLAGS) ${SONAME_OPTION} -Wl,--start-group \

From 0e9f713b8ae5f764a21a748ba4fdb8029a23f2b1 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 13:52:08 +0200
Subject: [PATCH 048/128] Try and fix macOS linking.

---
 CMake/MakefileBuildOptions.cmake | 9 +++++++++
 extra/nrnivmodl_core_makefile.in | 6 +++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 29edf2bd1..b99442018 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -70,6 +70,15 @@ coreneuron_process_target(coreneuron-core)
 get_property(CORENEURON_LIB_LINK_FLAGS GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS)
 message(STATUS "CORENEURON_LIB_LINK_FLAGS=${CORENEURON_LIB_LINK_FLAGS}")
 
+# Detect if --start-group and --end-group are valid linker arguments. These are typically needed
+# when linking mutually-dependent .o files (or where we don't know the correct order) on Linux, but
+# they are not needed *or* recognised by the macOS linker.
+include(CheckLinkerFlag) # requires CMake 3.18
+check_linker_flag(CXX -Wl,--start-group CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
+if(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
+  set(CORENEURON_LINKER_START_GROUP -Wl,--start-group)
+  set(CORENEURON_LINKER_END_GROUP -Wl,--end-group)
+endif()
 # Things that used to be in CORENEURON_LIB_LINK_FLAGS: -rdynamic -lrt -Wl,--whole-archive
 # -L${CMAKE_HOST_SYSTEM_PROCESSOR} -Wl,--no-whole-archive -L${caliper_LIB_DIR} -l${CALIPER_LIB}
 
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 47681fb5a..2c732a288 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -222,9 +222,9 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	(cd $(MOD_OBJS_DIR)/libcoreneuron-core && ar x $(CORENRN_LIB_DIR)/libcoreneuron-core.a)
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  $(LDFLAGS) ${SONAME_OPTION} -Wl,--start-group \
-	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o \
-		-Wl,--end-group -Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR)
+	  $(LDFLAGS) ${SONAME_OPTION} @CORENEURON_LINKER_START_GROUP@ \
+	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o @CORENEURON_LINKER_END_GROUP@ \
+		-Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR)
 	# cleanup
 	rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
 

From 3dc6884b5ee016a2d54ab22f099fa6c3cc895155 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 14:00:01 +0200
Subject: [PATCH 049/128] Try and fix Caliper in GitLab CI.

---
 .gitlab-ci.yml | 49 ++++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6a9abb44b..c299a836e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -81,68 +81,63 @@ build:nmodl:
     SPACK_PACKAGE_COMPILER: gcc
 
 # Build CoreNEURON
-build:coreneuron:mod2c:nvhpc:acc:
-  extends: [.build, .spack_nvhpc]
+.build_coreneuron:
+  extends: [.build]
   variables:
     SPACK_PACKAGE: coreneuron
+    # NEURON depends on py-mpi4py, most of whose dependencies are pulled in by
+    # nmodl%gcc, with the exception of MPI, which is pulled in by
+    # coreneuron%{nvhpc,intel}. hpe-mpi is an external package anyway, so
+    # setting its compiler is just changing how it is labelled in the
+    # dependency graph and not changing which installation is used, but this
+    # means that in the NEURON step an existing py-mpi4py%gcc can be used.
+    # Otherwise a new py-mpi4py with hpe-mpi%{nvhpc,intel} will be built.
+    # caliper: papi%nvhpc does not build; use the caliper from the deployment
+    # TODO: fix this more robustly so we don't have to play so many games.
+    SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc ^caliper%gcc+cuda cuda_arch=70
+
+build:coreneuron:mod2c:nvhpc:acc:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
 build:coreneuron:mod2c:nvhpc:acc:shared:
-  extends: [.build, .spack_nvhpc]
+  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     SPACK_PACKAGE_SPEC: +caliper+gpu+openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
 
 # Build CoreNEURON with Unified Memory on GPU
 build:coreneuron:mod2c:nvhpc:acc:unified:
-  extends: [.build, .spack_nvhpc]
+  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     SPACK_PACKAGE_SPEC: ~caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
-.build_coreneuron_nmodl:
-  extends: [.build]
-  variables:
-    # NEURON depends on py-mpi4py, most of whose dependencies are pulled in by
-    # nmodl%gcc, with the exception of MPI, which is pulled in by
-    # coreneuron%{nvhpc,intel}. hpe-mpi is an external package anyway, so
-    # setting its compiler is just changing how it is labelled in the
-    # dependency graph and not changing which installation is used, but this
-    # means that in the NEURON step an existing py-mpi4py%gcc can be used.
-    # Otherwise a new py-mpi4py with hpe-mpi%{nvhpc,intel} will be built.
-    # TODO: fix this more robustly so we don't have to play so many games.
-    SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc
-
 build:coreneuron:nmodl:nvhpc:omp:
-  extends: [.build_coreneuron_nmodl, .spack_nvhpc]
+  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:nmodl:nvhpc:acc:
-  extends: [.build_coreneuron_nmodl, .spack_nvhpc]
+  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE: coreneuron
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     # Sympy + OpenMP target offload does not currently work with NVHPC
     SPACK_PACKAGE_SPEC: ~caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:mod2c:intel:
-  extends: [.build, .spack_intel]
+  extends: [.build_coreneuron, .spack_intel]
   variables:
-    SPACK_PACKAGE: coreneuron
     SPACK_PACKAGE_SPEC: +caliper+tests~legacy-unit build_type=Debug
 
 build:coreneuron:nmodl:intel:
-  extends: [.build_coreneuron_nmodl, .spack_intel]
+  extends: [.build_coreneuron, .spack_intel]
   variables:
-    SPACK_PACKAGE: coreneuron
     SPACK_PACKAGE_SPEC: ~caliper+nmodl+tests~legacy-unit build_type=Debug
   needs: ["build:nmodl"]
 

From d41e82bf1bc849b3830a5fe04fa3d8bbe45489cb Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 14:08:19 +0200
Subject: [PATCH 050/128] simplify: +caliper in all CoreNEURON builds

---
 .gitlab-ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c299a836e..a7e0f39bc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -113,7 +113,7 @@ build:coreneuron:mod2c:nvhpc:acc:unified:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: ~caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
 build:coreneuron:nmodl:nvhpc:omp:
   extends: [.build_coreneuron, .spack_nvhpc]
@@ -127,7 +127,7 @@ build:coreneuron:nmodl:nvhpc:acc:
   variables:
     # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     # Sympy + OpenMP target offload does not currently work with NVHPC
-    SPACK_PACKAGE_SPEC: ~caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:mod2c:intel:
@@ -138,7 +138,7 @@ build:coreneuron:mod2c:intel:
 build:coreneuron:nmodl:intel:
   extends: [.build_coreneuron, .spack_intel]
   variables:
-    SPACK_PACKAGE_SPEC: ~caliper+nmodl+tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+nmodl+tests~legacy-unit build_type=Debug
   needs: ["build:nmodl"]
 
 # Build NEURON

From acd5a7e84d23eac71ecc82ff41f2cf4984b33d42 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 15:16:09 +0200
Subject: [PATCH 051/128] Disable present table when unified memory enabled.

---
 coreneuron/utils/offload.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index 1f068c4d7..f37724bb4 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -26,7 +26,7 @@
 
 namespace coreneuron {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
-    defined(_OPENACC)
+    defined(_OPENACC) && !defined(CORENEURON_UNIFIED_MEMORY)
 // Homegrown implementation for buggy NVHPC versions (<=22.3?)
 #define CORENEURON_ENABLE_PRESENT_TABLE
 void* cnrn_target_deviceptr_impl(void const* h_ptr);

From 2a07e146ad159f45074d8288207d2c6a9ff5b4e2 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 15:17:36 +0200
Subject: [PATCH 052/128] Shuffle Random123 + GPU yet again for 22.3

---
 coreneuron/utils/randoms/nrnran123.cpp | 28 ++++----------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index f2dd2dee2..8a2f07866 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -87,24 +87,15 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
 OMP_Mutex g_instance_count_mutex;
 std::size_t g_instance_count{};
 
-philox4x32_key_t g_k{};
 #ifdef __CUDACC__
-// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
-// but it's clearer and the overhead cannot be high (if it exists).
-__constant__ __device__ philox4x32_key_t g_k_dev{};
-// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
-__attribute__((noinline)) philox4x32_key_t& global_state() {
-    if target (nv::target::is_device) {
-        return g_k_dev;
-    } else {
-        return g_k;
-    }
-}
+#define g_k_qualifiers __device__ __constant__
 #else
+#define g_k_qualifiers
+#endif
+g_k_qualifiers philox4x32_key_t g_k{};
 philox4x32_key_t& global_state() {
     return g_k;
 }
-#endif
 }  // namespace
 
 philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
@@ -138,19 +129,8 @@ void nrnran123_set_globalindex(uint32_t gix) {
     if (g_k.v[0] != gix) {
         g_k.v[0] = gix;
         if (coreneuron::gpu_enabled()) {
-#ifdef __CUDACC__
-            {
-                auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
-                assert(code == cudaSuccess);
-            }
-            {
-                auto const code = cudaDeviceSynchronize();
-                assert(code == cudaSuccess);
-            }
-#else
             nrn_pragma_acc(update device(g_k))
             nrn_pragma_omp(target update to(g_k))
-#endif
         }
     }
 }

From 93af3f3ee070a67f0bcc541b4b36e00d4d8e451d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 16:31:54 +0200
Subject: [PATCH 053/128] Some CORENEURON_ -> CORENRN_ for consistency. Export
 OpenACC flags to NEURON separately as well as as part of the whole ...
 -lcoreneuron ... link line.

---
 CMake/MakefileBuildOptions.cmake | 15 +++++++--------
 CMake/OpenAccHelper.cmake        | 17 +++++++++--------
 CMake/coreneuron-config.cmake.in |  3 ++-
 CMakeLists.txt                   |  4 ++--
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index b99442018..2a5b6a7d9 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -41,7 +41,7 @@ function(coreneuron_process_target target)
       # This is a special case: libcoreneuron-core.a is manually unpacked into .o files by the
       # nrnivmodl-core Makefile, so we do not want to also emit an -lcoreneuron-core argument. TODO:
       # probably need to extract an -L and RPATH path and include that here?
-      set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${target}")
+      set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${target}")
     endif()
     get_target_property(target_libraries ${target} LINK_LIBRARIES)
     if(target_libraries)
@@ -55,20 +55,19 @@ function(coreneuron_process_target target)
   message(STATUS "target=${target} target_dir=${target_dir}")
   if(NOT target_dir)
     # In case target is not a target but is just the name of a library, e.g. "dl"
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${target}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${target}")
   elseif("${target_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
     # e.g. /usr/lib64/libpthread.so -> -lpthread
     get_filename_component(libname ${target} NAME_WE)
     string(REGEX REPLACE "^lib" "" libname ${libname})
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${libname}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${libname}")
   else()
     # It's a full path, include that on the line
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " ${target}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${target}")
   endif()
 endfunction()
 coreneuron_process_target(coreneuron-core)
-get_property(CORENEURON_LIB_LINK_FLAGS GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS)
-message(STATUS "CORENEURON_LIB_LINK_FLAGS=${CORENEURON_LIB_LINK_FLAGS}")
+get_property(CORENRN_LIB_LINK_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS)
 
 # Detect if --start-group and --end-group are valid linker arguments. These are typically needed
 # when linking mutually-dependent .o files (or where we don't know the correct order) on Linux, but
@@ -79,7 +78,7 @@ if(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
   set(CORENEURON_LINKER_START_GROUP -Wl,--start-group)
   set(CORENEURON_LINKER_END_GROUP -Wl,--end-group)
 endif()
-# Things that used to be in CORENEURON_LIB_LINK_FLAGS: -rdynamic -lrt -Wl,--whole-archive
+# Things that used to be in CORENRN_LIB_LINK_FLAGS: -rdynamic -lrt -Wl,--whole-archive
 # -L${CMAKE_HOST_SYSTEM_PROCESSOR} -Wl,--no-whole-archive -L${caliper_LIB_DIR} -l${CALIPER_LIB}
 
 # =============================================================================
@@ -91,7 +90,7 @@ list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_D
 # Extra link flags that we need to include when linking libcoreneuron.{a,so} in CoreNEURON but that
 # do not need to be passed to NEURON to use when linking nrniv/special (why?)
 # =============================================================================
-string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENEURON_LIB_LINK_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
+string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_LIB_LINK_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
 if(CORENRN_SANITIZER_LIBRARY_DIR)
   string(APPEND CORENRN_COMMON_LDFLAGS " -Wl,-rpath,${CORENRN_SANITIZER_LIBRARY_DIR}")
 endif()
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 8ba850456..f50f1436a 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -66,12 +66,8 @@ if(CORENRN_ENABLE_GPU)
   # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
   # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
   # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
-  # code is compiled with the same CUDA version as the explicit CUDA code. TODO nordc option is
-  # added based on the recommendation from:
-  # https://forums.developer.nvidia.com/t/separate-compilation-of-mixed-cuda-openacc-code/192701 but
-  # as discussed in https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086742194 this
-  # is still not completely solving underlying link issue.
-  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo,rdc")
+  # code is compiled with the same CUDA version as the explicit CUDA code.
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
@@ -104,8 +100,13 @@ if(CORENRN_ENABLE_GPU AND CORENRN_ENABLE_SHARED)
   # Because of
   # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
   # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an
-  # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds
-  set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
+  # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds.
+  # CORENRN_LIB_LINK_FLAGS is the full set of flags needed to link against libcoreneuron.so:
+  # something like `-acc -lcoreneuron ...`. CORENRN_NEURON_LINK_FLAGS only contains flags that need
+  # to be used when linking the NEURON Python module to make sure it is able to dynamically load
+  # libcoreneuron.so.
+  set_property(GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
+  set_property(GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
 endif()
 
 if(CORENRN_HAVE_NVHPC_COMPILER)
diff --git a/CMake/coreneuron-config.cmake.in b/CMake/coreneuron-config.cmake.in
index c5f8573d0..4fe3988c3 100644
--- a/CMake/coreneuron-config.cmake.in
+++ b/CMake/coreneuron-config.cmake.in
@@ -15,7 +15,8 @@ set(CORENRN_ENABLE_GPU @CORENRN_ENABLE_GPU@)
 set(CORENRN_ENABLE_NMODL @CORENRN_ENABLE_NMODL@)
 set(CORENRN_ENABLE_REPORTING @CORENRN_ENABLE_REPORTING@)
 set(CORENRN_ENABLE_SHARED @CORENRN_ENABLE_SHARED@)
-set(CORENEURON_LIB_LINK_FLAGS "@CORENEURON_LIB_LINK_FLAGS@")
+set(CORENRN_LIB_LINK_FLAGS "@CORENRN_LIB_LINK_FLAGS@")
+set(CORENRN_NEURON_LINK_FLAGS "@CORENRN_NEURON_LINK_FLAGS@")
 
 find_path(CORENEURON_INCLUDE_DIR "coreneuron/coreneuron.h" HINTS "${CONFIG_PATH}/../../include")
 find_path(
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00e74896e..f3b1e7ee9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -478,7 +478,7 @@ endif()
 add_subdirectory(coreneuron)
 
 # Extract the various compiler option strings to use inside nrnivmodl-core. Sets the global property
-# CORENEURON_LIB_LINK_FLAGS, which contains the arguments that must be added to the link line for
+# CORENRN_LIB_LINK_FLAGS, which contains the arguments that must be added to the link line for
 # `special` to link against `libcoreneuron.{a,so}`
 include(MakefileBuildOptions)
 
@@ -492,7 +492,7 @@ endif()
 # =============================================================================
 # Install cmake modules
 # =============================================================================
-get_property(CORENEURON_LIB_LINK_FLAGS GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS)
+get_property(CORENRN_NEURON_LINK_FLAGS GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS)
 configure_file(CMake/coreneuron-config.cmake.in CMake/coreneuron-config.cmake @ONLY)
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/CMake/coreneuron-config.cmake" DESTINATION share/cmake)
 install(EXPORT coreneuron DESTINATION share/cmake)

From bd10048d8c4ff89fe61e8b7372a23c43d359d002 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 17:52:17 +0200
Subject: [PATCH 054/128] cmake fixups

---
 CMake/MakefileBuildOptions.cmake | 57 ++++++++++++++++++++++----------
 coreneuron/CMakeLists.txt        |  1 +
 tests/integration/CMakeLists.txt |  2 +-
 3 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 2a5b6a7d9..7fec40860 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -35,13 +35,45 @@ set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc")
 # =============================================================================
 # Essentially we "just" want to unpack the CMake dependencies of the `coreneuron-core` target into a
 # plain string that we can bake into the Makefiles in both NEURON and CoreNEURON.
+function(coreneuron_process_library_path library)
+  get_filename_component(library_dir "${library}" DIRECTORY)
+  if(NOT library_dir)
+    # In case target is not a target but is just the name of a library, e.g. "dl"
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${library}")
+  elseif("${library_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
+    # e.g. /usr/lib64/libpthread.so -> -lpthread
+    get_filename_component(libname ${library} NAME_WE)
+    string(REGEX REPLACE "^lib" "" libname ${libname})
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${libname}")
+  else()
+    # It's a full path, include that on the line
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${library}")
+  endif()
+endfunction()
 function(coreneuron_process_target target)
   if(TARGET ${target})
     if(NOT target STREQUAL "coreneuron-core")
       # This is a special case: libcoreneuron-core.a is manually unpacked into .o files by the
-      # nrnivmodl-core Makefile, so we do not want to also emit an -lcoreneuron-core argument. TODO:
-      # probably need to extract an -L and RPATH path and include that here?
-      set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${target}")
+      # nrnivmodl-core Makefile, so we do not want to also emit an -lcoreneuron-core argument.
+      get_target_property(target_inc_dirs ${target} INTERFACE_INCLUDE_DIRECTORIES)
+      if(target_inc_dirs)
+        foreach(inc_dir_genex ${target_inc_dirs})
+          string(GENEX_STRIP "${inc_dir_genex}" inc_dir)
+          if(inc_dir)
+            set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_EXTRA_COMPILE_FLAGS " -I${inc_dir}")
+          endif()
+        endforeach()
+      endif()
+      get_target_property(target_imported ${target} IMPORTED)
+      if(target_imported)
+        # In this case we can extract the full path to the library
+        get_target_property(target_location ${target} LOCATION)
+        coreneuron_process_library_path(${target_location})
+      else()
+        # This is probably another of our libraries, like -lcoreneuron-cuda. We might need to add -L
+        # and an RPATH later.
+        set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${target}")
+      endif()
     endif()
     get_target_property(target_libraries ${target} LINK_LIBRARIES)
     if(target_libraries)
@@ -51,22 +83,10 @@ function(coreneuron_process_target target)
     endif()
     return()
   endif()
-  get_filename_component(target_dir "${target}" DIRECTORY)
-  message(STATUS "target=${target} target_dir=${target_dir}")
-  if(NOT target_dir)
-    # In case target is not a target but is just the name of a library, e.g. "dl"
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${target}")
-  elseif("${target_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
-    # e.g. /usr/lib64/libpthread.so -> -lpthread
-    get_filename_component(libname ${target} NAME_WE)
-    string(REGEX REPLACE "^lib" "" libname ${libname})
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${libname}")
-  else()
-    # It's a full path, include that on the line
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${target}")
-  endif()
+  coreneuron_process_library_path("${target}")
 endfunction()
 coreneuron_process_target(coreneuron-core)
+get_property(CORENRN_EXTRA_COMPILE_FLAGS GLOBAL PROPERTY CORENRN_EXTRA_COMPILE_FLAGS)
 get_property(CORENRN_LIB_LINK_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS)
 
 # Detect if --start-group and --end-group are valid linker arguments. These are typically needed
@@ -110,7 +130,8 @@ string(
   ${NVHPC_ACC_COMP_FLAGS}
   ${NVHPC_CXX_INLINE_FLAGS}
   ${CORENRN_COMPILE_DEF_FLAGS}
-  ${CORENRN_EXTRA_MECH_CXX_FLAGS})
+  ${CORENRN_EXTRA_MECH_CXX_FLAGS}
+  ${CORENRN_EXTRA_COMPILE_FLAGS})
 
 # =============================================================================
 # nmodl/mod2c related options : TODO
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 24facf06a..168fdf708 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -216,6 +216,7 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
       list(GET NRN_MPI_LIBNAME_LIST ${val} libname)
 
       add_library(core${libname}_lib SHARED ${MPI_LIB_FILES})
+      target_link_libraries(core${libname}_lib ${CORENRN_CALIPER_LIB})
       target_include_directories(
         core${libname}_lib
         PUBLIC ${include}
diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
index 75ae106e1..17b57084b 100644
--- a/tests/integration/CMakeLists.txt
+++ b/tests/integration/CMakeLists.txt
@@ -12,7 +12,7 @@ if(CORENRN_ENABLE_MPI_DYNAMIC)
   # building single generic mpi library libcorenrn_mpi.<suffix>
   # ~~~
   if(CORENEURON_AS_SUBPROJECT)
-    message(INFO "CoreNEURON integration tests are disabled with dynamic MPI")
+    message(STATUS "CoreNEURON integration tests are disabled with dynamic MPI")
     return()
   else()
     set(CORENRN_MPI_LIB_ARG

From b0b74517c4289918c5270b29c107a2ada95ce651 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 18:04:12 +0200
Subject: [PATCH 055/128] Explicit random123 global state data transfer.

---
 coreneuron/apps/main1.cpp              |  2 ++
 coreneuron/utils/randoms/nrnran123.cpp | 14 ++++++++++++++
 coreneuron/utils/randoms/nrnran123.h   |  2 ++
 3 files changed, 18 insertions(+)

diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index b019748fd..4408234b6 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -506,6 +506,7 @@ extern "C" void mk_mech_init(int argc, char** argv) {
 #ifdef CORENEURON_ENABLE_GPU
     if (corenrn_param.gpu) {
         init_gpu();
+        nrnran123_initialise_global_state_on_device();
     }
 #endif
 
@@ -683,6 +684,7 @@ extern "C" int run_solve_core(int argc, char** argv) {
         if (nrn_have_gaps) {
             nrn_partrans::delete_gap_indices_from_device();
         }
+        nrnran123_destroy_global_state_on_device();
     }
 
     // Cleaning the memory
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 8a2f07866..5a92ae8ee 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -135,6 +135,20 @@ void nrnran123_set_globalindex(uint32_t gix) {
     }
 }
 
+void nrnran123_initialise_global_state_on_device() {
+    if (coreneuron::gpu_enabled()) {
+        auto& g_k = global_state();
+        nrn_pragma_acc(enter data copyin(g_k))
+    }
+}
+
+void nrnran123_destroy_global_state_on_device() {
+    if (coreneuron::gpu_enabled()) {
+        auto& g_k = global_state();
+        nrn_pragma_acc(exit data delete (g_k))
+    }
+}
+
 /** @brief Allocate a new Random123 stream.
  *  @todo  It would be nicer if the API return type was
  *  std::unique_ptr<nrnran123_State, ...not specified...>, so we could use a
diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h
index 12484d3d4..d4108612d 100644
--- a/coreneuron/utils/randoms/nrnran123.h
+++ b/coreneuron/utils/randoms/nrnran123.h
@@ -69,6 +69,8 @@ philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_St
 nrn_pragma_omp(end declare target)
 
 namespace coreneuron {
+void nrnran123_initialise_global_state_on_device();
+void nrnran123_destroy_global_state_on_device();
 
 /* global index. eg. run number */
 /* all generator instances share this global index */

From 179af80466457b2722c7c727a8aae2445c844506 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 26 Jul 2022 18:17:41 +0200
Subject: [PATCH 056/128] random123 global state v59

---
 coreneuron/utils/randoms/nrnran123.cpp | 28 ++++++++++++++------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 5a92ae8ee..fadfa7b5b 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -25,17 +25,6 @@
 #include <memory>
 #include <mutex>
 
-// Defining these attributes seems to help nvc++ in OpenMP target offload mode.
-#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
-    defined(_OPENMP) && defined(__CUDACC__)
-#define CORENRN_HOST_DEVICE __host__ __device__
-#elif defined(__CUDACC__)
-// This is necessary to make the new CUDA-syntax-in-.cpp version compile
-#define CORENRN_HOST_DEVICE __host__ __device__
-#else
-#define CORENRN_HOST_DEVICE
-#endif
-
 namespace {
 #ifdef CORENEURON_USE_BOOST_POOL
 /** Tag type for use with boost::fast_pool_allocator that forwards to
@@ -129,23 +118,36 @@ void nrnran123_set_globalindex(uint32_t gix) {
     if (g_k.v[0] != gix) {
         g_k.v[0] = gix;
         if (coreneuron::gpu_enabled()) {
+#ifdef __CUDACC__
+            {
+                auto const code = cudaMemcpyToSymbol(g_k, &g_k, sizeof(g_k));
+                assert(code == cudaSuccess);
+            }
+            {
+                auto const code = cudaDeviceSynchronize();
+                assert(code == cudaSuccess);
+            }
+#else
             nrn_pragma_acc(update device(g_k))
             nrn_pragma_omp(target update to(g_k))
+#endif
         }
     }
 }
 
 void nrnran123_initialise_global_state_on_device() {
     if (coreneuron::gpu_enabled()) {
-        auto& g_k = global_state();
+#ifndef __CUDACC__
         nrn_pragma_acc(enter data copyin(g_k))
+#endif
     }
 }
 
 void nrnran123_destroy_global_state_on_device() {
     if (coreneuron::gpu_enabled()) {
-        auto& g_k = global_state();
+#ifndef __CUDACC__
         nrn_pragma_acc(exit data delete (g_k))
+#endif
     }
 }
 

From fb8018e41e28eea3653fbcaf1e123ca51c5e3f3b Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 27 Jul 2022 10:38:44 +0200
Subject: [PATCH 057/128] random123 global state v73

---
 coreneuron/utils/randoms/nrnran123.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index fadfa7b5b..f2bfed11a 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -82,7 +82,10 @@ std::size_t g_instance_count{};
 #define g_k_qualifiers
 #endif
 g_k_qualifiers philox4x32_key_t g_k{};
-philox4x32_key_t& global_state() {
+// Cannot refer to g_k directly from a nrn_pragma_acc(routine seq) method like
+// coreneuron_random123_philox4x32_helper, and cannot have this inlined there at
+// higher optimisation levels
+__attribute__((noinline)) philox4x32_key_t& global_state() {
     return g_k;
 }
 }  // namespace

From 66a39385a438933b6135c95d809bfcd0279a6b55 Mon Sep 17 00:00:00 2001
From: pramodk <pramod.s.kumbhar@gmail.com>
Date: Wed, 27 Jul 2022 12:05:08 +0200
Subject: [PATCH 058/128] fix prototype for _check_table_thread

---
 coreneuron/mechanism/membfunc.hpp      | 2 +-
 coreneuron/mechanism/register_mech.cpp | 2 +-
 coreneuron/mechanism/register_mech.hpp | 2 +-
 coreneuron/sim/multicore.cpp           | 2 +-
 external/mod2c                         | 2 +-
 external/nmodl                         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 8fe04a06c..87927780c 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -40,7 +40,7 @@ struct Memb_func {
     int thread_size_;                       /* how many Datum needed in Memb_list if vectorized */
     void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */
     void (*thread_cleanup_)(ThreadDatum*);  /* before Memb_list._thread is freed */
-    void (*thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, int);
+    void (*thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int);
     int is_point;
     void (*setdata_)(double*, Datum*);
     int* dparam_semantics; /* for nrncore writing. */
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 01b82814c..20fbcd424 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -418,7 +418,7 @@ void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)) {
 }
 
 void _nrn_thread_table_reg(int i,
-                           void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, int)) {
+                           void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int)) {
     if (i == -1)
         return;
 
diff --git a/coreneuron/mechanism/register_mech.hpp b/coreneuron/mechanism/register_mech.hpp
index df80d958a..07fa1ca5c 100644
--- a/coreneuron/mechanism/register_mech.hpp
+++ b/coreneuron/mechanism/register_mech.hpp
@@ -18,7 +18,7 @@ extern void hoc_reg_bbcore_read(int type, bbcore_read_t f);
 extern void hoc_reg_bbcore_write(int type, bbcore_write_t f);
 extern void _nrn_thread_table_reg(
     int i,
-    void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, int));
+    void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int));
 extern void alloc_mech(int);
 
 }  // namespace coreneuron
diff --git a/coreneuron/sim/multicore.cpp b/coreneuron/sim/multicore.cpp
index d5368a29c..cf8daaac8 100644
--- a/coreneuron/sim/multicore.cpp
+++ b/coreneuron/sim/multicore.cpp
@@ -166,7 +166,7 @@ void nrn_thread_table_check() {
         auto tml = static_cast<NrnThreadMembList*>(table_check_[i + 1]._pvoid);
         Memb_list* ml = tml->ml;
         (*corenrn.get_memb_func(tml->index).thread_table_check_)(
-            0, ml->_nodecount_padded, ml->data, ml->pdata, ml->_thread, &nt, tml->index);
+            0, ml->_nodecount_padded, ml->data, ml->pdata, ml->_thread, &nt, ml, tml->index);
     }
 }
 }  // namespace coreneuron
diff --git a/external/mod2c b/external/mod2c
index 9d21b18a0..d8507dec1 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 9d21b18a0036810f3ced1a8b16428754b87c8e87
+Subproject commit d8507dec1671b43b012e4e1ab36160e6da21aabb
diff --git a/external/nmodl b/external/nmodl
index 7000ff612..b99496a91 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 7000ff612208ed8b27837438731903c58d1786e3
+Subproject commit b99496a919df98a89cd97cb6898dda49f0d17c56

From ad634e909a7d05057a410f9f560bd289974e3754 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 27 Jul 2022 17:48:40 +0200
Subject: [PATCH 059/128] add more GPU builds

---
 .gitlab-ci.yml | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a7e0f39bc..eb68083a1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -99,35 +99,40 @@ build:nmodl:
 build:coreneuron:mod2c:nvhpc:acc:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
 
+build:coreneuron:mod2c:nvhpc:acc:debug:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=Debug
+
 build:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
     SPACK_PACKAGE_SPEC: +caliper+gpu+openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
 
+build:coreneuron:mod2c:nvhpc:acc:shared:debug:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp+shared+tests~legacy-unit build_type=Debug
+
 # Build CoreNEURON with Unified Memory on GPU
 build:coreneuron:mod2c:nvhpc:acc:unified:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=Debug
 
-build:coreneuron:nmodl:nvhpc:omp:
+build:coreneuron:nmodl:nvhpc:acc:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
-build:coreneuron:nmodl:nvhpc:acc:
+build:coreneuron:nmodl:nvhpc:omp:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
-    # Sympy + OpenMP target offload does not currently work with NVHPC
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
+    # Sympy + OpenMP target offload does not currently work with NVHPC (?)
+    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
 build:coreneuron:mod2c:intel:
@@ -146,10 +151,18 @@ build:neuron:mod2c:nvhpc:acc:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:mod2c:nvhpc:acc"]
 
+build:neuron:mod2c:nvhpc:acc:debug:
+  extends: [.build_neuron, .spack_nvhpc]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:debug"]
+
 build:neuron:mod2c:nvhpc:acc:shared:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
+build:neuron:mod2c:nvhpc:acc:shared:debug:
+  extends: [.build_neuron, .spack_nvhpc]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
+
 build:neuron:nmodl:nvhpc:omp:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:nmodl:nvhpc:omp"]
@@ -171,10 +184,18 @@ test:coreneuron:mod2c:nvhpc:acc:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc"]
 
+test:coreneuron:mod2c:nvhpc:acc:debug:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:debug"]
+
 test:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
+test:coreneuron:mod2c:nvhpc:acc:shared:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
+
 test:coreneuron:mod2c:nvhpc:acc:unified:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:unified"]

From 9ce83c2fcff002473ed7a9ac5beb98ace9d526fe Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 28 Jul 2022 15:13:46 +0200
Subject: [PATCH 060/128] avoid if target (nv::target::is_device)

---
 coreneuron/sim/scopmath/errcodes.h        | 39 ++++++++++++++++-------
 coreneuron/sim/scopmath/sparse_thread.hpp | 35 ++++++++------------
 external/mod2c                            |  2 +-
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/coreneuron/sim/scopmath/errcodes.h b/coreneuron/sim/scopmath/errcodes.h
index 5f32c5785..94d08f73c 100644
--- a/coreneuron/sim/scopmath/errcodes.h
+++ b/coreneuron/sim/scopmath/errcodes.h
@@ -1,16 +1,33 @@
-/******************************************************************************
- *
- * File: errcodes.h
- *
- * Copyright (c) 1984, 1985, 1986, 1987, 1988, 1989, 1990
- *   Duke University
- *
- * errcodes.h,v 1.1.1.1 1994/10/12 17:22:18 hines Exp
- *
- ******************************************************************************/
+/*
+# =============================================================================
+# Originally errcodes.h from SCoP library, Copyright (c) 1984-90 Duke University
+# =============================================================================
+# Subsequent extensive prototype and memory layout changes for CoreNEURON
+#
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
+#
+# See top-level LICENSE file for details.
+# =============================================================================.
+*/
+#pragma once
 namespace coreneuron {
 extern int abort_run(int);
-}
+namespace scopmath {
+/** @brief Flag to disable some code sections at compile time.
+ *
+ *  Some methods, such as coreneuron::scopmath::sparse::getelm(...), decide at
+ *  runtime whether they are simply accessors, or if they dynamically modify the
+ *  matrix in question, possibly allocating new memory. Typically the second
+ *  mode will be used during model initialisation, while the first will be used
+ *  during computation/simulation. Compiling the more complicated code for the
+ *  second mode can be problematic for targets such as GPU, where dynamic
+ *  allocation and global state are complex. This enum is intended to be used as
+ *  a template parameter to flag (at compile time) when this code can be
+ *  omitted.
+ */
+enum struct enabled_code { all, compute_only };
+}  // namespace scopmath
+}  // namespace coreneuron
 #define ROUNDOFF       1.e-20
 #define ZERO           1.e-8
 #define STEP           1.e-6
diff --git a/coreneuron/sim/scopmath/sparse_thread.hpp b/coreneuron/sim/scopmath/sparse_thread.hpp
index 85580011e..8d84cbb0e 100644
--- a/coreneuron/sim/scopmath/sparse_thread.hpp
+++ b/coreneuron/sim/scopmath/sparse_thread.hpp
@@ -13,10 +13,6 @@
 #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
 #include "coreneuron/sim/scopmath/errcodes.h"
 
-#ifdef __CUDACC__
-#include <nv/target>
-#endif
-
 namespace coreneuron {
 namespace scopmath {
 namespace sparse {
@@ -71,7 +67,8 @@ inline void increase_order(SparseObj* so, unsigned row) {
  * biggest difference is that elements are no longer removed and this saves much
  * time allocating and freeing during the solve phase.
  */
-inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
+template <enabled_code code_to_enable = enabled_code::all>
+Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
     Elm *el, *elnext;
 
     unsigned vrow = so->varord[row];
@@ -94,12 +91,10 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
         }
         /* insert below el */
         if (!new_elem) {
-#ifdef __CUDACC__
-            if target (nv::target::is_device) {
+            if constexpr (code_to_enable == enabled_code::compute_only) {
+                // Dynamic allocation should not happen during the compute phase.
                 assert(false);
-            } else
-#endif
-            {
+            } else {
                 new_elem = new Elm{};
                 new_elem->value = new double[so->_cntml_padded];
                 increase_order(so, row);
@@ -143,12 +138,9 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
         }
         /* insert above el */
         if (!new_elem) {
-#ifdef __CUDACC__
-            if target (nv::target::is_device) {
+            if constexpr (code_to_enable == enabled_code::compute_only) {
                 assert(false);
-            } else
-#endif
-            {
+            } else {
                 new_elem = new Elm{};
                 new_elem->value = new double[so->_cntml_padded];
                 increase_order(so, row);
@@ -508,16 +500,13 @@ void create_coef_list(SparseObj* so, int n, SPFUN fun, _threadargsproto_) {
     fun(so, so->rhs, _threadargs_);  // std::invoke in C++17
     so->phase = 0;
 }
-}  // namespace sparse
-}  // namespace scopmath
 
-// Methods that may be called from translated MOD files are kept outside the
-// scopmath::sparse namespace.
-inline double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml) {
+template <enabled_code code_to_enable = enabled_code::all>
+double* thread_getelm(SparseObj* so, int row, int col, int _iml) {
     if (!so->phase) {
         return so->coef_list[so->ngetcall[_iml]++];
     }
-    Elm* el = scopmath::sparse::getelm(so, (unsigned) row, (unsigned) col, nullptr);
+    Elm* el = scopmath::sparse::getelm<code_to_enable>(so, (unsigned) row, (unsigned) col, nullptr);
     if (so->phase == 1) {
         so->ngetcall[_iml]++;
     } else {
@@ -525,7 +514,11 @@ inline double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml) {
     }
     return el->value;
 }
+}  // namespace sparse
+}  // namespace scopmath
 
+// Methods that may be called from translated MOD files are kept outside the
+// scopmath::sparse namespace.
 #define scopmath_sparse_s(arg) _p[scopmath_sparse_ix(s[arg])]
 #define scopmath_sparse_d(arg) _p[scopmath_sparse_ix(d[arg])]
 
diff --git a/external/mod2c b/external/mod2c
index d8507dec1..7d1557eec 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit d8507dec1671b43b012e4e1ab36160e6da21aabb
+Subproject commit 7d1557eecc8800c09ae1368a8a6401957b8bcb57

From 202a5f51f531db996765609c48028e842c5189e0 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 28 Jul 2022 15:35:58 +0200
Subject: [PATCH 061/128] submodule

---
 external/mod2c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/mod2c b/external/mod2c
index 7d1557eec..c37aff7d0 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 7d1557eecc8800c09ae1368a8a6401957b8bcb57
+Subproject commit c37aff7d0bdefbee83ffcb4f2e0d58e9ad806864

From c77d7c286f1c524b5bb887f3343c066b5be95624 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 28 Jul 2022 17:44:57 +0200
Subject: [PATCH 062/128] tweaks

---
 .gitlab-ci.yml                     | 10 +++++++++-
 coreneuron/gpu/nrn_acc_manager.cpp | 29 +++++++++++++++--------------
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index eb68083a1..e867b7eb1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -192,7 +192,7 @@ test:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
-test:coreneuron:mod2c:nvhpc:acc:shared:
+test:coreneuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
 
@@ -221,10 +221,18 @@ test:neuron:mod2c:nvhpc:acc:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:mod2c:nvhpc:acc"]
 
+test:neuron:mod2c:nvhpc:acc:debug:
+  extends: [.test_neuron, .gpu_node]
+  needs: ["build:neuron:mod2c:nvhpc:acc:debug"]
+
 test:neuron:mod2c:nvhpc:acc:shared:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:mod2c:nvhpc:acc:shared"]
 
+test:neuron:mod2c:nvhpc:acc:shared:debug:
+  extends: [.test_neuron, .gpu_node]
+  needs: ["build:neuron:mod2c:nvhpc:acc:shared:debug"]
+
 test:neuron:nmodl:nvhpc:omp:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:nmodl:nvhpc:omp"]
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index d0862b31e..82ef53e0f 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -34,13 +34,13 @@
 
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
 #include <cassert>
+#include <cstddef>
 #include <iostream>
 #include <map>
-#include <mutex>
+#include <shared_mutex>
 namespace {
-enum class byte : unsigned char {};  // std::byte in C++17
-std::map<byte const*, std::pair<std::size_t, byte*>> present_table;
-std::mutex present_table_mutex;
+std::map<std::byte const*, std::pair<std::size_t, std::byte*>> present_table;
+std::shared_mutex present_table_mutex;
 }  // namespace
 #endif
 
@@ -56,8 +56,9 @@ void* cnrn_target_deviceptr_impl(void const* h_ptr) {
     if (!h_ptr) {
         return nullptr;
     }
-    // note no locking, undefined behaviour if you call this concurrently with
-    // the copyin/delete methods (which do lock)
+    // Concurrent calls to this method are safe, but they must be serialised
+    // w.r.t. calls to the cnrn_target_*_update_present_table methods.
+    std::shared_lock _{present_table_mutex};
     assert(!present_table.empty());
     // prev(first iterator greater than h_ptr or last if not found) gives the first iterator less
     // than or equal to h_ptr
@@ -66,10 +67,10 @@ void* cnrn_target_deviceptr_impl(void const* h_ptr) {
             return hp < entry.first;
         }));
     assert(iter != present_table.end());
-    byte const* const h_byte_ptr{static_cast<byte const*>(h_ptr)};
-    byte const* const h_start_of_block{iter->first};
+    std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
+    std::byte const* const h_start_of_block{iter->first};
     std::size_t const block_size{iter->second.first};
-    byte* const d_start_of_block{iter->second.second};
+    std::byte* const d_start_of_block{iter->second.second};
     assert(h_byte_ptr < h_start_of_block + block_size);
     return d_start_of_block + (h_byte_ptr - h_start_of_block);
 }
@@ -78,16 +79,16 @@ void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std
         assert(!d_ptr);
         return;
     }
-    std::lock_guard<std::mutex> _{present_table_mutex};
-    auto const result = present_table.emplace(static_cast<byte const*>(h_ptr),
-                                              std::make_pair(len, static_cast<byte*>(d_ptr)));
+    std::lock_guard _{present_table_mutex};
+    auto const result = present_table.emplace(static_cast<std::byte const*>(h_ptr),
+                                              std::make_pair(len, static_cast<std::byte*>(d_ptr)));
 }
 void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
     if (!h_ptr) {
         return;
     }
-    std::lock_guard<std::mutex> _{present_table_mutex};
-    auto const iter = present_table.find(static_cast<byte const*>(h_ptr));
+    std::lock_guard _{present_table_mutex};
+    auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
     assert(iter != present_table.end());
     assert(iter->second.first == len);
     present_table.erase(iter);

From d6d9c0c7e8949ea1be1ec9e9820cf07dac02e637 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 28 Jul 2022 17:46:01 +0200
Subject: [PATCH 063/128] submodule

---
 external/mod2c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/mod2c b/external/mod2c
index c37aff7d0..4f8df8877 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit c37aff7d0bdefbee83ffcb4f2e0d58e9ad806864
+Subproject commit 4f8df887736f24c4d59262984f62312bb7851363

From 1eabb5610b43cb42ccf2ade0135c79de0807fd05 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 2 Aug 2022 15:51:47 +0200
Subject: [PATCH 064/128] fix non-dynamic MPI + caliper

---
 coreneuron/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 168fdf708..319d6b13b 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -136,6 +136,7 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
   target_include_directories(
     ${CORENRN_MPI_LIB_NAME} PRIVATE ${MPI_INCLUDE_PATH} ${CORENEURON_PROJECT_SOURCE_DIR}
                                     ${CORENEURON_PROJECT_BINARY_DIR}/generated)
+  target_link_libraries(${CORENRN_MPI_LIB_NAME} ${CORENRN_CALIPER_LIB})
   set_property(TARGET ${CORENRN_MPI_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON)
   set(CORENRN_MPI_OBJ $<TARGET_OBJECTS:${CORENRN_MPI_LIB_NAME}>)
 endif()

From fb36084fcce9cc1439533961c118f42e684bf868 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 2 Aug 2022 16:30:57 +0200
Subject: [PATCH 065/128] clang-format

---
 coreneuron/mechanism/membfunc.hpp      | 3 ++-
 coreneuron/mechanism/register_mech.cpp | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 87927780c..2a7c8f54e 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -40,7 +40,8 @@ struct Memb_func {
     int thread_size_;                       /* how many Datum needed in Memb_list if vectorized */
     void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */
     void (*thread_cleanup_)(ThreadDatum*);  /* before Memb_list._thread is freed */
-    void (*thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int);
+    void (
+        *thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int);
     int is_point;
     void (*setdata_)(double*, Datum*);
     int* dparam_semantics; /* for nrncore writing. */
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 20fbcd424..4f545998a 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -417,8 +417,9 @@ void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)) {
     corenrn.get_memb_func(i).thread_mem_init_ = f;
 }
 
-void _nrn_thread_table_reg(int i,
-                           void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int)) {
+void _nrn_thread_table_reg(
+    int i,
+    void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int)) {
     if (i == -1)
         return;
 

From ca987194715ee6dd613c7accdf563c50206a921e Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 3 Aug 2022 09:48:54 +0200
Subject: [PATCH 066/128] libcoreneuron.so -> libcorenrnmech.so, try and fix
 static builds

---
 CMake/MakefileBuildOptions.cmake | 23 ++++++++++++++++++-----
 CMake/OpenAccHelper.cmake        | 29 +++++++++++++++++++----------
 CMake/coreneuron-config.cmake.in |  2 +-
 CMakeLists.txt                   |  2 +-
 coreneuron/CMakeLists.txt        | 12 ++++++------
 extra/nrnivmodl-core.in          |  2 +-
 extra/nrnivmodl_core_makefile.in |  6 +++---
 7 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 7fec40860..51785c0f9 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -29,10 +29,17 @@ set(NMODL_ISPC_BACKEND_ARGS "host --ispc")
 set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc")
 
 # =============================================================================
-# Construct the linker arguments that are used inside nrnivmodl-core (to build libcoreneuron from
+# Construct the linker arguments that are used inside nrnivmodl-core (to build libcorenrnmech from
 # libcoreneuron-core, libcoreneuron-cuda and mechanism object files) and inside nrnivmodl (to link
-# NEURON's special against CoreNEURON's libcoreneuron).
+# NEURON's special against CoreNEURON's libcorenrnmech).
 # =============================================================================
+if(NOT CORENRN_ENABLE_SHARED)
+  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--whole-archive")
+endif()
+set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -lcorenrnmech")
+if(NOT CORENRN_ENABLE_SHARED)
+  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--no-whole-archive")
+endif()
 # Essentially we "just" want to unpack the CMake dependencies of the `coreneuron-core` target into a
 # plain string that we can bake into the Makefiles in both NEURON and CoreNEURON.
 function(coreneuron_process_library_path library)
@@ -86,6 +93,11 @@ function(coreneuron_process_target target)
   coreneuron_process_library_path("${target}")
 endfunction()
 coreneuron_process_target(coreneuron-core)
+# In static builds then NEURON uses dlopen(nullptr, ...) to look for the corenrn_embedded_run
+# symbol, which comes from libcoreneuron-core.a and gets included in libcorenrnmech.
+if(NOT CORENRN_ENABLE_SHARED)
+  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -rdynamic")
+endif()
 get_property(CORENRN_EXTRA_COMPILE_FLAGS GLOBAL PROPERTY CORENRN_EXTRA_COMPILE_FLAGS)
 get_property(CORENRN_LIB_LINK_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS)
 
@@ -98,8 +110,9 @@ if(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
   set(CORENEURON_LINKER_START_GROUP -Wl,--start-group)
   set(CORENEURON_LINKER_END_GROUP -Wl,--end-group)
 endif()
-# Things that used to be in CORENRN_LIB_LINK_FLAGS: -rdynamic -lrt -Wl,--whole-archive
-# -L${CMAKE_HOST_SYSTEM_PROCESSOR} -Wl,--no-whole-archive -L${caliper_LIB_DIR} -l${CALIPER_LIB}
+
+# Things that used to be in CORENRN_LIB_LINK_FLAGS: -lrt -L${CMAKE_HOST_SYSTEM_PROCESSOR}
+# -L${caliper_LIB_DIR} -l${CALIPER_LIB}
 
 # =============================================================================
 # Turn CORENRN_COMPILE_DEFS into a list of -DFOO[=BAR] options.
@@ -107,7 +120,7 @@ endif()
 list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS)
 
 # =============================================================================
-# Extra link flags that we need to include when linking libcoreneuron.{a,so} in CoreNEURON but that
+# Extra link flags that we need to include when linking libcorenrnmech.{a,so} in CoreNEURON but that
 # do not need to be passed to NEURON to use when linking nrniv/special (why?)
 # =============================================================================
 string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_LIB_LINK_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index f50f1436a..f232d2bab 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -67,7 +67,14 @@ if(CORENRN_ENABLE_GPU)
   # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
   # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
   # code is compiled with the same CUDA version as the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
+  # Combining -gpu=lineinfo with -O0 -g gives a warning: Conflicting options --device-debug and
+  # --generate-line-info specified, ignoring --generate-line-info option
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    string(APPEND NVHPC_ACC_COMP_FLAGS ",debug")
+  else()
+    string(APPEND NVHPC_ACC_COMP_FLAGS ",lineinfo")
+  endif()
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
@@ -96,17 +103,19 @@ endif()
 # =============================================================================
 # Initialise global property that will be used by NEURON to link with CoreNEURON
 # =============================================================================
-if(CORENRN_ENABLE_GPU AND CORENRN_ENABLE_SHARED)
-  # Because of
-  # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
-  # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an
-  # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds.
-  # CORENRN_LIB_LINK_FLAGS is the full set of flags needed to link against libcoreneuron.so:
-  # something like `-acc -lcoreneuron ...`. CORENRN_NEURON_LINK_FLAGS only contains flags that need
+if(CORENRN_ENABLE_GPU)
+  # CORENRN_LIB_LINK_FLAGS is the full set of flags needed to link against libcorenrnmech.so:
+  # something like `-acc -lcorenrnmech ...`. CORENRN_NEURON_LINK_FLAGS only contains flags that need
   # to be used when linking the NEURON Python module to make sure it is able to dynamically load
-  # libcoreneuron.so.
+  # libcorenrnmech.so.
   set_property(GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
-  set_property(GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
+  # Because of
+  if(CORENRN_ENABLE_SHARED)
+    # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
+    # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an
+    # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds.
+    set_property(GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
+  endif()
 endif()
 
 if(CORENRN_HAVE_NVHPC_COMPILER)
diff --git a/CMake/coreneuron-config.cmake.in b/CMake/coreneuron-config.cmake.in
index 4fe3988c3..9f7ac4997 100644
--- a/CMake/coreneuron-config.cmake.in
+++ b/CMake/coreneuron-config.cmake.in
@@ -21,7 +21,7 @@ set(CORENRN_NEURON_LINK_FLAGS "@CORENRN_NEURON_LINK_FLAGS@")
 find_path(CORENEURON_INCLUDE_DIR "coreneuron/coreneuron.h" HINTS "${CONFIG_PATH}/../../include")
 find_path(
   CORENEURON_LIB_DIR
-  NAMES libcoreneuron.a libcoreneuron.so libcoreneuron.dylib
+  NAMES libcorenrnmech.a libcorenrnmech.so libcorenrnmech.dylib
   HINTS "${CONFIG_PATH}/../../lib")
 
 include(${CONFIG_PATH}/coreneuron.cmake)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3b1e7ee9..ab3de7345 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -479,7 +479,7 @@ add_subdirectory(coreneuron)
 
 # Extract the various compiler option strings to use inside nrnivmodl-core. Sets the global property
 # CORENRN_LIB_LINK_FLAGS, which contains the arguments that must be added to the link line for
-# `special` to link against `libcoreneuron.{a,so}`
+# `special` to link against `libcorenrnmech.{a,so}`
 include(MakefileBuildOptions)
 
 # Generate the nrnivmodl-core script and makefile using the options from MakefileBuildOptions
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 319d6b13b..0bc986ae6 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -145,17 +145,17 @@ endif()
 # installed as a static library, and then the nrnivmodl-core workflow extracts the object files from
 # it and does one of the following:
 #
-# * shared build: creates libcoreneuron.so from these objects plus those from the translated MOD
+# * shared build: creates libcorenrnmech.so from these objects plus those from the translated MOD
 #   files
-# * static build: creates a (temporary, does not get installed) libcoreneuron.a from these objects
+# * static build: creates a (temporary, does not get installed) libcorenrnmech.a from these objects
 #   plus those from the translated MOD files, then statically links that into special-core
 #   (nrniv-core)
 #
 # This scheme means that both core and mechanism .o files are linked in a single step, which is
 # important for GPU linking. It does, however, mean that the core code is installed twice, once in
-# libcoreneuron-core.a and once in libcoreneuron.so (shared) or nrniv-core (static). In a GPU build,
-# libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of the Hines solver.
-# This cannot be included in coreneuron-core because of this issue:
+# libcoreneuron-core.a and once in libcorenrnmech.so (shared) or nrniv-core (static). In a GPU
+# build, libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of the Hines
+# solver. This cannot be included in coreneuron-core because of this issue:
 # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
 add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
 if(CORENRN_ENABLE_GPU)
@@ -307,7 +307,7 @@ if(CORENRN_ENABLE_GPU)
 endif()
 
 # Create an extra target for internal use that unit tests and so on can depend on.
-# ${corenrn_mech_library} is libcoreneuron.{a,so}, which contains both the compiled default
+# ${corenrn_mech_library} is libcorenrnmech.{a,so}, which contains both the compiled default
 # mechanisms and the content of libcoreneuron-core.a.
 add_library(coreneuron-all INTERFACE)
 target_link_libraries(coreneuron-all INTERFACE "${corenrn_mech_library}")
diff --git a/extra/nrnivmodl-core.in b/extra/nrnivmodl-core.in
index 08804b159..742409d88 100755
--- a/extra/nrnivmodl-core.in
+++ b/extra/nrnivmodl-core.in
@@ -83,7 +83,7 @@ while getopts "n:m:a:d:i:l:Vp:r:b:h" OPT; do
         echo "  -r <0|1>                  Enable NRN_PRCELLSTATE mechanism. Default: @CORENRN_NRN_PRCELLSTATE@."
         echo "  -V                        Verbose: show commands executed by make"
         echo "  -p <n_procs>              Number of parallel builds (Default: $PARALLEL_BUILDS)"
-        echo "  -b <STATIC|SHARED>        libcoreneuron library type"
+        echo "  -b <STATIC|SHARED>        libcorenrnmech library type"
         exit 0;;
     ?)
         exit 1;;
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 2c732a288..2804a297f 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -99,7 +99,7 @@ ifeq (@CORENRN_ENABLE_NMODL@, ON)
 endif
 
 # name of the mechanism library with suffix if provided
-COREMECH_LIB_NAME = coreneuron$(if $(MECHLIB_SUFFIX),_$(MECHLIB_SUFFIX),)
+COREMECH_LIB_NAME = corenrnmech$(if $(MECHLIB_SUFFIX),_$(MECHLIB_SUFFIX),)
 COREMECH_LIB_PATH = $(OUTPUT_DIR)/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX)
 
 # Various header and C++/Object file
@@ -206,7 +206,7 @@ $(SPECIAL_EXE): $(corenrnmech_lib_target)
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(LDFLAGS) \
+	  -L$(OUTPUT_DIR) $(LDFLAGS) \
 	  -L$(CORENRN_LIB_DIR) \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 
@@ -230,7 +230,7 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 
 # build static library of mechanisms
 coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
-	# make a libcoreneuron.a by copying libcoreneuron-core.a and then appending
+	# make a libcorenrnmech.a by copying libcoreneuron-core.a and then appending
 	# the newly compiled objects
 	cp $(CORENRN_LIB_DIR)/libcoreneuron-core.a ${COREMECH_LIB_PATH}
 	ar r ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS)

From f33d56ef3d4d14dea588919f6ab6fe74515a9329 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 3 Aug 2022 10:37:23 +0200
Subject: [PATCH 067/128] fixups

---
 CMake/MakefileBuildOptions.cmake | 17 +++++++++++------
 coreneuron/CMakeLists.txt        |  2 +-
 extra/nrnivmodl_core_makefile.in |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 51785c0f9..77bd4c9a3 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -31,7 +31,10 @@ set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc")
 # =============================================================================
 # Construct the linker arguments that are used inside nrnivmodl-core (to build libcorenrnmech from
 # libcoreneuron-core, libcoreneuron-cuda and mechanism object files) and inside nrnivmodl (to link
-# NEURON's special against CoreNEURON's libcorenrnmech).
+# NEURON's special against CoreNEURON's libcorenrnmech). These are stored in two global properties:
+# CORENRN_LIB_LINK_FLAGS (used by NEURON/nrnivmodl to link special against CoreNEURON) and
+# CORENRN_LIB_LINK_DEP_FLAGS (used by CoreNEURON/nrnivmodl-core to link libcorenrnmech.so).
+# Conceptually: CORENRN_LIB_LINK_FLAGS = -lcorenrnmech $CORENRN_LIB_LINK_DEP_FLAGS
 # =============================================================================
 if(NOT CORENRN_ENABLE_SHARED)
   set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--whole-archive")
@@ -46,15 +49,15 @@ function(coreneuron_process_library_path library)
   get_filename_component(library_dir "${library}" DIRECTORY)
   if(NOT library_dir)
     # In case target is not a target but is just the name of a library, e.g. "dl"
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${library}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${library}")
   elseif("${library_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
     # e.g. /usr/lib64/libpthread.so -> -lpthread
     get_filename_component(libname ${library} NAME_WE)
     string(REGEX REPLACE "^lib" "" libname ${libname})
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${libname}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${libname}")
   else()
     # It's a full path, include that on the line
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${library}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " ${library}")
   endif()
 endfunction()
 function(coreneuron_process_target target)
@@ -79,7 +82,7 @@ function(coreneuron_process_target target)
       else()
         # This is probably another of our libraries, like -lcoreneuron-cuda. We might need to add -L
         # and an RPATH later.
-        set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -l${target}")
+        set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${target}")
       endif()
     endif()
     get_target_property(target_libraries ${target} LINK_LIBRARIES)
@@ -93,6 +96,8 @@ function(coreneuron_process_target target)
   coreneuron_process_library_path("${target}")
 endfunction()
 coreneuron_process_target(coreneuron-core)
+get_property(CORENRN_LIB_LINK_DEP_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_DEP_FLAGS)
+set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${CORENRN_LIB_LINK_DEP_FLAGS}")
 # In static builds then NEURON uses dlopen(nullptr, ...) to look for the corenrn_embedded_run
 # symbol, which comes from libcoreneuron-core.a and gets included in libcorenrnmech.
 if(NOT CORENRN_ENABLE_SHARED)
@@ -123,7 +128,7 @@ list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_D
 # Extra link flags that we need to include when linking libcorenrnmech.{a,so} in CoreNEURON but that
 # do not need to be passed to NEURON to use when linking nrniv/special (why?)
 # =============================================================================
-string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_LIB_LINK_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
+string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_LIB_LINK_DEP_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
 if(CORENRN_SANITIZER_LIBRARY_DIR)
   string(APPEND CORENRN_COMMON_LDFLAGS " -Wl,-rpath,${CORENRN_SANITIZER_LIBRARY_DIR}")
 endif()
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 0bc986ae6..57af85b6e 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -288,7 +288,7 @@ file(GLOB modfiles "${modfile_directory}/*.mod")
 # symbols in the translated versions of default .mod files
 set(nrniv_core_prefix "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}")
 set(corenrn_mech_library
-    "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}coreneuron${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}"
+    "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}corenrnmech${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}"
 )
 set(output_binaries "${nrniv_core_prefix}/special-core" "${corenrn_mech_library}")
 
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 2804a297f..4d7df0388 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -206,7 +206,7 @@ $(SPECIAL_EXE): $(corenrnmech_lib_target)
 	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
 	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  -L$(OUTPUT_DIR) $(LDFLAGS) \
+	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(LDFLAGS) \
 	  -L$(CORENRN_LIB_DIR) \
 	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'
 

From b2e7f30295958c973ad5676a0c52e634ac5800da Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 3 Aug 2022 15:49:21 +0200
Subject: [PATCH 068/128] typo

---
 coreneuron/utils/memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index 286cfa5f2..9e612680c 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -26,7 +26,7 @@ namespace coreneuron {
  * @brief Check if GPU support is enabled.
  *
  * This returns true if GPU support was enabled at compile time and at runtime
- * via coreneuron.gpu = True and/or --gpu, otherwise it returnss false.
+ * via coreneuron.gpu = True and/or --gpu, otherwise it returns false.
  */
 bool gpu_enabled();
 

From d3545ff005d87c72d7569d467777e3dcb63ab1eb Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 3 Aug 2022 15:56:23 +0200
Subject: [PATCH 069/128] add fallback logic for cmake<3.18

---
 CMake/MakefileBuildOptions.cmake | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index 77bd4c9a3..b6f872115 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -109,8 +109,13 @@ get_property(CORENRN_LIB_LINK_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS)
 # Detect if --start-group and --end-group are valid linker arguments. These are typically needed
 # when linking mutually-dependent .o files (or where we don't know the correct order) on Linux, but
 # they are not needed *or* recognised by the macOS linker.
-include(CheckLinkerFlag) # requires CMake 3.18
-check_linker_flag(CXX -Wl,--start-group CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  include(CheckLinkerFlag)
+  check_linker_flag(CXX -Wl,--start-group CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
+elseif(CMAKE_SYSTEM_NAME MATCHES Linux)
+  # Assume that --start-group and --end-group are only supported on Linux
+  set(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP ON)
+endif()
 if(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
   set(CORENEURON_LINKER_START_GROUP -Wl,--start-group)
   set(CORENEURON_LINKER_END_GROUP -Wl,--end-group)

From 8db9162792c5269bfa60bd178518b11dece12683 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 4 Aug 2022 13:09:23 +0200
Subject: [PATCH 070/128] Do not enable OpenMP in shared/OpenACC builds.

---
 .gitlab-ci.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e867b7eb1..3ba880c40 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -106,15 +106,17 @@ build:coreneuron:mod2c:nvhpc:acc:debug:
   variables:
     SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=Debug
 
+# Shared + OpenACC + OpenMP host threading has problems
 build:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu~openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
 
+# Shared + OpenACC + OpenMP host threading has problems
 build:coreneuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp+shared+tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+gpu~openmp+shared+tests~legacy-unit build_type=Debug
 
 # Build CoreNEURON with Unified Memory on GPU
 build:coreneuron:mod2c:nvhpc:acc:unified:

From fc465940f2735c7a1db34ace159b16e9667fefef Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 4 Aug 2022 13:30:38 +0200
Subject: [PATCH 071/128] Add rpaths inside nrnivmodl-core.

---
 CMake/MakefileBuildOptions.cmake | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index b6f872115..e4c658349 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -51,13 +51,16 @@ function(coreneuron_process_library_path library)
     # In case target is not a target but is just the name of a library, e.g. "dl"
     set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${library}")
   elseif("${library_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
-    # e.g. /usr/lib64/libpthread.so -> -lpthread
+    # e.g. /usr/lib64/libpthread.so -> -lpthread TODO: consider using
+    # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_IMPLICIT_LINK_DIRECTORIES.html, or
+    # dropping this special case entirely
     get_filename_component(libname ${library} NAME_WE)
     string(REGEX REPLACE "^lib" "" libname ${libname})
     set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${libname}")
   else()
     # It's a full path, include that on the line
-    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " ${library}")
+    set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS
+                                               " -Wl,-rpath,${library_dir} ${library}")
   endif()
 endfunction()
 function(coreneuron_process_target target)

From 9c96a36e4d68241c00908acbdc6f89c61aea00b4 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 4 Aug 2022 16:21:02 +0200
Subject: [PATCH 072/128] accept a private destructor function pointer from
 generated mechanisms

---
 coreneuron/gpu/nrn_acc_manager.cpp     |  4 ----
 coreneuron/io/nrn_setup.cpp            |  9 ++++-----
 coreneuron/mechanism/capac.cpp         | 11 ++++++-----
 coreneuron/mechanism/eion.cpp          |  9 +++++----
 coreneuron/mechanism/mechanism.hpp     |  8 +++++++-
 coreneuron/mechanism/membfunc.hpp      | 10 ++++++++--
 coreneuron/mechanism/register_mech.cpp | 10 +++++++---
 external/mod2c                         |  2 +-
 8 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 82ef53e0f..4c5f28b67 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -312,10 +312,6 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
         int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
         cnrn_target_delete(ml->pdata, pcnt);
     }
-    if (ml->global_variables) {
-        // std::byte* in C++17
-        cnrn_target_delete(static_cast<char*>(ml->global_variables), ml->global_variables_size);
-    }
 
     cnrn_target_delete(ml->nodeindices, n);
     cnrn_target_delete(ml);
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index f34a489c1..a7ee2fdf8 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -761,11 +761,10 @@ void nrn_cleanup() {
                 ml->instance = nullptr;
             }
 
-            if (ml->global_variables) {
-                std::cout << "Cannot generically free Memb_list::global_variables, leaking it"
-                          << std::endl;
-                // free(ml->global_variables);
-                ml->global_variables = nullptr;
+            // Destroy the global variables struct allocated in nrn_init
+            if (auto* const priv_dtor = corenrn.get_memb_func(tml->index).private_destructor) {
+                (*priv_dtor)(nt, ml, tml->index);
+                assert(!ml->global_variables);
             }
 
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp
index 42c65cb18..5333767c5 100644
--- a/coreneuron/mechanism/capac.cpp
+++ b/coreneuron/mechanism/capac.cpp
@@ -32,12 +32,13 @@ void capacitance_reg(void) {
     /* all methods deal with capacitance in special ways */
     register_mech(mechanism,
                   nrn_alloc_capacitance,
-                  (mod_f_t) 0,
-                  (mod_f_t) 0,
-                  (mod_f_t) 0,
-                  (mod_f_t) nrn_init_capacitance,
+                  nullptr,
+                  nullptr,
+                  nullptr,
+                  nrn_init_capacitance,
                   -1,
-                  1);
+                  1,
+                  nullptr);
     int mechtype = nrn_get_mechtype(mechanism[1]);
     _nrn_layout_reg(mechtype, SOA_LAYOUT);
     hoc_register_prop_size(mechtype, nparm, 0);
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index deab46627..350b4ff90 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -94,11 +94,12 @@ void ion_reg(const char* name, double valence) {
         register_mech((const char**) mechanism,
                       nrn_alloc_ion,
                       nrn_cur_ion,
-                      (mod_f_t) 0,
-                      (mod_f_t) 0,
-                      (mod_f_t) nrn_init_ion,
+                      nullptr,
+                      nullptr,
+                      nrn_init_ion,
                       -1,
-                      1);
+                      1,
+                      nullptr);
         mechtype = nrn_get_mechtype(mechanism[1]);
         _nrn_layout_reg(mechtype, SOA_LAYOUT);
         hoc_register_prop_size(mechtype, nparm, 1);
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index 1c177976c..caa895a0a 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -143,8 +143,14 @@ struct Memb_list {
     NetSendBuffer_t* _net_send_buffer = nullptr;
     int nodecount; /* actual node count */
     int _nodecount_padded;
+    // Not obvious that these need to be distinct (i.e. we could just have
+    // `instance` and `instance_size`, and use them in mod2c for global
+    // variables while NMODL could use the existing instance struct for globals
+    // too). nrn_acc_manager.cpp could handle data movement to/from the
+    // accelerator if the "constructor" in the translated MOD file code was
+    // called before the main nrn_acc_manager methods that copy
+    // thread/mechanism data to the device.
     void* instance = nullptr;         /* mechanism instance struct from NMODL */
     void* global_variables = nullptr; /* global variables struct for each mechanism */
-    int global_variables_size = 0;    /* size of global variables struct in bytes */
 };
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 2a7c8f54e..64b9443c5 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -35,6 +35,10 @@ struct Memb_func {
     mod_f_t initialize;
     mod_f_t constructor;
     mod_f_t destructor; /* only for point processes */
+    // This is used for CoreNEURON-internal cleanup; it is kept separate from
+    // the DESTRUCTOR function just above (which apparently is only for point
+    // processes) for simplicity;
+    mod_f_t private_destructor;
     Symbol* sym;
     int vectorized;
     int thread_size_;                       /* how many Datum needed in Memb_list if vectorized */
@@ -91,7 +95,8 @@ extern int register_mech(const char** m,
                          mod_f_t stat,
                          mod_f_t initialize,
                          int nrnpointerindex,
-                         int vectorized);
+                         int vectorized,
+                         mod_f_t private_destructor);
 extern int point_register_mech(const char**,
                                mod_alloc_t alloc,
                                mod_f_t cur,
@@ -101,7 +106,8 @@ extern int point_register_mech(const char**,
                                int nrnpointerindex,
                                mod_f_t constructor,
                                mod_f_t destructor,
-                               int vectorized);
+                               int vectorized,
+                               mod_f_t private_destructor);
 extern void register_constructor(mod_f_t constructor);
 using NetBufReceive_t = void (*)(NrnThread*);
 extern void hoc_register_net_receive_buffering(NetBufReceive_t, int);
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 4f545998a..44d4b5f8f 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -118,7 +118,8 @@ int register_mech(const char** m,
                   mod_f_t stat,
                   mod_f_t initialize,
                   int /* nrnpointerindex */,
-                  int vectorized) {
+                  int vectorized,
+                  mod_f_t private_destructor) {
     auto& memb_func = corenrn.get_memb_funcs();
 
     int type = nrn_get_mechtype(m[1]);
@@ -144,6 +145,7 @@ int register_mech(const char** m,
     memb_func[type].initialize = initialize;
     memb_func[type].constructor = nullptr;
     memb_func[type].destructor = nullptr;
+    memb_func[type].private_destructor = private_destructor;
 #if VECTORIZE
     memb_func[type].vectorized = vectorized ? 1 : 0;
     memb_func[type].thread_size_ = vectorized ? (vectorized - 1) : 0;
@@ -343,9 +345,11 @@ int point_register_mech(const char** m,
                         int nrnpointerindex,
                         mod_f_t constructor,
                         mod_f_t destructor,
-                        int vectorized) {
+                        int vectorized,
+                        mod_f_t private_destructor) {
     const Symbol* s = m[1];
-    register_mech(m, alloc, cur, jacob, stat, initialize, nrnpointerindex, vectorized);
+    register_mech(
+        m, alloc, cur, jacob, stat, initialize, nrnpointerindex, vectorized, private_destructor);
     register_constructor(constructor);
     register_destructor(destructor);
     return point_reg_helper(s);
diff --git a/external/mod2c b/external/mod2c
index 4f8df8877..eb9a42f79 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 4f8df887736f24c4d59262984f62312bb7851363
+Subproject commit eb9a42f79d4a9ad82b06f5a080e1927a3f7c5f9f

From 37b9291dcc0702d7074d9288420def78547e5277 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 4 Aug 2022 18:05:52 +0200
Subject: [PATCH 073/128] Support private constructor.

- Private in the sense that it is for CoreNEURON-internal usage (in
  collaboration with mod2c/nmodl) and not visible to MOD files.
- Storage for global variables is now allocated there, so it is already
  visible when TABLE statements are initialised (before nrn_init is
  called for the first time).
- This means that the global variable structure is copied to/from the
  device in nrn_acc_manager.cpp, just like the other members of
  Memb_list.
---
 coreneuron/gpu/nrn_acc_manager.cpp     | 14 ++++++++++++++
 coreneuron/io/phase2.cpp               |  3 ++-
 coreneuron/mechanism/capac.cpp         |  1 +
 coreneuron/mechanism/eion.cpp          |  1 +
 coreneuron/mechanism/mechanism.hpp     |  1 +
 coreneuron/mechanism/membfunc.hpp      | 14 +++++++++-----
 coreneuron/mechanism/patternstim.cpp   |  8 ++++++--
 coreneuron/mechanism/register_mech.cpp | 19 ++++++++++++++-----
 coreneuron/sim/multicore.cpp           |  7 ++++++-
 coreneuron/sim/multicore.hpp           |  3 ++-
 external/mod2c                         |  2 +-
 11 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 4c5f28b67..9501a758d 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -141,6 +141,13 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type, double* dml_d
 
     auto d_ml = cnrn_target_copyin(ml);
 
+    if (ml->global_variables) {
+        assert(ml->global_variables_size);
+        void* d_glob_vars = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
+                                               ml->global_variables_size);
+        cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_glob_vars);
+    }
+
     int n = ml->nodecount;
     int szp = corenrn.get_prop_param_size()[type];
     int szdp = corenrn.get_prop_dparam_size()[type];
@@ -314,6 +321,13 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
     }
 
     cnrn_target_delete(ml->nodeindices, n);
+
+    if (ml->global_variables) {
+        assert(ml->global_variables_size);
+        cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
+                           ml->global_variables_size);
+    }
+
     cnrn_target_delete(ml);
 }
 
diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index 0b96e1956..ad5748ad7 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -959,7 +959,8 @@ void Phase2::populate(NrnThread& nt, const UserParams& userParams) {
 
     NrnThreadMembList* tml_last = nullptr;
     for (int i = 0; i < n_mech; ++i) {
-        auto tml = create_tml(i, memb_func[mech_types[i]], shadow_rhs_cnt, mech_types, nodecounts);
+        auto tml =
+            create_tml(nt, i, memb_func[mech_types[i]], shadow_rhs_cnt, mech_types, nodecounts);
 
         nt._ml_list[tml->index] = tml->ml;
 
diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp
index 5333767c5..2ffabb4e9 100644
--- a/coreneuron/mechanism/capac.cpp
+++ b/coreneuron/mechanism/capac.cpp
@@ -38,6 +38,7 @@ void capacitance_reg(void) {
                   nrn_init_capacitance,
                   -1,
                   1,
+                  nullptr,
                   nullptr);
     int mechtype = nrn_get_mechtype(mechanism[1]);
     _nrn_layout_reg(mechtype, SOA_LAYOUT);
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 350b4ff90..9f442e12a 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -99,6 +99,7 @@ void ion_reg(const char* name, double valence) {
                       nrn_init_ion,
                       -1,
                       1,
+                      nullptr,
                       nullptr);
         mechtype = nrn_get_mechtype(mechanism[1]);
         _nrn_layout_reg(mechtype, SOA_LAYOUT);
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index caa895a0a..d82729a1d 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -152,5 +152,6 @@ struct Memb_list {
     // thread/mechanism data to the device.
     void* instance = nullptr;         /* mechanism instance struct from NMODL */
     void* global_variables = nullptr; /* global variables struct for each mechanism */
+    std::size_t global_variables_size{};
 };
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 64b9443c5..5380e16b6 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -23,6 +23,8 @@ struct NrnThread;
 using mod_alloc_t = void (*)(double*, Datum*, int);
 using mod_f_t = void (*)(NrnThread*, Memb_list*, int);
 using pnt_receive_t = void (*)(Point_process*, int, double);
+using thread_table_check_t =
+    void (*)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int);
 
 /*
  * Memb_func structure contains all related informations of a mechanism
@@ -35,17 +37,17 @@ struct Memb_func {
     mod_f_t initialize;
     mod_f_t constructor;
     mod_f_t destructor; /* only for point processes */
-    // This is used for CoreNEURON-internal cleanup; it is kept separate from
-    // the DESTRUCTOR function just above (which apparently is only for point
-    // processes) for simplicity;
+    // These are used for CoreNEURON-internal allocation/cleanup; they are kept
+    // separate from the CONSTRUCTOR/DESTRUCTOR functions just above (one of
+    // which is apparently only for point processes) for simplicity.
+    mod_f_t private_constructor;
     mod_f_t private_destructor;
     Symbol* sym;
     int vectorized;
     int thread_size_;                       /* how many Datum needed in Memb_list if vectorized */
     void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */
     void (*thread_cleanup_)(ThreadDatum*);  /* before Memb_list._thread is freed */
-    void (
-        *thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int);
+    thread_table_check_t thread_table_check_;
     int is_point;
     void (*setdata_)(double*, Datum*);
     int* dparam_semantics; /* for nrncore writing. */
@@ -96,6 +98,7 @@ extern int register_mech(const char** m,
                          mod_f_t initialize,
                          int nrnpointerindex,
                          int vectorized,
+                         mod_f_t private_constructor,
                          mod_f_t private_destructor);
 extern int point_register_mech(const char**,
                                mod_alloc_t alloc,
@@ -107,6 +110,7 @@ extern int point_register_mech(const char**,
                                mod_f_t constructor,
                                mod_f_t destructor,
                                int vectorized,
+                               mod_f_t private_constructor,
                                mod_f_t private_destructor);
 extern void register_constructor(mod_f_t constructor);
 using NetBufReceive_t = void (*)(NrnThread*);
diff --git a/coreneuron/mechanism/patternstim.cpp b/coreneuron/mechanism/patternstim.cpp
index 4f5e4e4e6..e680a6187 100644
--- a/coreneuron/mechanism/patternstim.cpp
+++ b/coreneuron/mechanism/patternstim.cpp
@@ -137,7 +137,7 @@ size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double t
 }
 
 // see nrn_setup.cpp:read_phase2 for how it creates NrnThreadMembList instances.
-static NrnThreadMembList* alloc_nrn_thread_memb(int type) {
+static NrnThreadMembList* alloc_nrn_thread_memb(NrnThread* nt, int type) {
     NrnThreadMembList* tml = (NrnThreadMembList*) ecalloc(1, sizeof(NrnThreadMembList));
     tml->dependencies = nullptr;
     tml->ndependencies = 0;
@@ -161,6 +161,10 @@ static NrnThreadMembList* alloc_nrn_thread_memb(int type) {
     tml->ml->_net_send_buffer = nullptr;
     tml->ml->_permute = nullptr;
 
+    if (auto* const priv_ctor = corenrn.get_memb_func(tml->index).private_constructor) {
+        priv_ctor(nt, tml->ml, tml->index);
+    }
+
     return tml;
 }
 
@@ -178,7 +182,7 @@ Point_process* nrn_artcell_instantiate(const char* mechname) {
     // printf("nrn_artcell_instantiate %s type=%d\n", mechname, type);
 
     // create and append to nt.tml
-    auto tml = alloc_nrn_thread_memb(type);
+    auto tml = alloc_nrn_thread_memb(nt, type);
 
     assert(nt->_ml_list[type] == nullptr);  // FIXME
     nt->_ml_list[type] = tml->ml;
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 44d4b5f8f..41ed41a29 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -119,6 +119,7 @@ int register_mech(const char** m,
                   mod_f_t initialize,
                   int /* nrnpointerindex */,
                   int vectorized,
+                  mod_f_t private_constructor,
                   mod_f_t private_destructor) {
     auto& memb_func = corenrn.get_memb_funcs();
 
@@ -145,6 +146,7 @@ int register_mech(const char** m,
     memb_func[type].initialize = initialize;
     memb_func[type].constructor = nullptr;
     memb_func[type].destructor = nullptr;
+    memb_func[type].private_constructor = private_constructor;
     memb_func[type].private_destructor = private_destructor;
 #if VECTORIZE
     memb_func[type].vectorized = vectorized ? 1 : 0;
@@ -346,10 +348,19 @@ int point_register_mech(const char** m,
                         mod_f_t constructor,
                         mod_f_t destructor,
                         int vectorized,
+                        mod_f_t private_constructor,
                         mod_f_t private_destructor) {
     const Symbol* s = m[1];
-    register_mech(
-        m, alloc, cur, jacob, stat, initialize, nrnpointerindex, vectorized, private_destructor);
+    register_mech(m,
+                  alloc,
+                  cur,
+                  jacob,
+                  stat,
+                  initialize,
+                  nrnpointerindex,
+                  vectorized,
+                  private_constructor,
+                  private_destructor);
     register_constructor(constructor);
     register_destructor(destructor);
     return point_reg_helper(s);
@@ -421,9 +432,7 @@ void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)) {
     corenrn.get_memb_func(i).thread_mem_init_ = f;
 }
 
-void _nrn_thread_table_reg(
-    int i,
-    void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int)) {
+void _nrn_thread_table_reg(int i, thread_table_check_t f) {
     if (i == -1)
         return;
 
diff --git a/coreneuron/sim/multicore.cpp b/coreneuron/sim/multicore.cpp
index cf8daaac8..b8dd293d2 100644
--- a/coreneuron/sim/multicore.cpp
+++ b/coreneuron/sim/multicore.cpp
@@ -61,7 +61,8 @@ static int table_check_cnt_;
 static ThreadDatum* table_check_;
 
 
-NrnThreadMembList* create_tml(int mech_id,
+NrnThreadMembList* create_tml(NrnThread& nt,
+                              int mech_id,
                               Memb_func& memb_func,
                               int& shadow_rhs_cnt,
                               const std::vector<int>& mech_types,
@@ -91,6 +92,10 @@ NrnThreadMembList* create_tml(int mech_id,
         }
     }
 
+    if (auto* const priv_ctor = corenrn.get_memb_func(tml->index).private_constructor) {
+        priv_ctor(&nt, tml->ml, tml->index);
+    }
+
     return tml;
 }
 
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index 391b5dcaa..3e06e3585 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -36,7 +36,8 @@ struct NrnThreadMembList { /* patterned after CvMembList in cvodeobj.h */
     int* dependencies; /* list of mechanism types that this mechanism depends on*/
     int ndependencies; /* for scheduling we need to know the dependency count */
 };
-NrnThreadMembList* create_tml(int mech_id,
+NrnThreadMembList* create_tml(NrnThread& nt,
+                              int mech_id,
                               Memb_func& memb_func,
                               int& shadow_rhs_cnt,
                               const std::vector<int>& mech_types,
diff --git a/external/mod2c b/external/mod2c
index eb9a42f79..f4080b2ce 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit eb9a42f79d4a9ad82b06f5a080e1927a3f7c5f9f
+Subproject commit f4080b2cefff571cc455c0e7f3efe34a034c0918

From a13f67bbcfdca4fc4ddab3a23b4402e128c583e5 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 4 Aug 2022 22:27:18 +0200
Subject: [PATCH 074/128] try and fix table statements

---
 external/mod2c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/mod2c b/external/mod2c
index f4080b2ce..2603ada87 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit f4080b2cefff571cc455c0e7f3efe34a034c0918
+Subproject commit 2603ada879eaa9937b955807e65f3d02500c6a09

From 8124239373924db3ba423d30859a5e1dd3bcbff5 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 5 Aug 2022 08:06:10 +0200
Subject: [PATCH 075/128] submodule

---
 external/mod2c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/mod2c b/external/mod2c
index 2603ada87..bc5f9d696 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 2603ada879eaa9937b955807e65f3d02500c6a09
+Subproject commit bc5f9d6962ebcf2b40d64ae8d6b95a829f40f517

From cd499d73d1b98ce3a0d83d81f52cb8b492daf910 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 5 Aug 2022 09:52:38 +0200
Subject: [PATCH 076/128] reduce diff

---
 coreneuron/gpu/nrn_acc_manager.cpp       | 18 +++++-------------
 coreneuron/mechanism/mech/enginemech.cpp |  2 +-
 coreneuron/permute/cellorder.cu          |  2 +-
 coreneuron/utils/randoms/nrnran123.cpp   |  4 ----
 tests/unit/solver/CMakeLists.txt         |  2 +-
 5 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 9501a758d..e8dcb6ae0 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -132,7 +132,7 @@ void cnrn_target_set_default_device(int device_num) {
 
 #ifdef CORENEURON_ENABLE_GPU
 #ifndef CORENEURON_UNIFIED_MEMORY
-static Memb_list* copy_ml_to_device(const Memb_list* ml, int type, double* dml_data) {
+static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
     // As we never run code for artificial cell inside GPU we don't copy it.
     int is_art = corenrn.get_is_artificial()[type];
     if (is_art) {
@@ -152,10 +152,10 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type, double* dml_d
     int szp = corenrn.get_prop_param_size()[type];
     int szdp = corenrn.get_prop_dparam_size()[type];
 
-    double* dptr = dml_data;
-
+    double* dptr = cnrn_target_deviceptr(ml->data);
     cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));
 
+
     int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n);
     cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices);
 
@@ -319,7 +319,6 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
         int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
         cnrn_target_delete(ml->pdata, pcnt);
     }
-
     cnrn_target_delete(ml->nodeindices, n);
 
     if (ml->global_variables) {
@@ -396,6 +395,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         /*copy all double data for thread */
         d__data = cnrn_target_copyin(nt->_data, nt->_ndata);
 
+
         /* Here is the example of using OpenACC data enter/exit
          * Remember that we are not allowed to use nt->_data but we have to use:
          *      double *dtmp = nt->_data;  // now use dtmp!
@@ -465,17 +465,9 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             // book keeping for linked-list
             d_last_tml = d_tml;
 
-            // TODO: acc_deviceptr is returning host pointer when
-            // coreneuron is launched via python instead of special
-            //      see: https://github.com/BlueBrain/CoreNeuron/issues/141#issuecomment-1086746848
-            // As ml->data is always within nt->_data, temporarily calculate
-            // device pointer of ml->data on using offset.
-            double* dml_data = d__data + (tml->ml->data - nt->_data);
-
             /* now for every tml, there is a ml. copy that and setup pointer */
-            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index, dml_data);
+            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
             cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml);
-
             /* setup nt._ml_list */
             cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml);
         }
diff --git a/coreneuron/mechanism/mech/enginemech.cpp b/coreneuron/mechanism/mech/enginemech.cpp
index ee9cc9e28..2c20d1293 100644
--- a/coreneuron/mechanism/mech/enginemech.cpp
+++ b/coreneuron/mechanism/mech/enginemech.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu
index ed8975148..1f1bdff94 100644
--- a/coreneuron/permute/cellorder.cu
+++ b/coreneuron/permute/cellorder.cu
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index f2bfed11a..af1378044 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -16,10 +16,6 @@
 #include <unordered_map>
 #endif
 
-#ifdef __CUDACC__
-#include <nv/target>
-#endif
-
 #include <cmath>
 #include <iostream>
 #include <memory>
diff --git a/tests/unit/solver/CMakeLists.txt b/tests/unit/solver/CMakeLists.txt
index 01e058525..f8bc52287 100644
--- a/tests/unit/solver/CMakeLists.txt
+++ b/tests/unit/solver/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022 Blue Brain Project
+# Copyright (c) 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================

From 341e89c3d9a8d737062ec97b675019ae96e780c3 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 5 Aug 2022 19:22:46 +0200
Subject: [PATCH 077/128] nmodl attempt

---
 coreneuron/gpu/nrn_acc_manager.cpp     | 14 ++++++++++++++
 coreneuron/mechanism/capac.cpp         |  6 +++---
 coreneuron/mechanism/eion.cpp          |  6 +++---
 coreneuron/mechanism/mechanism.hpp     |  1 +
 coreneuron/mechanism/membfunc.hpp      | 13 +++++++------
 coreneuron/mechanism/register_mech.cpp | 18 +++++++++---------
 external/mod2c                         |  2 +-
 external/nmodl                         |  2 +-
 8 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index e8dcb6ae0..7df4155fb 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -141,6 +141,14 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
 
     auto d_ml = cnrn_target_copyin(ml);
 
+    if (ml->instance) {
+        assert(ml->instance_size);
+        void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->instance),
+                                               ml->instance_size);
+        cnrn_target_memcpy_to_device(&(d_ml->instance), &d_inst);
+    }
+
+
     if (ml->global_variables) {
         assert(ml->global_variables_size);
         void* d_glob_vars = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
@@ -327,6 +335,12 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
                            ml->global_variables_size);
     }
 
+    if (ml->instance) {
+        assert(ml->instance_size);
+        cnrn_target_delete(static_cast<std::byte*>(ml->instance),
+                           ml->instance_size);
+    }
+
     cnrn_target_delete(ml);
 }
 
diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp
index 2ffabb4e9..f47a4ebd7 100644
--- a/coreneuron/mechanism/capac.cpp
+++ b/coreneuron/mechanism/capac.cpp
@@ -36,10 +36,10 @@ void capacitance_reg(void) {
                   nullptr,
                   nullptr,
                   nrn_init_capacitance,
-                  -1,
-                  1,
                   nullptr,
-                  nullptr);
+                  nullptr,
+                  -1,
+                  1);
     int mechtype = nrn_get_mechtype(mechanism[1]);
     _nrn_layout_reg(mechtype, SOA_LAYOUT);
     hoc_register_prop_size(mechtype, nparm, 0);
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 9f442e12a..ec1fd665e 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -97,10 +97,10 @@ void ion_reg(const char* name, double valence) {
                       nullptr,
                       nullptr,
                       nrn_init_ion,
-                      -1,
-                      1,
                       nullptr,
-                      nullptr);
+                      nullptr,
+                      -1,
+                      1);
         mechtype = nrn_get_mechtype(mechanism[1]);
         _nrn_layout_reg(mechtype, SOA_LAYOUT);
         hoc_register_prop_size(mechtype, nparm, 1);
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index d82729a1d..f8efd643b 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -151,6 +151,7 @@ struct Memb_list {
     // called before the main nrn_acc_manager methods that copy
     // thread/mechanism data to the device.
     void* instance = nullptr;         /* mechanism instance struct from NMODL */
+    std::size_t instance_size{};
     void* global_variables = nullptr; /* global variables struct for each mechanism */
     std::size_t global_variables_size{};
 };
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 5380e16b6..7602a8218 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -96,22 +96,23 @@ extern int register_mech(const char** m,
                          mod_f_t jacob,
                          mod_f_t stat,
                          mod_f_t initialize,
-                         int nrnpointerindex,
-                         int vectorized,
                          mod_f_t private_constructor,
-                         mod_f_t private_destructor);
+                         mod_f_t private_destructor,
+                         int nrnpointerindex,
+                         int vectorized
+);
 extern int point_register_mech(const char**,
                                mod_alloc_t alloc,
                                mod_f_t cur,
                                mod_f_t jacob,
                                mod_f_t stat,
                                mod_f_t initialize,
+                               mod_f_t private_constructor,
+                               mod_f_t private_destructor,
                                int nrnpointerindex,
                                mod_f_t constructor,
                                mod_f_t destructor,
-                               int vectorized,
-                               mod_f_t private_constructor,
-                               mod_f_t private_destructor);
+                               int vectorized);
 extern void register_constructor(mod_f_t constructor);
 using NetBufReceive_t = void (*)(NrnThread*);
 extern void hoc_register_net_receive_buffering(NetBufReceive_t, int);
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 41ed41a29..01a71b5bc 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -117,10 +117,10 @@ int register_mech(const char** m,
                   mod_f_t jacob,
                   mod_f_t stat,
                   mod_f_t initialize,
-                  int /* nrnpointerindex */,
-                  int vectorized,
                   mod_f_t private_constructor,
-                  mod_f_t private_destructor) {
+                  mod_f_t private_destructor,
+                  int /* nrnpointerindex */,
+                  int vectorized) {
     auto& memb_func = corenrn.get_memb_funcs();
 
     int type = nrn_get_mechtype(m[1]);
@@ -344,12 +344,12 @@ int point_register_mech(const char** m,
                         mod_f_t jacob,
                         mod_f_t stat,
                         mod_f_t initialize,
+                        mod_f_t private_constructor,
+                        mod_f_t private_destructor,
                         int nrnpointerindex,
                         mod_f_t constructor,
                         mod_f_t destructor,
-                        int vectorized,
-                        mod_f_t private_constructor,
-                        mod_f_t private_destructor) {
+                        int vectorized) {
     const Symbol* s = m[1];
     register_mech(m,
                   alloc,
@@ -357,10 +357,10 @@ int point_register_mech(const char** m,
                   jacob,
                   stat,
                   initialize,
-                  nrnpointerindex,
-                  vectorized,
                   private_constructor,
-                  private_destructor);
+                  private_destructor,
+                  nrnpointerindex,
+                  vectorized);
     register_constructor(constructor);
     register_destructor(destructor);
     return point_reg_helper(s);
diff --git a/external/mod2c b/external/mod2c
index bc5f9d696..8b754b35b 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit bc5f9d6962ebcf2b40d64ae8d6b95a829f40f517
+Subproject commit 8b754b35b6ea3088a713590bc5d72af3e2f8ef2b
diff --git a/external/nmodl b/external/nmodl
index b99496a91..ec1f3300c 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit b99496a919df98a89cd97cb6898dda49f0d17c56
+Subproject commit ec1f3300c8e4b6b5cd7c4d85ebb5204050c1b311

From 5ee91a6458cdc602b51634e7e0b8feb7872be32b Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 10 Aug 2022 09:02:17 +0200
Subject: [PATCH 078/128] fix build dependencies

---
 coreneuron/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 57af85b6e..69f61daaf 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -306,6 +306,11 @@ if(CORENRN_ENABLE_GPU)
   target_compile_options(coreneuron-core PRIVATE ${CORENRN_ACC_FLAGS})
 endif()
 
+# Create an extra target for use by NEURON when CoreNEURON is being built as a
+# submodule. NEURON tests will depend on this, so it must in turn depend on
+# everything that is needed to run nrnivmodl -coreneuron.
+add_custom_target(coreneuron-for-tests)
+add_dependencies(coreneuron-for-tests coreneuron-core ${NMODL_TARGET_TO_DEPEND})
 # Create an extra target for internal use that unit tests and so on can depend on.
 # ${corenrn_mech_library} is libcorenrnmech.{a,so}, which contains both the compiled default
 # mechanisms and the content of libcoreneuron-core.a.

From 95972570b27a0f0fb9f1610abc64be1d7ebb6113 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 10 Aug 2022 09:02:43 +0200
Subject: [PATCH 079/128] Try and fix partial_piv_lu.cu linking.

---
 coreneuron/CMakeLists.txt | 8 ++++++--
 external/nmodl            | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 69f61daaf..9e829f940 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -117,7 +117,7 @@ if(CORENRN_ENABLE_GPU)
   # these functions from CUDA kernels presents no issue ... TODO is it going to work to call these
   # from libcoreneuron-cuda.so? probably not...
   if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
-    list(APPEND CORENEURON_CUDA_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
+    set(CORENEURON_CUDA_LIBRARY_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
   endif()
 endif()
 
@@ -159,9 +159,13 @@ endif()
 # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
 add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
 if(CORENRN_ENABLE_GPU)
-  set(coreneuron_cuda_target coreneuron-cuda)
+  set(coreneuron_cuda_target coreneuron-cuda coreneuron-cuda-helpers)
+  add_library(coreneuron-cuda-helpers STATIC ${CORENEURON_CUDA_LIBRARY_FILES})
   add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES})
+  target_link_libraries(coreneuron-cuda PUBLIC coreneuron-cuda-helpers)
   target_link_libraries(coreneuron-core PUBLIC coreneuron-cuda)
+  set_property(TARGET coreneuron-cuda PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+  set_property(TARGET coreneuron-core coreneuron-cuda-helpers PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 endif()
 
 foreach(target coreneuron-core ${coreneuron_cuda_target})
diff --git a/external/nmodl b/external/nmodl
index ec1f3300c..8003e2627 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit ec1f3300c8e4b6b5cd7c4d85ebb5204050c1b311
+Subproject commit 8003e262727ca89630e1e800e8b3415f6d9b716f

From c0862b792f766f7aa8bdd426a03853bc52b10471 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 10 Aug 2022 09:32:53 +0200
Subject: [PATCH 080/128] fix shutdown with NMODL

---
 coreneuron/io/nrn_setup.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index a7ee2fdf8..5aed57fbb 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -754,17 +754,14 @@ void nrn_cleanup() {
                 ml->_thread = nullptr;
             }
 
-            // Probably causes problems with NMODL, which allocates its instance
-            // in unified memory.
-            if (ml->instance) {
-                free(ml->instance);
-                ml->instance = nullptr;
-            }
-
             // Destroy the global variables struct allocated in nrn_init
             if (auto* const priv_dtor = corenrn.get_memb_func(tml->index).private_destructor) {
                 (*priv_dtor)(nt, ml, tml->index);
+                assert(!ml->instance);
+                assert(!ml->instance_size);
+                // TODO make mod2c use `instance` instead of `global_variables`
                 assert(!ml->global_variables);
+                assert(!ml->global_variables_size);
             }
 
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;

From 4f3c5df42c8ce541dfe1a665933c1157009aadb6 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 11 Aug 2022 12:49:23 +0200
Subject: [PATCH 081/128] try and fix all the things

---
 CMakeLists.txt                         |  5 +++++
 coreneuron/CMakeLists.txt              | 10 +++-------
 coreneuron/utils/randoms/nrnran123.cpp | 10 +++++++++-
 external/nmodl                         |  2 +-
 tests/unit/alignment/CMakeLists.txt    |  4 +---
 5 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab3de7345..d53d2a369 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -281,6 +281,11 @@ endif()
 # =============================================================================
 # Build option specific compiler flags
 # =============================================================================
+if(CORENRN_ENABLE_NMODL)
+  # We use Eigen for "small" matrices with thread-level parallelism handled at a
+  # higher level; tell Eigen not to try to multithread internally
+  list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_PARALLELIZE)
+endif()
 if(CORENRN_HAVE_NVHPC_COMPILER)
   # PGI with llvm code generation doesn't have necessary assembly intrinsic headers
   list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_VECTORIZE=1)
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 9e829f940..2fa72a534 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -116,8 +116,8 @@ if(CORENRN_ENABLE_GPU)
   # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
   # these functions from CUDA kernels presents no issue ... TODO is it going to work to call these
   # from libcoreneuron-cuda.so? probably not...
-  if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
-    set(CORENEURON_CUDA_LIBRARY_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
+  if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
+    list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
   endif()
 endif()
 
@@ -159,13 +159,9 @@ endif()
 # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
 add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
 if(CORENRN_ENABLE_GPU)
-  set(coreneuron_cuda_target coreneuron-cuda coreneuron-cuda-helpers)
-  add_library(coreneuron-cuda-helpers STATIC ${CORENEURON_CUDA_LIBRARY_FILES})
+  set(coreneuron_cuda_target coreneuron-cuda)
   add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES})
-  target_link_libraries(coreneuron-cuda PUBLIC coreneuron-cuda-helpers)
   target_link_libraries(coreneuron-core PUBLIC coreneuron-cuda)
-  set_property(TARGET coreneuron-cuda PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
-  set_property(TARGET coreneuron-core coreneuron-cuda-helpers PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 endif()
 
 foreach(target coreneuron-core ${coreneuron_cuda_target})
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index af1378044..0a6c89562 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -21,6 +21,14 @@
 #include <memory>
 #include <mutex>
 
+// Defining these attributes seems to help nvc++ in OpenMP target offload mode.
+#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+    defined(_OPENMP) && defined(__CUDACC__)
+#define CORENRN_HOST_DEVICE __host__ __device__
+#else
+#define CORENRN_HOST_DEVICE
+#endif
+
 namespace {
 #ifdef CORENEURON_USE_BOOST_POOL
 /** Tag type for use with boost::fast_pool_allocator that forwards to
@@ -86,7 +94,7 @@ __attribute__((noinline)) philox4x32_key_t& global_state() {
 }
 }  // namespace
 
-philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
+CORENRN_HOST_DEVICE philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
     return philox4x32(s->c, global_state());
 }
 
diff --git a/external/nmodl b/external/nmodl
index 8003e2627..09005d9ad 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 8003e262727ca89630e1e800e8b3415f6d9b716f
+Subproject commit 09005d9adf2a6f9372a0d4ad11674ce15ff72ca0
diff --git a/tests/unit/alignment/CMakeLists.txt b/tests/unit/alignment/CMakeLists.txt
index 92464350e..0cffdc8b3 100644
--- a/tests/unit/alignment/CMakeLists.txt
+++ b/tests/unit/alignment/CMakeLists.txt
@@ -3,9 +3,7 @@
 #
 # See top-level LICENSE file for details.
 # =============================================================================
-include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS})
-
 add_executable(alignment_test_bin alignment.cpp)
-target_compile_options(alignment_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
+target_link_libraries(alignment_test_bin coreneuron-unit-test)
 add_test(NAME alignment_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:alignment_test_bin>)
 cpp_cc_configure_sanitizers(TARGET alignment_test_bin TEST alignment_test)

From f706029239fa7f0cfbd69cb1abd5c84172c6b8dd Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 11 Aug 2022 12:52:52 +0200
Subject: [PATCH 082/128] one more openmp fix

---
 coreneuron/mechanism/register_mech.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 01a71b5bc..498754d80 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -19,9 +19,7 @@
 
 namespace coreneuron {
 int secondorder = 0;
-nrn_pragma_omp(declare target)
 double t, dt, celsius, pi;
-nrn_pragma_omp(end declare target)
 int rev_dt;
 
 using Pfrv = void (*)();

From adead78fcc33fbd05c8a63f041e91190794dfcef Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 11 Aug 2022 13:05:09 +0200
Subject: [PATCH 083/128] clang-format

---
 coreneuron/gpu/nrn_acc_manager.cpp | 6 ++----
 coreneuron/mechanism/mechanism.hpp | 2 +-
 coreneuron/mechanism/membfunc.hpp  | 3 +--
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 7df4155fb..13ed8d109 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -143,8 +143,7 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
 
     if (ml->instance) {
         assert(ml->instance_size);
-        void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->instance),
-                                               ml->instance_size);
+        void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->instance), ml->instance_size);
         cnrn_target_memcpy_to_device(&(d_ml->instance), &d_inst);
     }
 
@@ -337,8 +336,7 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
 
     if (ml->instance) {
         assert(ml->instance_size);
-        cnrn_target_delete(static_cast<std::byte*>(ml->instance),
-                           ml->instance_size);
+        cnrn_target_delete(static_cast<std::byte*>(ml->instance), ml->instance_size);
     }
 
     cnrn_target_delete(ml);
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index f8efd643b..baa872c85 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -150,7 +150,7 @@ struct Memb_list {
     // accelerator if the "constructor" in the translated MOD file code was
     // called before the main nrn_acc_manager methods that copy
     // thread/mechanism data to the device.
-    void* instance = nullptr;         /* mechanism instance struct from NMODL */
+    void* instance = nullptr; /* mechanism instance struct from NMODL */
     std::size_t instance_size{};
     void* global_variables = nullptr; /* global variables struct for each mechanism */
     std::size_t global_variables_size{};
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 7602a8218..ac650595c 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -99,8 +99,7 @@ extern int register_mech(const char** m,
                          mod_f_t private_constructor,
                          mod_f_t private_destructor,
                          int nrnpointerindex,
-                         int vectorized
-);
+                         int vectorized);
 extern int point_register_mech(const char**,
                                mod_alloc_t alloc,
                                mod_f_t cur,

From de70877ae7228599fa900e85dfeeac0949160314 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 16 Aug 2022 14:10:19 +0200
Subject: [PATCH 084/128] clang-format

---
 coreneuron/utils/randoms/nrnran123.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 0a6c89562..14e2b15df 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -94,7 +94,8 @@ __attribute__((noinline)) philox4x32_key_t& global_state() {
 }
 }  // namespace
 
-CORENRN_HOST_DEVICE philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
+CORENRN_HOST_DEVICE philox4x32_ctr_t
+coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
     return philox4x32(s->c, global_state());
 }
 

From 8c737562a719c93552ebbab28d4ff0970316534d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 16 Aug 2022 14:28:58 +0200
Subject: [PATCH 085/128] nmodl submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 09005d9ad..d44340f48 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 09005d9adf2a6f9372a0d4ad11674ce15ff72ca0
+Subproject commit d44340f4805d08ffa55510448ab48602a2635c62

From 56b573a2fb01bcc19c7739ea703146273cc24357 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 16 Aug 2022 14:32:46 +0200
Subject: [PATCH 086/128] cmake-format

---
 CMakeLists.txt            | 4 ++--
 coreneuron/CMakeLists.txt | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d53d2a369..9c8db5d31 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -282,8 +282,8 @@ endif()
 # Build option specific compiler flags
 # =============================================================================
 if(CORENRN_ENABLE_NMODL)
-  # We use Eigen for "small" matrices with thread-level parallelism handled at a
-  # higher level; tell Eigen not to try to multithread internally
+  # We use Eigen for "small" matrices with thread-level parallelism handled at a higher level; tell
+  # Eigen not to try to multithread internally
   list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_PARALLELIZE)
 endif()
 if(CORENRN_HAVE_NVHPC_COMPILER)
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 2fa72a534..5392a9c48 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -306,9 +306,9 @@ if(CORENRN_ENABLE_GPU)
   target_compile_options(coreneuron-core PRIVATE ${CORENRN_ACC_FLAGS})
 endif()
 
-# Create an extra target for use by NEURON when CoreNEURON is being built as a
-# submodule. NEURON tests will depend on this, so it must in turn depend on
-# everything that is needed to run nrnivmodl -coreneuron.
+# Create an extra target for use by NEURON when CoreNEURON is being built as a submodule. NEURON
+# tests will depend on this, so it must in turn depend on everything that is needed to run nrnivmodl
+# -coreneuron.
 add_custom_target(coreneuron-for-tests)
 add_dependencies(coreneuron-for-tests coreneuron-core ${NMODL_TARGET_TO_DEPEND})
 # Create an extra target for internal use that unit tests and so on can depend on.

From b8f266595b5e71fda7cb626c6eb84df263d1de0d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 16 Aug 2022 14:48:02 +0200
Subject: [PATCH 087/128] Boost unit tests are built in header-only mode now

---
 tests/CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7ef3d9647..d6b334ca3 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -31,8 +31,7 @@ if(Boost_FOUND)
     target_compile_options(coreneuron-unit-test
                            INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
     target_include_directories(coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS})
-    target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all
-                                                         ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
+    target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all)
     add_subdirectory(unit/cmdline_interface)
     add_subdirectory(unit/interleave_info)
     add_subdirectory(unit/alignment)

From 4944c75c8e148ce14f2a10afdf7a4631e8a2125f Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 16 Aug 2022 14:48:46 +0200
Subject: [PATCH 088/128] Drop ${TEST_EXEC_PREFIX} that was causing simple
 tests to be executed on many ranks.

---
 tests/unit/alignment/CMakeLists.txt         | 2 +-
 tests/unit/cmdline_interface/CMakeLists.txt | 2 +-
 tests/unit/interleave_info/CMakeLists.txt   | 3 +--
 tests/unit/lfp/CMakeLists.txt               | 2 +-
 tests/unit/queueing/CMakeLists.txt          | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/unit/alignment/CMakeLists.txt b/tests/unit/alignment/CMakeLists.txt
index 0cffdc8b3..89da4da14 100644
--- a/tests/unit/alignment/CMakeLists.txt
+++ b/tests/unit/alignment/CMakeLists.txt
@@ -5,5 +5,5 @@
 # =============================================================================
 add_executable(alignment_test_bin alignment.cpp)
 target_link_libraries(alignment_test_bin coreneuron-unit-test)
-add_test(NAME alignment_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:alignment_test_bin>)
+add_test(NAME alignment_test COMMAND $<TARGET_FILE:alignment_test_bin>)
 cpp_cc_configure_sanitizers(TARGET alignment_test_bin TEST alignment_test)
diff --git a/tests/unit/cmdline_interface/CMakeLists.txt b/tests/unit/cmdline_interface/CMakeLists.txt
index fadbe60a3..cc98ad78d 100644
--- a/tests/unit/cmdline_interface/CMakeLists.txt
+++ b/tests/unit/cmdline_interface/CMakeLists.txt
@@ -5,5 +5,5 @@
 # =============================================================================
 add_executable(cmd_interface_test_bin test_cmdline_interface.cpp)
 target_link_libraries(cmd_interface_test_bin coreneuron-unit-test)
-add_test(NAME cmd_interface_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:cmd_interface_test_bin>)
+add_test(NAME cmd_interface_test COMMAND $<TARGET_FILE:cmd_interface_test_bin>)
 cpp_cc_configure_sanitizers(TARGET cmd_interface_test_bin TEST cmd_interface_test)
diff --git a/tests/unit/interleave_info/CMakeLists.txt b/tests/unit/interleave_info/CMakeLists.txt
index 948f32405..cda875eae 100644
--- a/tests/unit/interleave_info/CMakeLists.txt
+++ b/tests/unit/interleave_info/CMakeLists.txt
@@ -5,6 +5,5 @@
 # =============================================================================
 add_executable(interleave_info_bin check_constructors.cpp)
 target_link_libraries(interleave_info_bin coreneuron-unit-test)
-add_test(NAME interleave_info_constructor_test COMMAND ${TEST_EXEC_PREFIX}
-                                                       $<TARGET_FILE:interleave_info_bin>)
+add_test(NAME interleave_info_constructor_test COMMAND $<TARGET_FILE:interleave_info_bin>)
 cpp_cc_configure_sanitizers(TARGET interleave_info_bin TEST interleave_info_constructor_test)
diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt
index 8b5b201c4..34231b9f9 100644
--- a/tests/unit/lfp/CMakeLists.txt
+++ b/tests/unit/lfp/CMakeLists.txt
@@ -5,7 +5,7 @@
 # =============================================================================
 add_executable(lfp_test_bin lfp.cpp)
 target_link_libraries(lfp_test_bin coreneuron-unit-test)
-add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:lfp_test_bin>)
+add_test(NAME lfp_test COMMAND $<TARGET_FILE:lfp_test_bin>)
 cpp_cc_configure_sanitizers(TARGET lfp_test_bin TEST lfp_test)
 set_property(
   TEST lfp_test
diff --git a/tests/unit/queueing/CMakeLists.txt b/tests/unit/queueing/CMakeLists.txt
index fc653ea98..05b2a12f2 100644
--- a/tests/unit/queueing/CMakeLists.txt
+++ b/tests/unit/queueing/CMakeLists.txt
@@ -5,5 +5,5 @@
 # =============================================================================
 add_executable(queuing_test_bin test_queueing.cpp)
 target_link_libraries(queuing_test_bin coreneuron-unit-test)
-add_test(NAME queuing_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:queuing_test_bin>)
+add_test(NAME queuing_test COMMAND $<TARGET_FILE:queuing_test_bin>)
 cpp_cc_configure_sanitizers(TARGET queuing_test_bin TEST queuing_test)

From 9d2ce472636d352909f85c1a689b38e63df565de Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 09:03:19 +0200
Subject: [PATCH 089/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index d44340f48..22361bdbb 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit d44340f4805d08ffa55510448ab48602a2635c62
+Subproject commit 22361bdbbe1a7c2874aa93c49e7e601858fc5abf

From 467f1327033853cef34b30bd3933f38131722ecb Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 11:02:31 +0200
Subject: [PATCH 090/128] CORENEURON_GPU_DEBUG: add environment variable that
 enables cnrn_target_* debug messages.

---
 coreneuron/gpu/nrn_acc_manager.cpp | 93 ++++++++++++++++++++++++++++++
 coreneuron/sim/multicore.hpp       |  1 +
 coreneuron/utils/offload.hpp       | 80 +++++++++++++++++++------
 3 files changed, 155 insertions(+), 19 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 13ed8d109..cd6ab939e 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -32,6 +32,13 @@
 #include <cuda_runtime_api.h>
 #endif
 
+#if __has_include(<cxxabi.h>)
+#define USE_CXXABI
+#include <cxxabi.h>
+#include <memory>
+#include <string>
+#endif
+
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
 #include <cassert>
 #include <cstddef>
@@ -44,6 +51,38 @@ std::shared_mutex present_table_mutex;
 }  // namespace
 #endif
 
+namespace {
+/** @brief Try to demangle a type name, return the mangled name on failure.
+ */
+std::string cxx_demangle(const char* mangled) {
+#ifdef USE_CXXABI
+    int status{};  // filled in by __cxa_demangle; non-zero is treated as failure below
+    // Note that the third argument to abi::__cxa_demangle returns the length of
+    // the allocated buffer, which may be larger than strlen(demangled) + 1.
+    std::unique_ptr<char, decltype(free)*> demangled{  // owns the result; released with free()
+        abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
+    return status ? mangled : demangled.get();
+#else
+    return mangled;  // <cxxabi.h> was not found: hand the name back unchanged
+#endif
+}
+bool cnrn_target_debug_output_enabled() {  // parses the CORENEURON_GPU_DEBUG environment variable
+    const char* env = std::getenv("CORENEURON_GPU_DEBUG");
+    if (!env) {
+        return false;  // variable not set: debug output stays off
+    }
+    std::string env_s{env};
+    if (env_s == "1") {
+        return true;
+    } else if (env_s == "0") {
+        return false;
+    } else {  // anything other than "0"/"1" is reported as an error instead of being guessed at
+        throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s + ")");
+    }
+}
+bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
+}  // namespace
+
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
 void nrn_ion_global_map_copyto_device();
@@ -51,6 +90,60 @@ void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
 
+void cnrn_target_copyin_debug(std::string_view file,
+                              int line,
+                              std::size_t sizeof_T,
+                              std::type_info const& typeid_T,
+                              void const* h_ptr,
+                              std::size_t len,
+                              void* d_ptr) {  // logs one cnrn_target_copyin call to stderr
+    if (!cnrn_target_enable_debug) {  // flag is initialised from CORENEURON_GPU_DEBUG
+        return;
+    }
+    std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
+              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
+              << ") -> " << d_ptr << std::endl;
+}
+void cnrn_target_delete_debug(std::string_view file,
+                              int line,
+                              std::size_t sizeof_T,
+                              std::type_info const& typeid_T,
+                              void const* h_ptr,
+                              std::size_t len) {  // logs one cnrn_target_delete call to stderr
+    if (!cnrn_target_enable_debug) {  // flag is initialised from CORENEURON_GPU_DEBUG
+        return;
+    }
+    std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
+              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
+              << std::endl;
+}
+void cnrn_target_deviceptr_debug(std::string_view file,
+                                 int line,
+                                 std::size_t /* sizeof_T */,
+                                 std::type_info const& typeid_T,
+                                 void const* h_ptr,
+                                 void* d_ptr) {
+    if (!cnrn_target_enable_debug) {  // flag is initialised from CORENEURON_GPU_DEBUG
+        return;
+    }
+    std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
+              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;  // host pointer -> device pointer
+}
+void cnrn_target_memcpy_to_device_debug(std::string_view file,
+                                        int line,
+                                        std::size_t sizeof_T,
+                                        std::type_info const& typeid_T,
+                                        void const* h_ptr,
+                                        std::size_t len,
+                                        void* d_ptr) {  // logs one cnrn_target_memcpy_to_device call
+    if (!cnrn_target_enable_debug) {  // flag is initialised from CORENEURON_GPU_DEBUG
+        return;
+    }
+    std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"  // d_ptr is printed first
+              << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
+              << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
+}
+
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
 void* cnrn_target_deviceptr_impl(void const* h_ptr) {
     if (!h_ptr) {
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index 3e06e3585..349e057c5 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -164,6 +164,7 @@ void nrn_multithread_job(F&& job, Args&&... args) {
 
     #pragma omp parallel for private(i) shared(nrn_threads, job, nrn_nthread, \
                                            nrnmpi_myid) schedule(static, 1)
+    // FIXME: multiple forwarding of the same arguments...
     for (i = 0; i < nrn_nthread; ++i) {
         job(nrn_threads + i, std::forward<Args>(args)...);
     }
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index f37724bb4..a487a92fe 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -23,8 +23,35 @@
 #endif
 
 #include <cstddef>
+#include <string_view>
 
 namespace coreneuron {
+void cnrn_target_copyin_debug(std::string_view file,
+                              int line,
+                              std::size_t sizeof_T,
+                              std::type_info const& typeid_T,
+                              void const* h_ptr,
+                              std::size_t len,
+                              void* d_ptr);
+void cnrn_target_delete_debug(std::string_view file,
+                              int line,
+                              std::size_t sizeof_T,
+                              std::type_info const& typeid_T,
+                              void const* h_ptr,
+                              std::size_t len);
+void cnrn_target_deviceptr_debug(std::string_view file,
+                                 int line,
+                                 std::size_t sizeof_T,
+                                 std::type_info const& typeid_T,
+                                 void const* h_ptr,
+                                 void* d_ptr);
+void cnrn_target_memcpy_to_device_debug(std::string_view file,
+                                        int line,
+                                        std::size_t sizeof_T,
+                                        std::type_info const& typeid_T,
+                                        void const* h_ptr,
+                                        std::size_t len,
+                                        void* d_ptr);
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC) && !defined(CORENEURON_UNIFIED_MEMORY)
 // Homegrown implementation for buggy NVHPC versions (<=22.3?)
@@ -35,52 +62,55 @@ void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len)
 #endif
 
 template <typename T>
-T* cnrn_target_deviceptr(const T* h_ptr) {
+T* cnrn_target_deviceptr(std::string_view file, int line, const T* h_ptr) {
+    T* d_ptr{};
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
-    return static_cast<T*>(cnrn_target_deviceptr_impl(h_ptr));
+    d_ptr = static_cast<T*>(cnrn_target_deviceptr_impl(h_ptr));
 #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
-    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
+    d_ptr = static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP)
-    T const* d_ptr{};
-
     nrn_pragma_omp(target data use_device_ptr(h_ptr))
-    { d_ptr = h_ptr; }
-
-    return const_cast<T*>(d_ptr);
+    { d_ptr = const_cast<T*>(h_ptr); }
 #else
     throw std::runtime_error(
         "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
 #endif
+    cnrn_target_deviceptr_debug(file, line, sizeof(T), typeid(T), h_ptr, d_ptr);
+    return d_ptr;
 }
 
 template <typename T>
-T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
+T* cnrn_target_copyin(std::string_view file, int line, const T* h_ptr, std::size_t len = 1) {
+    T* d_ptr{};
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
-    auto* d_ptr = static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
-#ifdef CORENEURON_ENABLE_PRESENT_TABLE
-    cnrn_target_copyin_update_present_table(h_ptr, d_ptr, len * sizeof(T));
-#endif
-    return d_ptr;
+    d_ptr = static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP)
     nrn_pragma_omp(target enter data map(to : h_ptr[:len]))
-    return cnrn_target_deviceptr(h_ptr);
+    nrn_pragma_omp(target data use_device_ptr(h_ptr))
+    { d_ptr = const_cast<T*>(h_ptr); }
 #else
     throw std::runtime_error(
         "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
 #endif
+#ifdef CORENEURON_ENABLE_PRESENT_TABLE
+    cnrn_target_copyin_update_present_table(h_ptr, d_ptr, len * sizeof(T));
+#endif
+    cnrn_target_copyin_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr);
+    return d_ptr;
 }
 
 template <typename T>
-void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
-    defined(_OPENACC)
+void cnrn_target_delete(std::string_view file, int line, T* h_ptr, std::size_t len = 1) {
+    cnrn_target_delete_debug(file, line, sizeof(T), typeid(T), h_ptr, len);
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
     cnrn_target_delete_update_present_table(h_ptr, len * sizeof(T));
 #endif
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+    defined(_OPENACC)
     acc_delete(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP)
@@ -92,7 +122,12 @@ void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
 }
 
 template <typename T>
-void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
+void cnrn_target_memcpy_to_device(std::string_view file,
+                                  int line,
+                                  T* d_ptr,
+                                  const T* h_ptr,
+                                  std::size_t len = 1) {
+    cnrn_target_memcpy_to_device_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr);
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
     acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
@@ -111,4 +146,11 @@ void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1)
 #endif
 }
 
+// Replace with std::source_location once we have C++20
+#define cnrn_target_copyin(...)    cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_delete(...)    cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_deviceptr(...) cnrn_target_deviceptr(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_memcpy_to_device(...) \
+    cnrn_target_memcpy_to_device(__FILE__, __LINE__, __VA_ARGS__)
+
 }  // namespace coreneuron

From b23acd70bbcf2e68ebd3a1b99ac163f8509d8c84 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 11:18:05 +0200
Subject: [PATCH 091/128] avoid sizeof(void)

---
 coreneuron/gpu/nrn_acc_manager.cpp | 1 -
 coreneuron/utils/offload.hpp       | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index cd6ab939e..ab2d16ce2 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -119,7 +119,6 @@ void cnrn_target_delete_debug(std::string_view file,
 }
 void cnrn_target_deviceptr_debug(std::string_view file,
                                  int line,
-                                 std::size_t /* sizeof_T */,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr) {
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index a487a92fe..ff49477ce 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -41,7 +41,6 @@ void cnrn_target_delete_debug(std::string_view file,
                               std::size_t len);
 void cnrn_target_deviceptr_debug(std::string_view file,
                                  int line,
-                                 std::size_t sizeof_T,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr);
@@ -77,7 +76,7 @@ T* cnrn_target_deviceptr(std::string_view file, int line, const T* h_ptr) {
     throw std::runtime_error(
         "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
 #endif
-    cnrn_target_deviceptr_debug(file, line, sizeof(T), typeid(T), h_ptr, d_ptr);
+    cnrn_target_deviceptr_debug(file, line, typeid(T), h_ptr, d_ptr);
     return d_ptr;
 }
 

From 26c394b08f9ed07ad3b3f3722da6255965e85801 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 14:58:15 +0200
Subject: [PATCH 092/128] try and fix ispc

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 22361bdbb..1bd24b21c 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 22361bdbbe1a7c2874aa93c49e7e601858fc5abf
+Subproject commit 1bd24b21c6480fbb38a7ea89ccb1ff9491d89f85

From d74796a4b4cf9bda7ca8d67c4c8bf1e9667a307f Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 15:05:18 +0200
Subject: [PATCH 093/128] drop ispc_celsius

---
 coreneuron/apps/main1.cpp          |  6 ------
 coreneuron/mechanism/nrnoc_ml.ispc |  2 --
 coreneuron/utils/ispc/globals.cpp  | 17 -----------------
 external/nmodl                     |  2 +-
 4 files changed, 1 insertion(+), 26 deletions(-)
 delete mode 100644 coreneuron/utils/ispc/globals.cpp

diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index 4408234b6..e79db2104 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -58,9 +58,6 @@ bool corenrn_units_use_legacy() {
 
 void (*nrn2core_part2_clean_)();
 
-// cf. utils/ispc_globals.c
-extern double ispc_celsius;
-
 /**
  * If "export OMP_NUM_THREADS=n" is not set then omp by default sets
  * the number of threads equal to the number of cores on this node.
@@ -244,9 +241,6 @@ void nrn_init_and_load_data(int argc,
 
     corenrn_param.celsius = celsius;
 
-    // for ispc backend
-    ispc_celsius = celsius;
-
     // create net_cvode instance
     mk_netcvode();
 
diff --git a/coreneuron/mechanism/nrnoc_ml.ispc b/coreneuron/mechanism/nrnoc_ml.ispc
index 6b196eaf3..fa8079fb2 100644
--- a/coreneuron/mechanism/nrnoc_ml.ispc
+++ b/coreneuron/mechanism/nrnoc_ml.ispc
@@ -153,8 +153,6 @@ struct NrnThread {
     void* mapping;
 };
 
-extern uniform double ispc_celsius;
-
 // NOTE : this implementation is duplicated from "coreneuron/network/cvodestb.cpp"
 // If changes are required, make sure to change CPP as well.
 static inline int at_time(uniform NrnThread* nt, varying double te) {
diff --git a/coreneuron/utils/ispc/globals.cpp b/coreneuron/utils/ispc/globals.cpp
deleted file mode 100644
index 0344bf1b8..000000000
--- a/coreneuron/utils/ispc/globals.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
-# =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
-#
-# See top-level LICENSE file for details.
-# =============================================================================
-*/
-
-/*
- * Coreneuron global variables are declared at least in the coreneuron namespace. In ispc it is,
- * however, not possible to access variables within C++ namespaces. To be able to access these
- * variables from ispc kernels, we declare them in global namespace and a C linkage file.
- */
-
-extern "C" {
-double ispc_celsius;
-}
diff --git a/external/nmodl b/external/nmodl
index 1bd24b21c..8559c925c 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 1bd24b21c6480fbb38a7ea89ccb1ff9491d89f85
+Subproject commit 8559c925cf9791dc9f40764ee0eb4c9fd6d9f57c

From 1c238265d08b8ce5e4025dee5c3f58b19b20a131 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 17:21:19 +0200
Subject: [PATCH 094/128] ispc fix

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 8559c925c..a69cb558e 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 8559c925cf9791dc9f40764ee0eb4c9fd6d9f57c
+Subproject commit a69cb558e03b3f53ffcd545cee9fd7157beee33a

From d1757ebb303f78dd2f24706cbb011fda206be2b1 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 18:50:08 +0200
Subject: [PATCH 095/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index a69cb558e..13f00b4c3 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit a69cb558e03b3f53ffcd545cee9fd7157beee33a
+Subproject commit 13f00b4c3ebac25078c3cff20259f4e1c4855291

From 2df364f75a93c2b7478deead824850e8ee169ce4 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 17 Aug 2022 18:50:23 +0200
Subject: [PATCH 096/128] lots more coreneuron builds, presumably temporarily

---
 .gitlab-ci.yml | 101 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3ba880c40..3da9d2b52 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -130,13 +130,72 @@ build:coreneuron:nmodl:nvhpc:acc:
     SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
+build:coreneuron:nmodl:nvhpc:acc:debug:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=Debug
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:acc:shared:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:acc:shared:debug:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit+sympy build_type=Debug
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:acc:legacy:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:acc:debug:legacy:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit~sympy build_type=Debug
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:acc:shared:legacy:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:acc:shared:debug:legacy:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit~sympy build_type=Debug
+  needs: ["build:nmodl"]
+
 build:coreneuron:nmodl:nvhpc:omp:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    # Sympy + OpenMP target offload does not currently work with NVHPC (?)
+    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:omp:debug:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit+sympy build_type=Debug
+  needs: ["build:nmodl"]
+
+build:coreneuron:nmodl:nvhpc:omp:legacy:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
     SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
 
+build:coreneuron:nmodl:nvhpc:omp:debug:legacy:
+  extends: [.build_coreneuron, .spack_nvhpc]
+  variables:
+    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=Debug
+  needs: ["build:nmodl"]
+
 build:coreneuron:mod2c:intel:
   extends: [.build_coreneuron, .spack_intel]
   variables:
@@ -206,10 +265,50 @@ test:coreneuron:nmodl:nvhpc:omp:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:nmodl:nvhpc:omp"]
 
+test:coreneuron:nmodl:nvhpc:omp:debug:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]
+
+test:coreneuron:nmodl:nvhpc:omp:legacy:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"]
+
+test:coreneuron:nmodl:nvhpc:omp:debug:legacy:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug:legacy"]
+
 test:coreneuron:nmodl:nvhpc:acc:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:nmodl:nvhpc:acc"]
 
+test:coreneuron:nmodl:nvhpc:acc:debug:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug"]
+
+test:coreneuron:nmodl:nvhpc:acc:shared:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"]
+
+test:coreneuron:nmodl:nvhpc:acc:shared:debug:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug"]
+
+test:coreneuron:nmodl:nvhpc:acc:legacy:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:legacy"]
+
+test:coreneuron:nmodl:nvhpc:acc:debug:legacy:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"]
+
+test:coreneuron:nmodl:nvhpc:acc:shared:legacy:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:legacy"]
+
+test:coreneuron:nmodl:nvhpc:acc:shared:debug:legacy:
+  extends: [.ctest, .gpu_node]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug:legacy"]
+
 test:coreneuron:mod2c:intel:
   extends: [.ctest]
   needs: ["build:coreneuron:mod2c:intel"]

From da8d64fd40605df7a246127f1003d4766b801d37 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 18 Aug 2022 12:19:10 +0200
Subject: [PATCH 097/128] cleanup on mod2c side

---
 coreneuron/gpu/nrn_acc_manager.cpp | 13 -------------
 coreneuron/io/nrn_setup.cpp        |  3 ---
 coreneuron/mechanism/mechanism.hpp | 15 +++++----------
 coreneuron/utils/offload.hpp       | 11 +++++++++++
 external/mod2c                     |  2 +-
 5 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index ab2d16ce2..fcaf8047b 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -240,13 +240,6 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
     }
 
 
-    if (ml->global_variables) {
-        assert(ml->global_variables_size);
-        void* d_glob_vars = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
-                                               ml->global_variables_size);
-        cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_glob_vars);
-    }
-
     int n = ml->nodecount;
     int szp = corenrn.get_prop_param_size()[type];
     int szdp = corenrn.get_prop_dparam_size()[type];
@@ -420,12 +413,6 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
     }
     cnrn_target_delete(ml->nodeindices, n);
 
-    if (ml->global_variables) {
-        assert(ml->global_variables_size);
-        cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
-                           ml->global_variables_size);
-    }
-
     if (ml->instance) {
         assert(ml->instance_size);
         cnrn_target_delete(static_cast<std::byte*>(ml->instance), ml->instance_size);
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index 5aed57fbb..b9edf7814 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -759,9 +759,6 @@ void nrn_cleanup() {
                 (*priv_dtor)(nt, ml, tml->index);
                 assert(!ml->instance);
                 assert(!ml->instance_size);
-                // TODO make mod2c use `instance` instead of `global_variables`
-                assert(!ml->global_variables);
-                assert(!ml->global_variables_size);
             }
 
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index baa872c85..9335e7530 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -143,16 +143,11 @@ struct Memb_list {
     NetSendBuffer_t* _net_send_buffer = nullptr;
     int nodecount; /* actual node count */
     int _nodecount_padded;
-    // Not obvious that these need to be distinct (i.e. we could just have
-    // `instance` and `instance_size`, and use them in mod2c for global
-    // variables while NMODL could use the existing instance struct for globals
-    // too). nrn_acc_manager.cpp could handle data movement to/from the
-    // accelerator if the "constructor" in the translated MOD file code was
-    // called before the main nrn_acc_manager methods that copy
-    // thread/mechanism data to the device.
-    void* instance = nullptr; /* mechanism instance struct from NMODL */
+    // nrn_acc_manager.cpp handles data movement to/from the accelerator as the
+    // "private constructor" in the translated MOD file code is called before
+    // the main nrn_acc_manager methods that copy thread/mechanism data to the
+    // device
+    void* instance{nullptr}; /* mechanism instance struct */
     std::size_t instance_size{};
-    void* global_variables = nullptr; /* global variables struct for each mechanism */
-    std::size_t global_variables_size{};
 };
 }  // namespace coreneuron
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index ff49477ce..920b786a6 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -145,11 +145,22 @@ void cnrn_target_memcpy_to_device(std::string_view file,
 #endif
 }
 
+template <typename T>
+void cnrn_target_update_on_device(std::string_view file,
+                                  int line,
+                                  const T* h_ptr,
+                                  std::size_t len = 1) {
+    auto* d_ptr = cnrn_target_deviceptr(file, line, h_ptr);
+    cnrn_target_memcpy_to_device(file, line, d_ptr, h_ptr);
+}
+
 // Replace with std::source_location once we have C++20
 #define cnrn_target_copyin(...)    cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
 #define cnrn_target_delete(...)    cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
 #define cnrn_target_deviceptr(...) cnrn_target_deviceptr(__FILE__, __LINE__, __VA_ARGS__)
 #define cnrn_target_memcpy_to_device(...) \
     cnrn_target_memcpy_to_device(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_update_on_device(...) \
+    cnrn_target_update_on_device(__FILE__, __LINE__, __VA_ARGS__)
 
 }  // namespace coreneuron
diff --git a/external/mod2c b/external/mod2c
index 8b754b35b..1264bc364 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 8b754b35b6ea3088a713590bc5d72af3e2f8ef2b
+Subproject commit 1264bc364a89d6d63faa47f82e761a1a7de726b3

From be420bfcc54780587d5d8cb12bbf2f1c672dd608 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 18 Aug 2022 18:20:28 +0200
Subject: [PATCH 098/128] nmodl submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 13f00b4c3..59a971157 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 13f00b4c3ebac25078c3cff20259f4e1c4855291
+Subproject commit 59a9711578e6841c1c3389c5f3685cc4c6bc7a88

From 8dcb6019514843492b94939f32ee38ae3855f423 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 19 Aug 2022 09:51:55 +0200
Subject: [PATCH 099/128] nmodl submodule, eigen shim comments

---
 coreneuron/CMakeLists.txt | 12 +++++++-----
 external/nmodl            |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 5392a9c48..c4143b48c 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -111,11 +111,13 @@ if(CORENRN_ENABLE_GPU)
   # this cannot be included in the same shared library as the rest of the OpenACC code.
   set(CORENEURON_CUDA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)
 
-  # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
-  # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with
-  # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
-  # these functions from CUDA kernels presents no issue ... TODO is it going to work to call these
-  # from libcoreneuron-cuda.so? probably not...
+  # Eigen functions cannot be called directly from OpenACC regions, but Eigen is sort-of compatible
+  # with being compiled as CUDA code. Because of
+  # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
+  # this has to mean `nvc++ -cuda` rather than `nvcc`. We explicitly instantiate Eigen functions for
+  # different matrix sizes in partial_piv_lu.cpp (with CUDA attributes but without OpenACC or OpenMP
+  # annotations) and dispatch to these from a wrapper in partial_piv_lu.h that does have
+  # OpenACC/OpenMP annotations.
   if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
     list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
   endif()
diff --git a/external/nmodl b/external/nmodl
index 59a971157..bed61a5d0 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 59a9711578e6841c1c3389c5f3685cc4c6bc7a88
+Subproject commit bed61a5d0380bd8260776f45515dd40878fafaf7

From a11b4616276a2670e870a9d1faa148cc6dd484cd Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 19 Aug 2022 12:58:06 +0200
Subject: [PATCH 100/128] submodules

---
 external/mod2c | 2 +-
 external/nmodl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/mod2c b/external/mod2c
index 1264bc364..e40c7c093 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 1264bc364a89d6d63faa47f82e761a1a7de726b3
+Subproject commit e40c7c093f70bfba72ade6802e4ba7d242eca03a
diff --git a/external/nmodl b/external/nmodl
index bed61a5d0..e0183df5b 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit bed61a5d0380bd8260776f45515dd40878fafaf7
+Subproject commit e0183df5b9f3d01cb35af21c197090c42f1c3354

From 1cce3af2568335420d995034bf52487895c1311a Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 19 Aug 2022 14:30:25 +0200
Subject: [PATCH 101/128] swap order so needs_foo.o comes before libfoo.so

---
 extra/nrnivmodl_core_makefile.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 4d7df0388..135a9e722 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -222,8 +222,9 @@ coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
 	(cd $(MOD_OBJS_DIR)/libcoreneuron-core && ar x $(CORENRN_LIB_DIR)/libcoreneuron-core.a)
 	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
 	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
-	  $(LDFLAGS) ${SONAME_OPTION} @CORENEURON_LINKER_START_GROUP@ \
+	  @CORENEURON_LINKER_START_GROUP@ \
 	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o @CORENEURON_LINKER_END_GROUP@ \
+		$(LDFLAGS) ${SONAME_OPTION} \
 		-Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR)
 	# cleanup
 	rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o

From 188a93535a9407feb8974fa6187c2837bc5b5392 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 19 Aug 2022 14:49:54 +0200
Subject: [PATCH 102/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index e0183df5b..980a19534 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit e0183df5b9f3d01cb35af21c197090c42f1c3354
+Subproject commit 980a195349091cd64455a75ed9ec574efe948962

From 195e75d27dcc68d38cbfc15f8ff3fad6545414be Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 19 Aug 2022 15:14:52 +0200
Subject: [PATCH 103/128] revert some incomplete unified memory changes, add
 comments, private present table only for nvhpc <= 22.3

---
 coreneuron/gpu/nrn_acc_manager.cpp            |  2 ++
 .../mechanism/mech/mod2c_core_thread.hpp      | 20 +++++++++----------
 coreneuron/sim/multicore.hpp                  |  2 +-
 coreneuron/utils/offload.hpp                  |  8 +++++---
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index fcaf8047b..eed8d0c0f 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -172,6 +172,7 @@ void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std
         return;
     }
     std::lock_guard _{present_table_mutex};
+    // TODO include more pedantic overlap checking?
     auto const result = present_table.emplace(static_cast<std::byte const*>(h_ptr),
                                               std::make_pair(len, static_cast<std::byte*>(d_ptr)));
 }
@@ -180,6 +181,7 @@ void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len)
         return;
     }
     std::lock_guard _{present_table_mutex};
+    // TODO properly matching OpenACC semantics would require a reference count
     auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
     assert(iter != present_table.end());
     assert(iter->second.first == len);
diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
index e224137e0..d18160f3a 100644
--- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp
+++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp
@@ -44,16 +44,16 @@ struct Item {
 
 using List = Item; /* list of mixed items */
 
-struct SparseObj: public MemoryManaged { /* all the state information */
-    Elm** rowst{};                       /* link to first element in row (solution order)*/
-    Elm** diag{};                        /* link to pivot element in row (solution order)*/
-    void* elmpool{};                     /* no interthread cache line sharing for elements */
-    unsigned neqn{};                     /* number of equations */
-    unsigned _cntml_padded{};            /* number of instances */
-    unsigned* varord{};                  /* row and column order for pivots */
-    double* rhs{};                       /* initially- right hand side        finally - answer */
-    unsigned* ngetcall{};                /* per instance counter for number of calls to _getelm */
-    int phase{};                         /* 0-solution phase; 1-count phase; 2-build list phase */
+struct SparseObj {            /* all the state information */
+    Elm** rowst{};            /* link to first element in row (solution order)*/
+    Elm** diag{};             /* link to pivot element in row (solution order)*/
+    void* elmpool{};          /* no interthread cache line sharing for elements */
+    unsigned neqn{};          /* number of equations */
+    unsigned _cntml_padded{}; /* number of instances */
+    unsigned* varord{};       /* row and column order for pivots */
+    double* rhs{};            /* initially- right hand side        finally - answer */
+    unsigned* ngetcall{};     /* per instance counter for number of calls to _getelm */
+    int phase{};              /* 0-solution phase; 1-count phase; 2-build list phase */
     int numop{};
     unsigned coef_list_size{};
     double** coef_list{}; /* pointer to (first instance) value in _getelm order */
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index 349e057c5..a6ac50be0 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -54,7 +54,7 @@ struct NrnFastImem {
     double* nrn_sav_d;
 };
 
-struct TrajectoryRequests: public MemoryManaged {
+struct TrajectoryRequests {
     void** vpr;       /* PlayRecord Objects known by NEURON */
     double** scatter; /* if bsize == 0, each time step */
     double** varrays; /* if bsize > 0, the Vector data pointers. */
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index 920b786a6..184211c19 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -51,9 +51,11 @@ void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                         void const* h_ptr,
                                         std::size_t len,
                                         void* d_ptr);
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
-    defined(_OPENACC) && !defined(CORENEURON_UNIFIED_MEMORY)
-// Homegrown implementation for buggy NVHPC versions (<=22.3?)
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) &&              \
+    defined(_OPENACC) && !defined(CORENEURON_UNIFIED_MEMORY) && defined(__NVCOMPILER_MAJOR__) && \
+    defined(__NVCOMPILER_MINOR__) && (__NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ <= 2203)
+// Homegrown implementation for buggy NVHPC versions (<=22.3), see
+// https://forums.developer.nvidia.com/t/acc-deviceptr-does-not-work-in-openacc-code-dynamically-loaded-from-a-shared-library/211599
 #define CORENEURON_ENABLE_PRESENT_TABLE
 void* cnrn_target_deviceptr_impl(void const* h_ptr);
 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len);

From 52de49e4b0d38dcc6e326a8ed35246069357aeeb Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 22 Aug 2022 15:52:03 +0200
Subject: [PATCH 104/128] nmodl

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 980a19534..7a53be75f 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 980a195349091cd64455a75ed9ec574efe948962
+Subproject commit 7a53be75fa329e5120038b26e53b18dbe3074bd6

From 94fef756d56649779dc91207192bb78db69ca0a2 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 22 Aug 2022 15:58:47 +0200
Subject: [PATCH 105/128] mod2c

---
 external/mod2c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/mod2c b/external/mod2c
index e40c7c093..77bba7715 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit e40c7c093f70bfba72ade6802e4ba7d242eca03a
+Subproject commit 77bba771579c9f91a2e10533967486e5e1f429fa

From dbd53fce89de5893e77738e21082b74a7e5df81b Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 22 Aug 2022 15:59:10 +0200
Subject: [PATCH 106/128] set CORENRN_ENABLE_SHARED as global property

---
 CMake/OpenAccHelper.cmake | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index f232d2bab..a21f8b523 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -101,7 +101,7 @@ if(CORENRN_ENABLE_GPU)
 endif()
 
 # =============================================================================
-# Initialise global property that will be used by NEURON to link with CoreNEURON
+# Initialise global properties that will be used by NEURON to link with CoreNEURON
 # =============================================================================
 if(CORENRN_ENABLE_GPU)
   # CORENRN_LIB_LINK_FLAGS is the full set of flags needed to link against libcorenrnmech.so:
@@ -109,8 +109,8 @@ if(CORENRN_ENABLE_GPU)
   # to be used when linking the NEURON Python module to make sure it is able to dynamically load
   # libcorenrnmech.so.
   set_property(GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
-  # Because of
   if(CORENRN_ENABLE_SHARED)
+    # Because of
     # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
     # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an
     # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds.
@@ -118,6 +118,10 @@ if(CORENRN_ENABLE_GPU)
   endif()
 endif()
 
+# NEURON needs to have access to this when CoreNEURON is built as a submodule. If CoreNEURON is
+# installed externally then this is set via coreneuron-config.cmake
+set_property(GLOBAL PROPERTY CORENRN_ENABLE_SHARED ${CORENRN_ENABLE_SHARED})
+
 if(CORENRN_HAVE_NVHPC_COMPILER)
   if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 20.7)
     # https://forums.developer.nvidia.com/t/many-all-diagnostic-numbers-increased-by-1-from-previous-values/146268/3

From 30ba4b0bdcbdf9f49121b1673621d91a049f7af2 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 23 Aug 2022 14:04:10 +0200
Subject: [PATCH 107/128] re-add global_variables[_size] and more helpers

---
 coreneuron/gpu/nrn_acc_manager.cpp | 52 ++++++++++++++++++++++--------
 coreneuron/io/nrn_setup.cpp        |  3 +-
 coreneuron/mechanism/mechanism.hpp |  5 +--
 coreneuron/mechanism/nrnoc_ml.ispc |  2 ++
 coreneuron/utils/offload.hpp       | 43 +++++++++++++++++-------
 external/nmodl                     |  2 +-
 6 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index eed8d0c0f..a05c897d7 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -125,7 +125,18 @@ void cnrn_target_deviceptr_debug(std::string_view file,
     if (!cnrn_target_enable_debug) {
         return;
     }
-    std::cerr << file << ':' << line << ": cnrn_target_device_ptr<" << cxx_demangle(typeid_T.name())
+    std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
+              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
+}
+void cnrn_target_is_present_debug(std::string_view file,
+                                  int line,
+                                  std::type_info const& typeid_T,
+                                  void const* h_ptr,
+                                  void* d_ptr) {
+    if (!cnrn_target_enable_debug) {
+        return;
+    }
+    std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
               << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
 }
 void cnrn_target_memcpy_to_device_debug(std::string_view file,
@@ -144,28 +155,39 @@ void cnrn_target_memcpy_to_device_debug(std::string_view file,
 }
 
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
-void* cnrn_target_deviceptr_impl(void const* h_ptr) {
+template <bool must_be_present_or_null>
+std::pair<void*, bool> cnrn_target_deviceptr_impl(void const* h_ptr) {
     if (!h_ptr) {
-        return nullptr;
+        return {nullptr, false};
     }
     // Concurrent calls to this method are safe, but they must be serialised
     // w.r.t. calls to the cnrn_target_*_update_present_table methods.
     std::shared_lock _{present_table_mutex};
-    assert(!present_table.empty());
+    if (present_table.empty()) {
+        return {nullptr, must_be_present_or_null};
+    }
     // prev(first iterator greater than h_ptr or last if not found) gives the first iterator less
     // than or equal to h_ptr
     auto const iter = std::prev(std::upper_bound(
         present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
             return hp < entry.first;
         }));
-    assert(iter != present_table.end());
+    if (iter == present_table.end()) {
+        return {nullptr, must_be_present_or_null};
+    }
     std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
     std::byte const* const h_start_of_block{iter->first};
     std::size_t const block_size{iter->second.first};
     std::byte* const d_start_of_block{iter->second.second};
-    assert(h_byte_ptr < h_start_of_block + block_size);
-    return d_start_of_block + (h_byte_ptr - h_start_of_block);
+    bool const is_present{h_byte_ptr < h_start_of_block + block_size};
+    if (!is_present) {
+        return {nullptr, must_be_present_or_null};
+    }
+    return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
 }
+template std::pair<void*, bool> cnrn_target_deviceptr_impl<true>(void const*);
+template std::pair<void*, bool> cnrn_target_deviceptr_impl<false>(void const*);
+
 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
     if (!h_ptr) {
         assert(!d_ptr);
@@ -235,10 +257,11 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
 
     auto d_ml = cnrn_target_copyin(ml);
 
-    if (ml->instance) {
-        assert(ml->instance_size);
-        void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->instance), ml->instance_size);
-        cnrn_target_memcpy_to_device(&(d_ml->instance), &d_inst);
+    if (ml->global_variables) {
+        assert(ml->global_variables_size);
+        void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
+                                          ml->global_variables_size);
+        cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_inst);
     }
 
 
@@ -415,9 +438,10 @@ static void delete_ml_from_device(Memb_list* ml, int type) {
     }
     cnrn_target_delete(ml->nodeindices, n);
 
-    if (ml->instance) {
-        assert(ml->instance_size);
-        cnrn_target_delete(static_cast<std::byte*>(ml->instance), ml->instance_size);
+    if (ml->global_variables) {
+        assert(ml->global_variables_size);
+        cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
+                           ml->global_variables_size);
     }
 
     cnrn_target_delete(ml);
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index b9edf7814..703e853d8 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -758,7 +758,8 @@ void nrn_cleanup() {
             if (auto* const priv_dtor = corenrn.get_memb_func(tml->index).private_destructor) {
                 (*priv_dtor)(nt, ml, tml->index);
                 assert(!ml->instance);
-                assert(!ml->instance_size);
+                assert(!ml->global_variables);
+                assert(ml->global_variables_size == 0);
             }
 
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index 9335e7530..9427423df 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -143,11 +143,12 @@ struct Memb_list {
     NetSendBuffer_t* _net_send_buffer = nullptr;
     int nodecount; /* actual node count */
     int _nodecount_padded;
+    void* instance{nullptr}; /* mechanism instance struct */
     // nrn_acc_manager.cpp handles data movement to/from the accelerator as the
     // "private constructor" in the translated MOD file code is called before
     // the main nrn_acc_manager methods that copy thread/mechanism data to the
     // device
-    void* instance{nullptr}; /* mechanism instance struct */
-    std::size_t instance_size{};
+    void* global_variables{nullptr};
+    std::size_t global_variables_size{};
 };
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/nrnoc_ml.ispc b/coreneuron/mechanism/nrnoc_ml.ispc
index fa8079fb2..0b1196df0 100644
--- a/coreneuron/mechanism/nrnoc_ml.ispc
+++ b/coreneuron/mechanism/nrnoc_ml.ispc
@@ -57,6 +57,8 @@ struct Memb_list {
     uniform int nodecount;
     uniform int _nodecount_padded;
     void* uniform instance;
+    void* uniform global_variables;
+    uniform unsigned long global_variables_size;
 };
 
 struct Point_process {
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index 184211c19..df8f5f507 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -19,10 +19,10 @@
 #else
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x)
-#include <stdexcept>
 #endif
 
 #include <cstddef>
+#include <stdexcept>
 #include <string_view>
 
 namespace coreneuron {
@@ -44,6 +44,11 @@ void cnrn_target_deviceptr_debug(std::string_view file,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr);
+void cnrn_target_is_present_debug(std::string_view file,
+                                  int line,
+                                  std::type_info const& typeid_T,
+                                  void const* h_ptr,
+                                  void* d_ptr);
 void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                         int line,
                                         std::size_t sizeof_T,
@@ -57,16 +62,20 @@ void cnrn_target_memcpy_to_device_debug(std::string_view file,
 // Homegrown implementation for buggy NVHPC versions (<=22.3), see
 // https://forums.developer.nvidia.com/t/acc-deviceptr-does-not-work-in-openacc-code-dynamically-loaded-from-a-shared-library/211599
 #define CORENEURON_ENABLE_PRESENT_TABLE
-void* cnrn_target_deviceptr_impl(void const* h_ptr);
+template <bool>
+std::pair<void*, bool> cnrn_target_deviceptr_impl(void const* h_ptr);
 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len);
 void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len);
 #endif
 
-template <typename T>
+template <bool must_be_present_or_null, typename T>
 T* cnrn_target_deviceptr(std::string_view file, int line, const T* h_ptr) {
     T* d_ptr{};
+    bool error{false};
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
-    d_ptr = static_cast<T*>(cnrn_target_deviceptr_impl(h_ptr));
+    auto const d_ptr_and_error = cnrn_target_deviceptr_impl<must_be_present_or_null>(h_ptr);
+    d_ptr = static_cast<T*>(d_ptr_and_error.first);
+    error = d_ptr_and_error.second;
 #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
     d_ptr = static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
@@ -75,10 +84,21 @@ T* cnrn_target_deviceptr(std::string_view file, int line, const T* h_ptr) {
     nrn_pragma_omp(target data use_device_ptr(h_ptr))
     { d_ptr = const_cast<T*>(h_ptr); }
 #else
-    throw std::runtime_error(
-        "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
+    if (must_be_present_or_null && h_ptr) {
+        throw std::runtime_error(
+            "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
+    }
 #endif
-    cnrn_target_deviceptr_debug(file, line, typeid(T), h_ptr, d_ptr);
+    if (must_be_present_or_null) {
+        cnrn_target_deviceptr_debug(file, line, typeid(T), h_ptr, d_ptr);
+    } else {
+        cnrn_target_is_present_debug(file, line, typeid(T), h_ptr, d_ptr);
+    }
+    if (error) {
+        throw std::runtime_error(
+            "cnrn_target_deviceptr() encountered an error, you may want to try setting "
+            "CORENEURON_GPU_DEBUG=1");
+    }
     return d_ptr;
 }
 
@@ -152,14 +172,15 @@ void cnrn_target_update_on_device(std::string_view file,
                                   int line,
                                   const T* h_ptr,
                                   std::size_t len = 1) {
-    auto* d_ptr = cnrn_target_deviceptr(file, line, h_ptr);
+    auto* d_ptr = cnrn_target_deviceptr<true>(file, line, h_ptr);
     cnrn_target_memcpy_to_device(file, line, d_ptr, h_ptr);
 }
 
 // Replace with std::source_location once we have C++20
-#define cnrn_target_copyin(...)    cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
-#define cnrn_target_delete(...)    cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
-#define cnrn_target_deviceptr(...) cnrn_target_deviceptr(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_copyin(...)     cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_delete(...)     cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_deviceptr(...)  cnrn_target_deviceptr<true>(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_is_present(...) cnrn_target_deviceptr<false>(__FILE__, __LINE__, __VA_ARGS__)
 #define cnrn_target_memcpy_to_device(...) \
     cnrn_target_memcpy_to_device(__FILE__, __LINE__, __VA_ARGS__)
 #define cnrn_target_update_on_device(...) \
diff --git a/external/nmodl b/external/nmodl
index 7a53be75f..32cff1684 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 7a53be75fa329e5120038b26e53b18dbe3074bd6
+Subproject commit 32cff1684e4cb3b66b1d0bfa21b0ed062edf9b22

From 9c28e3d7e7a496d1104b832dc74b1f9fc441013a Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 23 Aug 2022 14:31:14 +0200
Subject: [PATCH 108/128] reference counting in private present table

---
 coreneuron/gpu/nrn_acc_manager.cpp | 39 ++++++++++++++++++++----------
 coreneuron/utils/offload.hpp       | 24 ++++++++++--------
 external/nmodl                     |  2 +-
 3 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index a05c897d7..1fcc59478 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -46,7 +46,11 @@
 #include <map>
 #include <shared_mutex>
 namespace {
-std::map<std::byte const*, std::pair<std::size_t, std::byte*>> present_table;
+struct present_table_value {
+    std::size_t ref_count{}, size{};
+    std::byte* dev_ptr{};
+};
+std::map<std::byte const*, present_table_value> present_table;
 std::shared_mutex present_table_mutex;
 }  // namespace
 #endif
@@ -155,8 +159,7 @@ void cnrn_target_memcpy_to_device_debug(std::string_view file,
 }
 
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
-template <bool must_be_present_or_null>
-std::pair<void*, bool> cnrn_target_deviceptr_impl(void const* h_ptr) {
+std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
     if (!h_ptr) {
         return {nullptr, false};
     }
@@ -177,16 +180,14 @@ std::pair<void*, bool> cnrn_target_deviceptr_impl(void const* h_ptr) {
     }
     std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
     std::byte const* const h_start_of_block{iter->first};
-    std::size_t const block_size{iter->second.first};
-    std::byte* const d_start_of_block{iter->second.second};
+    std::size_t const block_size{iter->second.size};
+    std::byte* const d_start_of_block{iter->second.dev_ptr};
     bool const is_present{h_byte_ptr < h_start_of_block + block_size};
     if (!is_present) {
         return {nullptr, must_be_present_or_null};
     }
     return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
 }
-template std::pair<void*, bool> cnrn_target_deviceptr_impl<true>(void const*);
-template std::pair<void*, bool> cnrn_target_deviceptr_impl<false>(void const*);
 
 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
     if (!h_ptr) {
@@ -194,20 +195,32 @@ void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std
         return;
     }
     std::lock_guard _{present_table_mutex};
-    // TODO include more pendantic overlap checking?
-    auto const result = present_table.emplace(static_cast<std::byte const*>(h_ptr),
-                                              std::make_pair(len, static_cast<std::byte*>(d_ptr)));
+    // TODO include more pedantic overlap checking?
+    present_table_value new_val{};
+    new_val.size = len;
+    new_val.ref_count = 1;
+    new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
+    auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
+                                                        std::move(new_val));
+    if (!inserted) {
+        // Insertion didn't occur because h_ptr was already in the present table
+        assert(iter->second.size == len);
+        assert(iter->second.dev_ptr == new_val.dev_ptr);
+        ++(iter->second.ref_count);
+    }
 }
 void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
     if (!h_ptr) {
         return;
     }
     std::lock_guard _{present_table_mutex};
-    // TODO properly matching OpenACC semantics would require a reference count
     auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
     assert(iter != present_table.end());
-    assert(iter->second.first == len);
-    present_table.erase(iter);
+    assert(iter->second.size == len);
+    --(iter->second.ref_count);
+    if (iter->second.ref_count == 0) {
+        present_table.erase(iter);
+    }
 }
 #endif
 
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index df8f5f507..1911d364b 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -62,18 +62,20 @@ void cnrn_target_memcpy_to_device_debug(std::string_view file,
 // Homegrown implementation for buggy NVHPC versions (<=22.3), see
 // https://forums.developer.nvidia.com/t/acc-deviceptr-does-not-work-in-openacc-code-dynamically-loaded-from-a-shared-library/211599
 #define CORENEURON_ENABLE_PRESENT_TABLE
-template <bool>
-std::pair<void*, bool> cnrn_target_deviceptr_impl(void const* h_ptr);
+std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr);
 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len);
 void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len);
 #endif
 
-template <bool must_be_present_or_null, typename T>
-T* cnrn_target_deviceptr(std::string_view file, int line, const T* h_ptr) {
+template <typename T>
+T* cnrn_target_deviceptr_or_present(std::string_view file,
+                                    int line,
+                                    bool must_be_present_or_null,
+                                    const T* h_ptr) {
     T* d_ptr{};
     bool error{false};
 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
-    auto const d_ptr_and_error = cnrn_target_deviceptr_impl<must_be_present_or_null>(h_ptr);
+    auto const d_ptr_and_error = cnrn_target_deviceptr_impl(must_be_present_or_null, h_ptr);
     d_ptr = static_cast<T*>(d_ptr_and_error.first);
     error = d_ptr_and_error.second;
 #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
@@ -172,15 +174,17 @@ void cnrn_target_update_on_device(std::string_view file,
                                   int line,
                                   const T* h_ptr,
                                   std::size_t len = 1) {
-    auto* d_ptr = cnrn_target_deviceptr<true>(file, line, h_ptr);
+    auto* d_ptr = cnrn_target_deviceptr_or_present(file, line, true, h_ptr);
     cnrn_target_memcpy_to_device(file, line, d_ptr, h_ptr);
 }
 
 // Replace with std::source_location once we have C++20
-#define cnrn_target_copyin(...)     cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
-#define cnrn_target_delete(...)     cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
-#define cnrn_target_deviceptr(...)  cnrn_target_deviceptr<true>(__FILE__, __LINE__, __VA_ARGS__)
-#define cnrn_target_is_present(...) cnrn_target_deviceptr<false>(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_copyin(...) cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_delete(...) cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
+#define cnrn_target_is_present(...) \
+    cnrn_target_deviceptr_or_present(__FILE__, __LINE__, false, __VA_ARGS__)
+#define cnrn_target_deviceptr(...) \
+    cnrn_target_deviceptr_or_present(__FILE__, __LINE__, true, __VA_ARGS__)
 #define cnrn_target_memcpy_to_device(...) \
     cnrn_target_memcpy_to_device(__FILE__, __LINE__, __VA_ARGS__)
 #define cnrn_target_update_on_device(...) \
diff --git a/external/nmodl b/external/nmodl
index 32cff1684..032f3ae88 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 32cff1684e4cb3b66b1d0bfa21b0ed062edf9b22
+Subproject commit 032f3ae88278262a4ba3d22469bde3c7f578e1f6

From 89f3e67442732cf1f5f814a559482d13e584afb4 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 23 Aug 2022 16:00:22 +0200
Subject: [PATCH 109/128] explicit copyin/update/copyout for
 celsius/pi/secondorder

---
 coreneuron/apps/main1.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index e79db2104..a429b04f7 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -500,6 +500,9 @@ extern "C" void mk_mech_init(int argc, char** argv) {
 #ifdef CORENEURON_ENABLE_GPU
     if (corenrn_param.gpu) {
         init_gpu();
+        cnrn_target_copyin(&celsius);
+        cnrn_target_copyin(&pi);
+        cnrn_target_copyin(&secondorder);
         nrnran123_initialise_global_state_on_device();
     }
 #endif
@@ -558,6 +561,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
 #endif
     bool compute_gpu = corenrn_param.gpu;
 
+    nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
+    nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
     {
         double v = corenrn_param.voltage;
         double dt = corenrn_param.dt;
@@ -679,6 +684,9 @@ extern "C" int run_solve_core(int argc, char** argv) {
             nrn_partrans::delete_gap_indices_from_device();
         }
         nrnran123_destroy_global_state_on_device();
+        cnrn_target_delete(&secondorder);
+        cnrn_target_delete(&pi);
+        cnrn_target_delete(&celsius);
     }
 
     // Cleaning the memory

From fbde41ca1abe4fdd28bbf8d171626b971ab7cc41 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Tue, 23 Aug 2022 16:14:40 +0200
Subject: [PATCH 110/128] submodules

---
 external/mod2c | 2 +-
 external/nmodl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/mod2c b/external/mod2c
index 77bba7715..ec96803a3 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 77bba771579c9f91a2e10533967486e5e1f429fa
+Subproject commit ec96803a3ec34bab63cc0e00b6cc85581eacd403
diff --git a/external/nmodl b/external/nmodl
index 032f3ae88..bac6ff883 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 032f3ae88278262a4ba3d22469bde3c7f578e1f6
+Subproject commit bac6ff8839bf6556bab88aec9666a91d7de44825

From c728fabe04ebc70fe15da9ec92f8f0a2bd9f02fa Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 24 Aug 2022 11:27:01 +0200
Subject: [PATCH 111/128] ispc fix

---
 coreneuron/mechanism/nrnoc_ml.ispc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/mechanism/nrnoc_ml.ispc b/coreneuron/mechanism/nrnoc_ml.ispc
index 0b1196df0..2c28a745f 100644
--- a/coreneuron/mechanism/nrnoc_ml.ispc
+++ b/coreneuron/mechanism/nrnoc_ml.ispc
@@ -58,7 +58,7 @@ struct Memb_list {
     uniform int _nodecount_padded;
     void* uniform instance;
     void* uniform global_variables;
-    uniform unsigned long global_variables_size;
+    uniform size_t global_variables_size;
 };
 
 struct Point_process {

From 0adad8b225f5184c9aa2de5bf366a4c46291a034 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 24 Aug 2022 11:27:17 +0200
Subject: [PATCH 112/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index bac6ff883..07086dfa3 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit bac6ff8839bf6556bab88aec9666a91d7de44825
+Subproject commit 07086dfa38e308699c86212cf98d5324217995bc

From dc8d86bad778c2047e1cdc4c1603f752157df7b3 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 24 Aug 2022 14:50:18 +0200
Subject: [PATCH 113/128] OpenMP: support cnrn_target_is_present

---
 coreneuron/utils/offload.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index 1911d364b..b0a19fb67 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -83,8 +83,10 @@ T* cnrn_target_deviceptr_or_present(std::string_view file,
     d_ptr = static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP)
-    nrn_pragma_omp(target data use_device_ptr(h_ptr))
-    { d_ptr = const_cast<T*>(h_ptr); }
+    if (must_be_present_or_null || omp_target_is_present(h_ptr, omp_get_default_device())) {
+        nrn_pragma_omp(target data use_device_ptr(h_ptr))
+        { d_ptr = const_cast<T*>(h_ptr); }
+    }
 #else
     if (must_be_present_or_null && h_ptr) {
         throw std::runtime_error(

From d75d5d4de25758720971558a13dd096550caee9d Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 24 Aug 2022 14:50:31 +0200
Subject: [PATCH 114/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 07086dfa3..6b43a20be 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 07086dfa38e308699c86212cf98d5324217995bc
+Subproject commit 6b43a20be76fce6b144e4324c574301c1f087387

From d8ea959ebf9ea59823987038601824ccc787aadb Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 24 Aug 2022 15:57:02 +0200
Subject: [PATCH 115/128] omp_target_is_present has problems in nvhpc 22.3

---
 coreneuron/utils/offload.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index b0a19fb67..6297221e8 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -56,9 +56,9 @@ void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                         void const* h_ptr,
                                         std::size_t len,
                                         void* d_ptr);
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) &&              \
-    defined(_OPENACC) && !defined(CORENEURON_UNIFIED_MEMORY) && defined(__NVCOMPILER_MAJOR__) && \
-    defined(__NVCOMPILER_MINOR__) && (__NVCOMPILER_MAJOR__ <= 22) && (__NVCOMPILER_MINOR__ <= 3)
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY) && \
+    defined(__NVCOMPILER_MAJOR__) && defined(__NVCOMPILER_MINOR__) &&        \
+    (__NVCOMPILER_MAJOR__ <= 22) && (__NVCOMPILER_MINOR__ <= 3)
 // Homegrown implementation for buggy NVHPC versions (<=22.3), see
 // https://forums.developer.nvidia.com/t/acc-deviceptr-does-not-work-in-openacc-code-dynamically-loaded-from-a-shared-library/211599
 #define CORENEURON_ENABLE_PRESENT_TABLE

From 74e5f0ac9abaa686279119ffac9208d8866bee66 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 11:41:59 +0200
Subject: [PATCH 116/128] Update GitLab CI config

---
 .gitlab-ci.yml | 252 +++++++++++++++----------------------------------
 1 file changed, 78 insertions(+), 174 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3da9d2b52..b000c1eab 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,256 +96,160 @@ build:nmodl:
     # TODO: fix this more robustly so we don't have to play so many games.
     SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc ^caliper%gcc+cuda cuda_arch=70
 
-build:coreneuron:mod2c:nvhpc:acc:
-  extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=RelWithDebInfo
-
-build:coreneuron:mod2c:nvhpc:acc:debug:
-  extends: [.build_coreneuron, .spack_nvhpc]
+build:coreneuron:mod2c:intel:shared:
+  extends: [.build_coreneuron, .spack_intel]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu+openmp~shared+tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl+openmp+shared+tests~unified build_type=RelWithDebInfo
 
-# Shared + OpenACC + OpenMP host threading has problems
-build:coreneuron:mod2c:nvhpc:acc:shared:
-  extends: [.build_coreneuron, .spack_nvhpc]
+build:coreneuron:nmodl:intel:shared:debug:legacy:
+  extends: [.build_coreneuron, .spack_intel]
+  needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~openmp+shared+tests~legacy-unit build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl+openmp+shared~sympy+tests~unified build_type=Debug
 
-# Shared + OpenACC + OpenMP host threading has problems
-build:coreneuron:mod2c:nvhpc:acc:shared:debug:
-  extends: [.build_coreneuron, .spack_nvhpc]
+# Disable caliper to improve coverage
+build:coreneuron:nmodl:intel:
+  extends: [.build_coreneuron, .spack_intel]
+  needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~openmp+shared+tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=RelWithDebInfo
 
-# Build CoreNEURON with Unified Memory on GPU
-build:coreneuron:mod2c:nvhpc:acc:unified:
+# Not linked to a NEURON build+test job, see
+# https://github.com/BlueBrain/CoreNeuron/issues/594
+build:coreneuron:mod2c:nvhpc:acc:debug:unified:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu+unified+openmp~shared+tests~legacy-unit build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl+openmp~shared+tests+unified build_type=Debug
 
-build:coreneuron:nmodl:nvhpc:acc:
+# Shared + OpenACC + OpenMP host threading has problems
+build:coreneuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
-  needs: ["build:nmodl"]
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=Debug
 
-build:coreneuron:nmodl:nvhpc:acc:debug:
+build:coreneuron:nmodl:nvhpc:acc:legacy:
   extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit+sympy build_type=Debug
   needs: ["build:nmodl"]
-
-build:coreneuron:nmodl:nvhpc:acc:shared:
-  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
-  needs: ["build:nmodl"]
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=RelWithDebInfo
 
 build:coreneuron:nmodl:nvhpc:acc:shared:debug:
   extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit+sympy build_type=Debug
   needs: ["build:nmodl"]
-
-build:coreneuron:nmodl:nvhpc:acc:legacy:
-  extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
-  needs: ["build:nmodl"]
-
-build:coreneuron:nmodl:nvhpc:acc:debug:legacy:
-  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu~shared+tests~legacy-unit~sympy build_type=Debug
-  needs: ["build:nmodl"]
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=Debug
 
-build:coreneuron:nmodl:nvhpc:acc:shared:legacy:
+build:coreneuron:nmodl:nvhpc:omp:debug:legacy:
   extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
-
-build:coreneuron:nmodl:nvhpc:acc:shared:debug:legacy:
-  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl~openmp+gpu+shared+tests~legacy-unit~sympy build_type=Debug
-  needs: ["build:nmodl"]
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=Debug
 
 build:coreneuron:nmodl:nvhpc:omp:
   extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit+sympy build_type=RelWithDebInfo
   needs: ["build:nmodl"]
-
-build:coreneuron:nmodl:nvhpc:omp:debug:
-  extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit+sympy build_type=Debug
-  needs: ["build:nmodl"]
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=RelWithDebInfo
 
-build:coreneuron:nmodl:nvhpc:omp:legacy:
-  extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=RelWithDebInfo
-  needs: ["build:nmodl"]
-
-build:coreneuron:nmodl:nvhpc:omp:debug:legacy:
-  extends: [.build_coreneuron, .spack_nvhpc]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl+openmp+gpu~shared+tests~legacy-unit~sympy build_type=Debug
-  needs: ["build:nmodl"]
+# Build NEURON
+build:neuron:mod2c:intel:shared:
+  extends: [.build_neuron, .spack_intel]
+  needs: ["build:coreneuron:mod2c:intel:shared"]
 
-build:coreneuron:mod2c:intel:
-  extends: [.build_coreneuron, .spack_intel]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+tests~legacy-unit build_type=Debug
+build:neuron:nmodl:intel:shared:debug:legacy:
+  extends: [.build_neuron, .spack_intel]
+  needs: ["build:coreneuron:nmodl:intel:shared:debug:legacy"]
 
-build:coreneuron:nmodl:intel:
-  extends: [.build_coreneuron, .spack_intel]
-  variables:
-    SPACK_PACKAGE_SPEC: +caliper+nmodl+tests~legacy-unit build_type=Debug
-  needs: ["build:nmodl"]
+build:neuron:nmodl:intel:
+  extends: [.build_neuron, .spack_intel]
+  needs: ["build:coreneuron:nmodl:intel"]
 
-# Build NEURON
-build:neuron:mod2c:nvhpc:acc:
+build:neuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc"]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
 
-build:neuron:mod2c:nvhpc:acc:debug:
+build:neuron:nmodl:nvhpc:acc:legacy:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:debug"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:legacy"]
 
-build:neuron:mod2c:nvhpc:acc:shared:
+build:neuron:nmodl:nvhpc:acc:shared:debug:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug"]
 
-build:neuron:mod2c:nvhpc:acc:shared:debug:
+build:neuron:nmodl:nvhpc:omp:debug:legacy:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug:legacy"]
 
 build:neuron:nmodl:nvhpc:omp:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:nmodl:nvhpc:omp"]
 
-build:neuron:nmodl:nvhpc:acc:
-  extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc"]
+# Test CoreNEURON
+test:coreneuron:mod2c:intel:shared:
+  extends: [.ctest]
+  needs: ["build:coreneuron:mod2c:intel:shared"]
 
-build:neuron:mod2c:intel:
-  extends: [.build_neuron, .spack_intel]
-  needs: ["build:coreneuron:mod2c:intel"]
+test:coreneuron:nmodl:intel:shared:debug:legacy:
+  extends: [.ctest]
+  needs: ["build:coreneuron:nmodl:intel:shared:debug:legacy"]
 
-build:neuron:nmodl:intel:
-  extends: [.build_neuron, .spack_intel]
+test:coreneuron:nmodl:intel:
+  extends: [.ctest]
   needs: ["build:coreneuron:nmodl:intel"]
 
-# Test CoreNEURON
-test:coreneuron:mod2c:nvhpc:acc:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc"]
-
-test:coreneuron:mod2c:nvhpc:acc:debug:
+test:coreneuron:mod2c:nvhpc:acc:debug:unified:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:debug"]
-
-test:coreneuron:mod2c:nvhpc:acc:shared:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:debug:unified"]
 
 test:coreneuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
 
-test:coreneuron:mod2c:nvhpc:acc:unified:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:unified"]
-
-test:coreneuron:nmodl:nvhpc:omp:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp"]
-
-test:coreneuron:nmodl:nvhpc:omp:debug:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]
-
-test:coreneuron:nmodl:nvhpc:omp:legacy:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"]
-
-test:coreneuron:nmodl:nvhpc:omp:debug:legacy:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug:legacy"]
-
-test:coreneuron:nmodl:nvhpc:acc:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc"]
-
-test:coreneuron:nmodl:nvhpc:acc:debug:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug"]
-
-test:coreneuron:nmodl:nvhpc:acc:shared:
+test:coreneuron:nmodl:nvhpc:acc:legacy:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:legacy"]
 
 test:coreneuron:nmodl:nvhpc:acc:shared:debug:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug"]
 
-test:coreneuron:nmodl:nvhpc:acc:legacy:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:legacy"]
-
-test:coreneuron:nmodl:nvhpc:acc:debug:legacy:
+test:coreneuron:nmodl:nvhpc:omp:debug:legacy:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug:legacy"]
 
-test:coreneuron:nmodl:nvhpc:acc:shared:legacy:
+test:coreneuron:nmodl:nvhpc:omp:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:legacy"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp"]
 
-test:coreneuron:nmodl:nvhpc:acc:shared:debug:legacy:
-  extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug:legacy"]
+# Test NEURON
+test:neuron:mod2c:intel:shared:
+  extends: [.test_neuron]
+  needs: ["build:neuron:mod2c:intel:shared"]
 
-test:coreneuron:mod2c:intel:
-  extends: [.ctest]
-  needs: ["build:coreneuron:mod2c:intel"]
+test:neuron:nmodl:intel:shared:debug:legacy:
+  extends: [.test_neuron]
+  needs: ["build:neuron:nmodl:intel:shared:debug:legacy"]
 
-test:coreneuron:nmodl:intel:
-  extends: [.ctest]
-  needs: ["build:coreneuron:nmodl:intel"]
+test:neuron:nmodl:intel:
+  extends: [.test_neuron]
+  needs: ["build:neuron:nmodl:intel"]
 
-# Test NEURON
-test:neuron:mod2c:nvhpc:acc:
+test:neuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:mod2c:nvhpc:acc"]
+  needs: ["build:neuron:mod2c:nvhpc:acc:shared:debug"]
 
-test:neuron:mod2c:nvhpc:acc:debug:
+test:neuron:nmodl:nvhpc:acc:legacy:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:mod2c:nvhpc:acc:debug"]
+  needs: ["build:neuron:nmodl:nvhpc:acc:legacy"]
 
-test:neuron:mod2c:nvhpc:acc:shared:
+test:neuron:nmodl:nvhpc:acc:shared:debug:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:mod2c:nvhpc:acc:shared"]
+  needs: ["build:neuron:nmodl:nvhpc:acc:shared:debug"]
 
-test:neuron:mod2c:nvhpc:acc:shared:debug:
+test:neuron:nmodl:nvhpc:omp:debug:legacy:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:mod2c:nvhpc:acc:shared:debug"]
+  needs: ["build:neuron:nmodl:nvhpc:omp:debug:legacy"]
 
 test:neuron:nmodl:nvhpc:omp:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:nmodl:nvhpc:omp"]
-
-test:neuron:nmodl:nvhpc:acc:
-  extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:nmodl:nvhpc:acc"]
-
-test:neuron:mod2c:intel:
-  extends: [.test_neuron]
-  needs: ["build:neuron:mod2c:intel"]
-
-test:neuron:nmodl:intel:
-  extends: [.test_neuron]
-  needs: ["build:neuron:nmodl:intel"]

From 6b96330671326403231247c50e289b5786f8c497 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 11:55:09 +0200
Subject: [PATCH 117/128] tweak nmodl+cpu builds

---
 .gitlab-ci.yml | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b000c1eab..cd1944cde 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -101,18 +101,18 @@ build:coreneuron:mod2c:intel:shared:
   variables:
     SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl+openmp+shared+tests~unified build_type=RelWithDebInfo
 
-build:coreneuron:nmodl:intel:shared:debug:legacy:
+build:coreneuron:nmodl:intel:debug:legacy:
   extends: [.build_coreneuron, .spack_intel]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl+openmp+shared~sympy+tests~unified build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=Debug
 
 # Disable caliper to improve coverage
-build:coreneuron:nmodl:intel:
+build:coreneuron:nmodl:intel:shared:
   extends: [.build_coreneuron, .spack_intel]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl+openmp+shared+sympy+tests~unified build_type=RelWithDebInfo
 
 # Not linked to a NEURON build+test job, see
 # https://github.com/BlueBrain/CoreNeuron/issues/594
@@ -156,13 +156,13 @@ build:neuron:mod2c:intel:shared:
   extends: [.build_neuron, .spack_intel]
   needs: ["build:coreneuron:mod2c:intel:shared"]
 
-build:neuron:nmodl:intel:shared:debug:legacy:
+build:neuron:nmodl:intel:debug:legacy:
   extends: [.build_neuron, .spack_intel]
-  needs: ["build:coreneuron:nmodl:intel:shared:debug:legacy"]
+  needs: ["build:coreneuron:nmodl:intel:debug:legacy"]
 
-build:neuron:nmodl:intel:
+build:neuron:nmodl:intel:shared:
   extends: [.build_neuron, .spack_intel]
-  needs: ["build:coreneuron:nmodl:intel"]
+  needs: ["build:coreneuron:nmodl:intel:shared"]
 
 build:neuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.build_neuron, .spack_nvhpc]
@@ -189,13 +189,13 @@ test:coreneuron:mod2c:intel:shared:
   extends: [.ctest]
   needs: ["build:coreneuron:mod2c:intel:shared"]
 
-test:coreneuron:nmodl:intel:shared:debug:legacy:
+test:coreneuron:nmodl:intel:debug:legacy:
   extends: [.ctest]
-  needs: ["build:coreneuron:nmodl:intel:shared:debug:legacy"]
+  needs: ["build:coreneuron:nmodl:intel:debug:legacy"]
 
-test:coreneuron:nmodl:intel:
+test:coreneuron:nmodl:intel:shared:
   extends: [.ctest]
-  needs: ["build:coreneuron:nmodl:intel"]
+  needs: ["build:coreneuron:nmodl:intel:shared"]
 
 test:coreneuron:mod2c:nvhpc:acc:debug:unified:
   extends: [.ctest, .gpu_node]
@@ -226,13 +226,13 @@ test:neuron:mod2c:intel:shared:
   extends: [.test_neuron]
   needs: ["build:neuron:mod2c:intel:shared"]
 
-test:neuron:nmodl:intel:shared:debug:legacy:
+test:neuron:nmodl:intel:debug:legacy:
   extends: [.test_neuron]
-  needs: ["build:neuron:nmodl:intel:shared:debug:legacy"]
+  needs: ["build:neuron:nmodl:intel:debug:legacy"]
 
-test:neuron:nmodl:intel:
+test:neuron:nmodl:intel:shared:
   extends: [.test_neuron]
-  needs: ["build:neuron:nmodl:intel"]
+  needs: ["build:neuron:nmodl:intel:shared"]
 
 test:neuron:mod2c:nvhpc:acc:shared:debug:
   extends: [.test_neuron, .gpu_node]

From d72ecde07632f9eb0200eed70b59e6b1bd742d69 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 11:56:33 +0200
Subject: [PATCH 118/128] tweak mod2c+gpu build

---
 .gitlab-ci.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cd1944cde..01b3175ba 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -122,10 +122,10 @@ build:coreneuron:mod2c:nvhpc:acc:debug:unified:
     SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl+openmp~shared+tests+unified build_type=Debug
 
 # Shared + OpenACC + OpenMP host threading has problems
-build:coreneuron:mod2c:nvhpc:acc:shared:debug:
+build:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.build_coreneuron, .spack_nvhpc]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=RelWithDebInfo
 
 build:coreneuron:nmodl:nvhpc:acc:legacy:
   extends: [.build_coreneuron, .spack_nvhpc]
@@ -164,9 +164,9 @@ build:neuron:nmodl:intel:shared:
   extends: [.build_neuron, .spack_intel]
   needs: ["build:coreneuron:nmodl:intel:shared"]
 
-build:neuron:mod2c:nvhpc:acc:shared:debug:
+build:neuron:mod2c:nvhpc:acc:shared:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
 build:neuron:nmodl:nvhpc:acc:legacy:
   extends: [.build_neuron, .spack_nvhpc]
@@ -201,9 +201,9 @@ test:coreneuron:mod2c:nvhpc:acc:debug:unified:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:debug:unified"]
 
-test:coreneuron:mod2c:nvhpc:acc:shared:debug:
+test:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared:debug"]
+  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
 test:coreneuron:nmodl:nvhpc:acc:legacy:
   extends: [.ctest, .gpu_node]
@@ -234,9 +234,9 @@ test:neuron:nmodl:intel:shared:
   extends: [.test_neuron]
   needs: ["build:neuron:nmodl:intel:shared"]
 
-test:neuron:mod2c:nvhpc:acc:shared:debug:
+test:neuron:mod2c:nvhpc:acc:shared:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:mod2c:nvhpc:acc:shared:debug"]
+  needs: ["build:neuron:mod2c:nvhpc:acc:shared"]
 
 test:neuron:nmodl:nvhpc:acc:legacy:
   extends: [.test_neuron, .gpu_node]

From 977779ec201ed950938a99ebf7fa9b598789b029 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 13:14:29 +0200
Subject: [PATCH 119/128] swap debug/relwithdebinfo for nmodl+gpu builds

---
 .gitlab-ci.yml | 64 +++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 01b3175ba..d7ba6c74f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -127,29 +127,29 @@ build:coreneuron:mod2c:nvhpc:acc:shared:
   variables:
     SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=RelWithDebInfo
 
-build:coreneuron:nmodl:nvhpc:acc:legacy:
+build:coreneuron:nmodl:nvhpc:acc:debug:legacy:
   extends: [.build_coreneuron, .spack_nvhpc]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug
 
-build:coreneuron:nmodl:nvhpc:acc:shared:debug:
+build:coreneuron:nmodl:nvhpc:acc:shared:
   extends: [.build_coreneuron, .spack_nvhpc]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=RelWithDebInfo
 
-build:coreneuron:nmodl:nvhpc:omp:debug:legacy:
+build:coreneuron:nmodl:nvhpc:omp:legacy:
   extends: [.build_coreneuron, .spack_nvhpc]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=RelWithDebInfo
 
-build:coreneuron:nmodl:nvhpc:omp:
+build:coreneuron:nmodl:nvhpc:omp:debug:
   extends: [.build_coreneuron, .spack_nvhpc]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=Debug
 
 # Build NEURON
 build:neuron:mod2c:intel:shared:
@@ -168,21 +168,21 @@ build:neuron:mod2c:nvhpc:acc:shared:
   extends: [.build_neuron, .spack_nvhpc]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
-build:neuron:nmodl:nvhpc:acc:legacy:
+build:neuron:nmodl:nvhpc:acc:debug:legacy:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:legacy"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"]
 
-build:neuron:nmodl:nvhpc:acc:shared:debug:
+build:neuron:nmodl:nvhpc:acc:shared:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"]
 
-build:neuron:nmodl:nvhpc:omp:debug:legacy:
+build:neuron:nmodl:nvhpc:omp:legacy:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug:legacy"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"]
 
-build:neuron:nmodl:nvhpc:omp:
+build:neuron:nmodl:nvhpc:omp:debug:
   extends: [.build_neuron, .spack_nvhpc]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]
 
 # Test CoreNEURON
 test:coreneuron:mod2c:intel:shared:
@@ -205,21 +205,21 @@ test:coreneuron:mod2c:nvhpc:acc:shared:
   extends: [.ctest, .gpu_node]
   needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]
 
-test:coreneuron:nmodl:nvhpc:acc:legacy:
+test:coreneuron:nmodl:nvhpc:acc:debug:legacy:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:legacy"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"]
 
-test:coreneuron:nmodl:nvhpc:acc:shared:debug:
+test:coreneuron:nmodl:nvhpc:acc:shared:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared:debug"]
+  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"]
 
-test:coreneuron:nmodl:nvhpc:omp:debug:legacy:
+test:coreneuron:nmodl:nvhpc:omp:legacy:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug:legacy"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"]
 
-test:coreneuron:nmodl:nvhpc:omp:
+test:coreneuron:nmodl:nvhpc:omp:debug:
   extends: [.ctest, .gpu_node]
-  needs: ["build:coreneuron:nmodl:nvhpc:omp"]
+  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]
 
 # Test NEURON
 test:neuron:mod2c:intel:shared:
@@ -238,18 +238,18 @@ test:neuron:mod2c:nvhpc:acc:shared:
   extends: [.test_neuron, .gpu_node]
   needs: ["build:neuron:mod2c:nvhpc:acc:shared"]
 
-test:neuron:nmodl:nvhpc:acc:legacy:
+test:neuron:nmodl:nvhpc:acc:debug:legacy:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:nmodl:nvhpc:acc:legacy"]
+  needs: ["build:neuron:nmodl:nvhpc:acc:debug:legacy"]
 
-test:neuron:nmodl:nvhpc:acc:shared:debug:
+test:neuron:nmodl:nvhpc:acc:shared:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:nmodl:nvhpc:acc:shared:debug"]
+  needs: ["build:neuron:nmodl:nvhpc:acc:shared"]
 
-test:neuron:nmodl:nvhpc:omp:debug:legacy:
+test:neuron:nmodl:nvhpc:omp:legacy:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:nmodl:nvhpc:omp:debug:legacy"]
+  needs: ["build:neuron:nmodl:nvhpc:omp:legacy"]
 
-test:neuron:nmodl:nvhpc:omp:
+test:neuron:nmodl:nvhpc:omp:debug:
   extends: [.test_neuron, .gpu_node]
-  needs: ["build:neuron:nmodl:nvhpc:omp"]
+  needs: ["build:neuron:nmodl:nvhpc:omp:debug"]

From 08031b2cccde0eb2b4d8bd53975c83731647c077 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 13:33:53 +0200
Subject: [PATCH 120/128] fixup

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d7ba6c74f..dd9adc325 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -112,6 +112,7 @@ build:coreneuron:nmodl:intel:shared:
   extends: [.build_coreneuron, .spack_intel]
   needs: ["build:nmodl"]
   variables:
+    SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc
     SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl+openmp+shared+sympy+tests~unified build_type=RelWithDebInfo
 
 # Not linked to a NEURON build+test job, see

From aa893f6c02f61f3efdc45d28146e63c7eeed9b10 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 14:25:23 +0200
Subject: [PATCH 121/128] disable OpenMP in CPU builds

---
 .gitlab-ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dd9adc325..a6073578d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -99,13 +99,13 @@ build:nmodl:
 build:coreneuron:mod2c:intel:shared:
   extends: [.build_coreneuron, .spack_intel]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl+openmp+shared+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=RelWithDebInfo
 
 build:coreneuron:nmodl:intel:debug:legacy:
   extends: [.build_coreneuron, .spack_intel]
   needs: ["build:nmodl"]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=Debug
+    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug
 
 # Disable caliper to improve coverage
 build:coreneuron:nmodl:intel:shared:
@@ -113,7 +113,7 @@ build:coreneuron:nmodl:intel:shared:
   needs: ["build:nmodl"]
   variables:
     SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc
-    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl+openmp+shared+sympy+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=RelWithDebInfo
 
 # Not linked to a NEURON build+test job, see
 # https://github.com/BlueBrain/CoreNeuron/issues/594

From f71b6fd57624e7fd3c46ddcc324b29f5940d66de Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 17:04:26 +0200
Subject: [PATCH 122/128] submodules

---
 external/mod2c | 2 +-
 external/nmodl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/mod2c b/external/mod2c
index ec96803a3..626ffc202 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit ec96803a3ec34bab63cc0e00b6cc85581eacd403
+Subproject commit 626ffc2024872b7d48ecca95786b97d707898317
diff --git a/external/nmodl b/external/nmodl
index 6b43a20be..1265e4a84 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 6b43a20be76fce6b144e4324c574301c1f087387
+Subproject commit 1265e4a84b699cac10668db5ca59f7054c9f1f51

From b214ad0c44ea99e250e47e0287e909f8f7298a97 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 17:16:01 +0200
Subject: [PATCH 123/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 1265e4a84..c7891c5e7 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 1265e4a84b699cac10668db5ca59f7054c9f1f51
+Subproject commit c7891c5e73a857aab96554d668a75eb16316a992

From f53901ea120d0a30962bad527701aacff0308c2f Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 17:19:20 +0200
Subject: [PATCH 124/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index c7891c5e7..4eaad0be8 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit c7891c5e73a857aab96554d668a75eb16316a992
+Subproject commit 4eaad0be8922de50c4b4a0444b72393361c7a998

From c647a08c7afd9a8af48e2ac14df8b7516a8a46b7 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Thu, 25 Aug 2022 17:33:42 +0200
Subject: [PATCH 125/128] submodule

---
 external/nmodl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/nmodl b/external/nmodl
index 4eaad0be8..0274a4c6e 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 4eaad0be8922de50c4b4a0444b72393361c7a998
+Subproject commit 0274a4c6e87d71fb161f21aea8d236a2a57d3fd1

From 2857e2d71089a6899e3c43a9865acd7a4ed304f3 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 26 Aug 2022 10:09:53 +0200
Subject: [PATCH 126/128] make all intel builds debug

---
 .gitlab-ci.yml | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a6073578d..b8fad911f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,10 +96,13 @@ build:nmodl:
     # TODO: fix this more robustly so we don't have to play so many games.
     SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc ^caliper%gcc+cuda cuda_arch=70
 
-build:coreneuron:mod2c:intel:shared:
+# TODO: improve coverage by switching an Intel build to be statically linked
+# TODO: improve coverage by switching an Intel build to RelWithDebInfo
+# TODO: improve coverage by enabling +openmp on an Intel build
+build:coreneuron:mod2c:intel:shared:debug:
   extends: [.build_coreneuron, .spack_intel]
   variables:
-    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=Debug
 
 build:coreneuron:nmodl:intel:debug:legacy:
   extends: [.build_coreneuron, .spack_intel]
@@ -108,12 +111,12 @@ build:coreneuron:nmodl:intel:debug:legacy:
     SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug
 
 # Disable caliper to improve coverage
-build:coreneuron:nmodl:intel:shared:
+build:coreneuron:nmodl:intel:shared:debug:
   extends: [.build_coreneuron, .spack_intel]
   needs: ["build:nmodl"]
   variables:
     SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc
-    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=RelWithDebInfo
+    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=Debug
 
 # Not linked to a NEURON build+test job, see
 # https://github.com/BlueBrain/CoreNeuron/issues/594
@@ -153,17 +156,17 @@ build:coreneuron:nmodl:nvhpc:omp:debug:
     SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=Debug
 
 # Build NEURON
-build:neuron:mod2c:intel:shared:
+build:neuron:mod2c:intel:shared:debug:
   extends: [.build_neuron, .spack_intel]
-  needs: ["build:coreneuron:mod2c:intel:shared"]
+  needs: ["build:coreneuron:mod2c:intel:shared:debug"]
 
 build:neuron:nmodl:intel:debug:legacy:
   extends: [.build_neuron, .spack_intel]
   needs: ["build:coreneuron:nmodl:intel:debug:legacy"]
 
-build:neuron:nmodl:intel:shared:
+build:neuron:nmodl:intel:shared:debug:
   extends: [.build_neuron, .spack_intel]
-  needs: ["build:coreneuron:nmodl:intel:shared"]
+  needs: ["build:coreneuron:nmodl:intel:shared:debug"]
 
 build:neuron:mod2c:nvhpc:acc:shared:
   extends: [.build_neuron, .spack_nvhpc]
@@ -186,17 +189,17 @@ build:neuron:nmodl:nvhpc:omp:debug:
   needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]
 
 # Test CoreNEURON
-test:coreneuron:mod2c:intel:shared:
+test:coreneuron:mod2c:intel:shared:debug:
   extends: [.ctest]
-  needs: ["build:coreneuron:mod2c:intel:shared"]
+  needs: ["build:coreneuron:mod2c:intel:shared:debug"]
 
 test:coreneuron:nmodl:intel:debug:legacy:
   extends: [.ctest]
   needs: ["build:coreneuron:nmodl:intel:debug:legacy"]
 
-test:coreneuron:nmodl:intel:shared:
+test:coreneuron:nmodl:intel:shared:debug:
   extends: [.ctest]
-  needs: ["build:coreneuron:nmodl:intel:shared"]
+  needs: ["build:coreneuron:nmodl:intel:shared:debug"]
 
 test:coreneuron:mod2c:nvhpc:acc:debug:unified:
   extends: [.ctest, .gpu_node]
@@ -223,17 +226,17 @@ test:coreneuron:nmodl:nvhpc:omp:debug:
   needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]
 
 # Test NEURON
-test:neuron:mod2c:intel:shared:
+test:neuron:mod2c:intel:shared:debug:
   extends: [.test_neuron]
-  needs: ["build:neuron:mod2c:intel:shared"]
+  needs: ["build:neuron:mod2c:intel:shared:debug"]
 
 test:neuron:nmodl:intel:debug:legacy:
   extends: [.test_neuron]
   needs: ["build:neuron:nmodl:intel:debug:legacy"]
 
-test:neuron:nmodl:intel:shared:
+test:neuron:nmodl:intel:shared:debug:
   extends: [.test_neuron]
-  needs: ["build:neuron:nmodl:intel:shared"]
+  needs: ["build:neuron:nmodl:intel:shared:debug"]
 
 test:neuron:mod2c:nvhpc:acc:shared:
   extends: [.test_neuron, .gpu_node]

From 0e2475593ef020606ca95b8a04f84ea0d771ee87 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Fri, 26 Aug 2022 14:59:27 +0200
Subject: [PATCH 127/128] keep some optimisation of partial_piv_lu.cpp in debug
 builds

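In NVHPC GPU Debug builds, -gpu=debug makes the Eigen functions in this file
extremely slow, so compile it with "-gpu=lineinfo,nodebug -O1" instead.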
---
 coreneuron/CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index c4143b48c..97d12e613 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -120,6 +120,14 @@ if(CORENRN_ENABLE_GPU)
   # OpenACC/OpenMP annotations.
   if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
     list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
+    if(CORENRN_ENABLE_GPU
+       AND CORENRN_HAVE_NVHPC_COMPILER
+       AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+      # In this case OpenAccHelper.cmake passes -gpu=debug, which makes these Eigen functions
+      # extremely slow. Downgrade that to -gpu=lineinfo for this file.
+      set_source_files_properties(${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp
+                                  PROPERTIES COMPILE_FLAGS "-gpu=lineinfo,nodebug -O1")
+    endif()
   endif()
 endif()
 

From 567bd21cc895460c3e176d71d24dd82a549835a9 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar <pramod.s.kumbhar@gmail.com>
Date: Sun, 28 Aug 2022 13:53:07 +0200
Subject: [PATCH 128/128] update nmodl and mod2c submodule

---
 external/mod2c | 2 +-
 external/nmodl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/mod2c b/external/mod2c
index 626ffc202..469c74dc7 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 626ffc2024872b7d48ecca95786b97d707898317
+Subproject commit 469c74dc7d96bbc5a06a42696422154b4cd2ce28
diff --git a/external/nmodl b/external/nmodl
index 0274a4c6e..4f45a1c8a 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 0274a4c6e87d71fb161f21aea8d236a2a57d3fd1
+Subproject commit 4f45a1c8a9b99c64127ea795eb12952e754b775c