From 4a25a6bb789b84456e9f220d5978b9c10bcbdf49 Mon Sep 17 00:00:00 2001 From: Christos Kotsalos Date: Fri, 12 Aug 2022 16:23:48 +0200 Subject: [PATCH 1/9] Fixing race condition in cell permute 2 with NVHPC 22.5 / 22.7 (#847) * Fixing race condition in cell permute 2 : OpenACC [performance optimization] * Update Unit Test / Documentation --- coreneuron/permute/cellorder.cpp | 56 +++++++++++++------------------ tests/unit/solver/test_solver.cpp | 12 +++++-- 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 14feb31de..c95fedcf2 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -478,13 +478,12 @@ static void bksub_interleaved(NrnThread* nt, } // icore ranges [0:warpsize) ; stride[ncycle] +nrn_pragma_acc(routine vector) static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) { int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -#ifndef CORENEURON_ENABLE_GPU int ii = i; -#endif // execute until all tree depths are executed bool has_subtrees_to_compute = true; @@ -492,11 +491,11 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // clang-format off nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel + nrn_pragma_acc(loop vector) + nrn_pragma_omp(loop bind(parallel)) for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; -#endif if (icore < istride) { // most efficient if istride equal warpsize // what is the index int ip = GPU_PARENT(i); @@ -508,9 +507,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -#ifndef CORENEURON_ENABLE_GPU } -#endif // if finished with all tree depths then ready to break // (note that break is not allowed 
in OpenACC) if (icycle == 0) { @@ -520,14 +517,12 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -#ifndef CORENEURON_ENABLE_GPU ii -= istride; -#endif } - // clang-format on } // icore ranges [0:warpsize) ; stride[ncycle] +nrn_pragma_acc(routine vector) static void bksub_interleaved2(NrnThread* nt, int root, int lastroot, @@ -535,36 +530,29 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -#ifndef CORENEURON_ENABLE_GPU - for (int i = root; i < lastroot; i += 1) { -#else nrn_pragma_acc(loop seq) - for (int i = root; i < lastroot; i += warpsize) { -#endif + for (int i = root; i < lastroot; i += 1) { GPU_RHS(i) /= GPU_D(i); // the root } int i = firstnode + icore; -#ifndef CORENEURON_ENABLE_GPU int ii = i; -#endif + nrn_pragma_acc(loop seq) for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel + nrn_pragma_acc(loop vector) + nrn_pragma_omp(loop bind(parallel)) for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; -#endif if (icore < istride) { int ip = GPU_PARENT(i); GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); GPU_RHS(i) /= GPU_D(i); } i += istride; -#ifndef CORENEURON_ENABLE_GPU } ii += istride; -#endif } } @@ -600,15 +588,24 @@ void solve_interleaved2(int ith) { defined(_OPENACC) int nstride = stridedispl[nwarp]; #endif - nrn_pragma_acc(parallel loop gang vector vector_length( - warpsize) present(nt [0:1], + /* If we compare this loop with the one from cellorder.cu (CUDA version), we will understand + * that the parallelism here is exposed in steps, while in the CUDA version all the parallelism + * is exposed from the very beginning of the loop. In more details, here we initially distribute + * the outermost loop, e.g. 
in the CUDA blocks, and for the innermost loops we explicitly use multiple + * threads for the parallelization (see for example the loop directives in triang/bksub_interleaved2). + * On the other hand, in the CUDA version the outermost loop is distributed to all the available threads, + * and therefore there is no need to have the innermost loops. Here, the loop/icore jumps every warpsize, + * while in the CUDA version the icore increases by one. Other than this, the two loop versions + * are equivalent (same results). + */ + nrn_pragma_acc(parallel loop gang present(nt [0:1], strides [0:nstride], ncycles [0:nwarp], stridedispl [0:nwarp + 1], rootbegin [0:nwarp + 1], nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) - for (int icore = 0; icore < ncore; ++icore) { + nrn_pragma_omp(target teams loop if(nt->compute_gpu)) + for (int icore = 0; icore < ncore; icore += warpsize) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask int ncycle = ncycles[iwarp]; @@ -617,14 +614,9 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -#ifndef CORENEURON_ENABLE_GPU - if (ic == 0) { // serial test mode. 
triang and bksub do all cores in warp -#endif - triang_interleaved2(nt, ic, ncycle, stride, lastnode); - bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -#ifndef CORENEURON_ENABLE_GPU - } // serial test mode -#endif + + triang_interleaved2(nt, ic, ncycle, stride, lastnode); + bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); } nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC diff --git a/tests/unit/solver/test_solver.cpp b/tests/unit/solver/test_solver.cpp index 4c47913ba..6511f03e1 100644 --- a/tests/unit/solver/test_solver.cpp +++ b/tests/unit/solver/test_solver.cpp @@ -33,12 +33,12 @@ struct SolverData { constexpr auto magic_index_value = -2; constexpr auto magic_double_value = std::numeric_limits::lowest(); -// TODO: check out adding a CellPermute2_CPU version? enum struct SolverImplementation { CellPermute0_CPU, CellPermute0_GPU, CellPermute1_CPU, CellPermute1_GPU, + CellPermute2_CPU, CellPermute2_GPU, CellPermute2_CUDA }; @@ -52,6 +52,8 @@ std::ostream& operator<<(std::ostream& os, SolverImplementation impl) { return os << "SolverImplementation::CellPermute1_CPU"; } else if (impl == SolverImplementation::CellPermute1_GPU) { return os << "SolverImplementation::CellPermute1_GPU"; + } else if (impl == SolverImplementation::CellPermute2_CPU) { + return os << "SolverImplementation::CellPermute2_CPU"; } else if (impl == SolverImplementation::CellPermute2_GPU) { return os << "SolverImplementation::CellPermute2_GPU"; } else if (impl == SolverImplementation::CellPermute2_CUDA) { @@ -91,8 +93,11 @@ struct SetupThreads { break; case SolverImplementation::CellPermute2_CUDA: corenrn_param.cuda_interface = true; + [[fallthrough]]; case SolverImplementation::CellPermute2_GPU: corenrn_param.gpu = true; + [[fallthrough]]; + case SolverImplementation::CellPermute2_CPU: interleave_permute_type = 2; break; } @@ -259,9 +264,10 @@ auto solve_and_dump(Args&&... 
args) { } auto active_implementations() { - // These two are always available + // These are always available std::vector ret{SolverImplementation::CellPermute0_CPU, - SolverImplementation::CellPermute1_CPU}; + SolverImplementation::CellPermute1_CPU, + SolverImplementation::CellPermute2_CPU}; #ifdef CORENEURON_ENABLE_GPU // Consider making these steerable via a runtime switch in GPU builds ret.push_back(SolverImplementation::CellPermute0_GPU); From d6507e20ffdcf652b07e231347174f8f9c89e32a Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Thu, 25 Aug 2022 07:53:07 +0200 Subject: [PATCH 2/9] Use last hpc-coding-conventions formatter script (#850) Co-authored-by: Tristan Carel --- .bbp-project.yaml | 8 +++++++ .../workflows/clang_cmake_format_check.yaml | 23 ++++--------------- .gitignore | 1 + CMake/hpc-coding-conventions | 2 +- CMake/packages/FindSphinx.cmake | 11 ++++----- CMake/packages/Findlikwid.cmake | 5 ++-- CMake/packages/Findnmodl.cmake | 13 +++++++---- CMake/packages/Findreportinglib.cmake | 8 ++++--- CMakeLists.txt | 11 --------- 9 files changed, 36 insertions(+), 46 deletions(-) create mode 100644 .bbp-project.yaml diff --git a/.bbp-project.yaml b/.bbp-project.yaml new file mode 100644 index 000000000..d523b219f --- /dev/null +++ b/.bbp-project.yaml @@ -0,0 +1,8 @@ +tools: + ClangFormat: + enable: True + include: + match: + - coreneuron/.*\.((cu)|(h)|([chi]pp))$ + CMakeFormat: + enable: True diff --git a/.github/workflows/clang_cmake_format_check.yaml b/.github/workflows/clang_cmake_format_check.yaml index 8725fa9c1..1494941cc 100644 --- a/.github/workflows/clang_cmake_format_check.yaml +++ b/.github/workflows/clang_cmake_format_check.yaml @@ -14,24 +14,11 @@ jobs: steps: - name: Fetch repository uses: actions/checkout@v3 - - name: Install clang-format 11 - run: | - sudo apt-get update - sudo apt-get install clang-format-11 python3-pip libboost-all-dev libopenmpi-dev openmpi-bin - - name: Install cmake-format 0.6.13 - run: python3 -m pip install 
cmake-format==0.6.13 - - name: Configure + - name: Fetch hpc-coding-conventions submodules shell: bash working-directory: ${{runner.workspace}}/CoreNeuron - run: | - export PATH=/home/runner/.local/bin:$PATH - mkdir BUILD && cd BUILD - cmake -DCORENRN_CLANG_FORMAT=ON -DCORENRN_CMAKE_FORMAT=ON -DCORENRN_ENABLE_MPI=ON -DCORENRN_ENABLE_OPENMP=OFF -DClangFormat_EXECUTABLE=$(which clang-format-11) -DCMakeFormat_EXECUTABLE=$(which cmake-format) .. - - name: Run clang-format + run: git submodule update --init --depth 1 -- CMake/hpc-coding-conventions + - name: Run clang-format and cmake-format shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron/BUILD - run: make check-clang-format VERBOSE=1 - - name: Run cmake-format - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron/BUILD - run: make check-cmake-format + working-directory: ${{runner.workspace}}/CoreNeuron + run: CMake/hpc-coding-conventions/bin/format -v --dry-run diff --git a/.gitignore b/.gitignore index e6c1ba624..e0813fb15 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ spconfig.* .clang-tidy .cmake-format.yaml .pre-commit-config.yaml +.bbp-project-venv/ diff --git a/CMake/hpc-coding-conventions b/CMake/hpc-coding-conventions index 5d4bcd2d4..939463f17 160000 --- a/CMake/hpc-coding-conventions +++ b/CMake/hpc-coding-conventions @@ -1 +1 @@ -Subproject commit 5d4bcd2d410e67bdc1d23d3280c08ee5c9df943b +Subproject commit 939463f1722352a376a2474ad33c75e59c0c388c diff --git a/CMake/packages/FindSphinx.cmake b/CMake/packages/FindSphinx.cmake index 2ffc285bf..f0701cd44 100644 --- a/CMake/packages/FindSphinx.cmake +++ b/CMake/packages/FindSphinx.cmake @@ -4,12 +4,11 @@ # See top-level LICENSE file for details. 
# ============================================================================= -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - DOC "/path/to/sphinx-build") +find_program( + SPHINX_EXECUTABLE + NAMES sphinx-build + DOC "/path/to/sphinx-build") include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Sphinx - "Failed to find sphinx-build executable" - SPHINX_EXECUTABLE) \ No newline at end of file +find_package_handle_standard_args(Sphinx "Failed to find sphinx-build executable" SPHINX_EXECUTABLE) diff --git a/CMake/packages/Findlikwid.cmake b/CMake/packages/Findlikwid.cmake index 28f3da748..c2b4aa0ba 100644 --- a/CMake/packages/Findlikwid.cmake +++ b/CMake/packages/Findlikwid.cmake @@ -4,6 +4,7 @@ # See top-level LICENSE file for details. # ============================================================================= +# ~~~ # Findlikwid # ------------- # @@ -26,6 +27,7 @@ # likwid_FOUND - set to true if the library is found # likwid_INCLUDE - list of required include directories # likwid_LIBRARIES - list of required library directories +# ~~~ find_path(likwid_INCLUDE_DIRS "likwid.h" HINTS "${LIKWID_DIR}/include") find_library(likwid_LIBRARIES likwid HINTS "${LIKWID_DIR}/lib") @@ -33,5 +35,4 @@ find_library(likwid_LIBRARIES likwid HINTS "${LIKWID_DIR}/lib") # Checks 'REQUIRED', 'QUIET' and versions. include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(likwid - REQUIRED_VARS likwid_INCLUDE_DIRS likwid_LIBRARIES) +find_package_handle_standard_args(likwid REQUIRED_VARS likwid_INCLUDE_DIRS likwid_LIBRARIES) diff --git a/CMake/packages/Findnmodl.cmake b/CMake/packages/Findnmodl.cmake index b7b73fdbb..b76a3c2ec 100644 --- a/CMake/packages/Findnmodl.cmake +++ b/CMake/packages/Findnmodl.cmake @@ -4,6 +4,7 @@ # See top-level LICENSE file for details. 
# ============================================================================= +# ~~~ # Findnmodl # ------------- # @@ -26,11 +27,13 @@ # nmodl_FOUND - set to true if the library is found # nmodl_INCLUDE - list of required include directories # nmodl_BINARY - the nmodl binary - +# ~~~ # UNIX paths are standard, no need to write. -find_program(nmodl_BINARY NAMES nmodl${CMAKE_EXECUTABLE_SUFFIX} - HINTS "${CORENRN_NMODL_DIR}/bin" QUIET) +find_program( + nmodl_BINARY + NAMES nmodl${CMAKE_EXECUTABLE_SUFFIX} + HINTS "${CORENRN_NMODL_DIR}/bin" QUIET) find_path(nmodl_INCLUDE "nmodl/fast_math.ispc" HINTS "${CORENRN_NMODL_DIR}/include") find_path(nmodl_PYTHONPATH "nmodl/__init__.py" HINTS "${CORENRN_NMODL_DIR}/lib") @@ -38,7 +41,7 @@ find_path(nmodl_PYTHONPATH "nmodl/__init__.py" HINTS "${CORENRN_NMODL_DIR}/lib") # Checks 'REQUIRED', 'QUIET' and versions. include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(nmodl +find_package_handle_standard_args( + nmodl FOUND_VAR nmodl_FOUND REQUIRED_VARS nmodl_BINARY nmodl_INCLUDE nmodl_PYTHONPATH) - diff --git a/CMake/packages/Findreportinglib.cmake b/CMake/packages/Findreportinglib.cmake index 9c457327f..6755d57fd 100644 --- a/CMake/packages/Findreportinglib.cmake +++ b/CMake/packages/Findreportinglib.cmake @@ -4,6 +4,7 @@ # See top-level LICENSE file for details. # ============================================================================= +# ~~~ # Findreportinglib # ------------- # @@ -26,17 +27,18 @@ # reportinglib_FOUND - set to true if the library is found # reportinglib_INCLUDE_DIRS - list of required include directories # reportinglib_LIBRARIES - list of libraries to be linked +# ~~~ # UNIX paths are standard, no need to write. 
find_path(reportinglib_INCLUDE_DIR reportinglib/Report.h) find_library(reportinglib_LIBRARY reportinglib) get_filename_component(reportinglib_LIB_DIR ${reportinglib_LIBRARY} DIRECTORY) -find_program (reportinglib_somaDump somaDump ${reportinglib_LIB_DIR}/../bin) +find_program(reportinglib_somaDump somaDump ${reportinglib_LIB_DIR}/../bin) # Checks 'REQUIRED', 'QUIET' and versions. include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(reportinglib +find_package_handle_standard_args( + reportinglib FOUND_VAR reportinglib_FOUND REQUIRED_VARS reportinglib_INCLUDE_DIR reportinglib_LIBRARY reportinglib_LIB_DIR) - diff --git a/CMakeLists.txt b/CMakeLists.txt index cb1c96b6c..eda8f2feb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,17 +85,6 @@ list(APPEND CMAKE_MODULE_PATH ${CORENEURON_PROJECT_SOURCE_DIR}/CMake # ============================================================================= set(CODING_CONV_PREFIX "CORENRN") set(CORENRN_3RDPARTY_DIR "external") -# Adds .cu with respect to the current default in hpc-coding-conventions, and drops various patterns -# that don't match anything in CoreNEURON. 
-set(CORENRN_ClangFormat_FILES_RE - "^.*\\\\.cu$$" "^.*\\\\.h$$" "^.*\\\\.[chi]pp$$" - CACHE STRING "List of regular expressions matching C/C++ filenames" FORCE) -set(CORENRN_ClangFormat_EXCLUDES_RE - "" - CACHE STRING "list of regular expressions to exclude C/C++ files from formatting" FORCE) -set(CORENRN_CMakeFormat_EXCLUDES_RE - "CMake/packages/.*$$" - CACHE STRING "list of regular expressions to exclude CMake files from formatting" FORCE) include(AddHpcCodingConvSubmodule) add_subdirectory(CMake/hpc-coding-conventions/cpp) From 12272f80343871a79b5c8a8a42c0096ce7f42edc Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 28 Aug 2022 18:13:24 +0200 Subject: [PATCH 3/9] Support for shared libraries in GPU execution (python launch support) (#795) * coreneuron and mechanism library can be built as shared and it enables launching coreneuron on GPU via python * update MOD2C and NMODL fixes to handle GLOBAL variables See BlueBrain/mod2c/pull/78 See BlueBrain/nmodl/pull/904 * removed acc/openmp global annotations for celsius, pi and secondorder and they don't need to be copied on GPU * Pass Memb_list* as an argument for all common prototypes in order to support global variables via argument * free ml->instance if not empty * add link to libscopmath in neuron as well * nrn_ghk is now declared inline. * homegrown present table to avoid dynamic loading + acc_deviceptr limitations * use -gpu=nordc and make #pragma acc routine seq functions inline * drop -lscopmath as its folded in elsewhere * random123 header reorganisation * try and cleanup CLI11 handling. * try and consolidate build logic * some CORENEURON_ -> CORENRN_ for consistency. * export OpenACC flags to NEURON separately as well as part of the whole ... -lcoreneuron ... link line. * libcoreneuron.so -> libcorenrnmech.so, try and fix static builds * do not enable OpenMP in shared/OpenACC builds. * add rpaths inside nrnivmodl-core. 
* accept a private destructor function pointer from generated mechanisms * drop ${TEST_EXEC_PREFIX} that was causing simple tests to be executed on many ranks. * CORENEURON_GPU_DEBUG: add environment variable that enables cnrn_target_* debug messages. fixes #141 Co-authored-by: Olli Lupton --- .gitlab-ci.yml | 200 ++++++++++------ CMake/MakefileBuildOptions.cmake | 143 +++++++++--- CMake/OpenAccHelper.cmake | 44 ++-- CMake/coreneuron-config.cmake.in | 6 +- CMakeLists.txt | 100 +++----- coreneuron/CMakeLists.txt | 218 +++++++++--------- coreneuron/apps/corenrn_parameters.cpp | 21 +- coreneuron/apps/corenrn_parameters.hpp | 36 ++- coreneuron/apps/main1.cpp | 26 ++- coreneuron/gpu/nrn_acc_manager.cpp | 215 ++++++++++++++++- coreneuron/gpu/nrn_acc_manager.hpp | 13 +- coreneuron/io/core2nrn_data_return.cpp | 16 +- coreneuron/io/global_vars.cpp | 2 +- coreneuron/io/nrn2core_data_init.cpp | 3 +- coreneuron/io/nrn_checkpoint.cpp | 6 +- coreneuron/io/nrn_setup.cpp | 8 + coreneuron/io/phase2.cpp | 6 +- coreneuron/mechanism/capac.cpp | 10 +- coreneuron/mechanism/eion.cpp | 76 +----- .../mechanism/mech/mod2c_core_thread.hpp | 8 +- coreneuron/mechanism/mechanism.hpp | 8 +- coreneuron/mechanism/membfunc.hpp | 84 +++++-- coreneuron/mechanism/nrnoc_ml.ispc | 4 +- coreneuron/mechanism/patternstim.cpp | 15 +- coreneuron/mechanism/register_mech.cpp | 22 +- coreneuron/mechanism/register_mech.hpp | 2 +- .../mpi/core/{nrnmpi.cpp => resolve.cpp} | 0 coreneuron/mpi/nrnmpi.h | 2 +- coreneuron/network/cvodestb.cpp | 12 - coreneuron/network/partrans.cpp | 19 +- coreneuron/nrnconf.h | 8 - coreneuron/nrnoc/nrnunits_modern.h | 36 --- coreneuron/sim/fast_imem.cpp | 8 +- coreneuron/sim/multicore.cpp | 9 +- coreneuron/sim/multicore.hpp | 15 +- coreneuron/sim/scopmath/errcodes.h | 39 +++- coreneuron/sim/scopmath/sparse_thread.hpp | 38 +-- coreneuron/utils/ispc/globals.cpp | 17 -- coreneuron/utils/memory.cpp | 2 +- coreneuron/utils/memory.h | 12 +- coreneuron/utils/nrnoc_aux.hpp | 5 - 
coreneuron/utils/offload.hpp | 135 +++++++++-- coreneuron/utils/randoms/nrnran123.cpp | 152 +++++------- coreneuron/utils/randoms/nrnran123.h | 113 ++++++--- coreneuron/utils/units.hpp | 38 +++ external/mod2c | 2 +- external/nmodl | 2 +- extra/nrnivmodl_core_makefile.in | 40 ++-- tests/CMakeLists.txt | 6 +- tests/integration/CMakeLists.txt | 55 ++--- tests/unit/alignment/CMakeLists.txt | 6 +- tests/unit/cmdline_interface/CMakeLists.txt | 12 +- .../test_cmdline_interface.cpp | 2 +- tests/unit/interleave_info/CMakeLists.txt | 11 +- tests/unit/lfp/CMakeLists.txt | 16 +- tests/unit/queueing/CMakeLists.txt | 10 +- tests/unit/solver/CMakeLists.txt | 14 +- tests/unit/solver/test_solver.cpp | 4 +- 58 files changed, 1287 insertions(+), 845 deletions(-) rename coreneuron/mpi/core/{nrnmpi.cpp => resolve.cpp} (100%) delete mode 100644 coreneuron/nrnoc/nrnunits_modern.h delete mode 100644 coreneuron/utils/ispc/globals.cpp create mode 100644 coreneuron/utils/units.hpp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 737e867f0..b8fad911f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -81,24 +81,10 @@ build:nmodl: SPACK_PACKAGE_COMPILER: gcc # Build CoreNEURON -build:coreneuron:mod2c:nvhpc:acc: - extends: [.build, .spack_nvhpc] - variables: - SPACK_PACKAGE: coreneuron - # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit build_type=RelWithDebInfo - -# Build CoreNEURON with Unified Memory on GPU -build:coreneuron:mod2c:nvhpc:acc:unified: - extends: [.build, .spack_nvhpc] - variables: - SPACK_PACKAGE: coreneuron - # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +gpu+unified+openmp+tests~legacy-unit build_type=RelWithDebInfo - -.build_coreneuron_nmodl: +.build_coreneuron: extends: [.build] variables: + SPACK_PACKAGE: coreneuron # NEURON depends on py-mpi4py, most of whose dependencies are pulled in by # nmodl%gcc, with the exception 
of MPI, which is pulled in by # coreneuron%{nvhpc,intel}. hpe-mpi is an external package anyway, so @@ -106,102 +92,168 @@ build:coreneuron:mod2c:nvhpc:acc:unified: # dependency graph and not changing which installation is used, but this # means that in the NEURON step an existing py-mpi4py%gcc can be used. # Otherwise a new py-mpi4py with hpe-mpi%{nvhpc,intel} will be built. + # caliper: papi%nvhpc does not build; use the caliper from the deployment # TODO: fix this more robustly so we don't have to play so many games. - SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc + SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc ^caliper%gcc+cuda cuda_arch=70 -build:coreneuron:nmodl:nvhpc:omp: - extends: [.build_coreneuron_nmodl, .spack_nvhpc] +# TODO: improve coverage by switching an Intel build to be statically linked +# TODO: improve coverage by switching an Intel build to RelWithDebInfo +# TODO: improve coverage by enabling +openmp on an Intel build +build:coreneuron:mod2c:intel:shared:debug: + extends: [.build_coreneuron, .spack_intel] variables: - SPACK_PACKAGE: coreneuron - # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~sympy build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=Debug + +build:coreneuron:nmodl:intel:debug:legacy: + extends: [.build_coreneuron, .spack_intel] needs: ["build:nmodl"] + variables: + SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug -build:coreneuron:nmodl:nvhpc:acc: - extends: [.build_coreneuron_nmodl, .spack_nvhpc] +# Disable caliper to improve coverage +build:coreneuron:nmodl:intel:shared:debug: + extends: [.build_coreneuron, .spack_intel] + needs: ["build:nmodl"] variables: - SPACK_PACKAGE: coreneuron - # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - # Sympy + OpenMP target offload does not currently 
work with NVHPC - SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit+sympy build_type=RelWithDebInfo + SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc + SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=Debug + +# Not linked to a NEURON build+test job, see +# https://github.com/BlueBrain/CoreNeuron/issues/594 +build:coreneuron:mod2c:nvhpc:acc:debug:unified: + extends: [.build_coreneuron, .spack_nvhpc] + variables: + SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl+openmp~shared+tests+unified build_type=Debug + +# Shared + OpenACC + OpenMP host threading has problems +build:coreneuron:mod2c:nvhpc:acc:shared: + extends: [.build_coreneuron, .spack_nvhpc] + variables: + SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=RelWithDebInfo + +build:coreneuron:nmodl:nvhpc:acc:debug:legacy: + extends: [.build_coreneuron, .spack_nvhpc] needs: ["build:nmodl"] + variables: + SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug -build:coreneuron:mod2c:intel: - extends: [.build, .spack_intel] +build:coreneuron:nmodl:nvhpc:acc:shared: + extends: [.build_coreneuron, .spack_nvhpc] + needs: ["build:nmodl"] variables: - SPACK_PACKAGE: coreneuron - SPACK_PACKAGE_SPEC: +tests~legacy-unit build_type=Debug + SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=RelWithDebInfo -build:coreneuron:nmodl:intel: - extends: [.build_coreneuron_nmodl, .spack_intel] +build:coreneuron:nmodl:nvhpc:omp:legacy: + extends: [.build_coreneuron, .spack_nvhpc] + needs: ["build:nmodl"] variables: - SPACK_PACKAGE: coreneuron - SPACK_PACKAGE_SPEC: +nmodl+tests~legacy-unit build_type=Debug + SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=RelWithDebInfo + +build:coreneuron:nmodl:nvhpc:omp:debug: + extends: [.build_coreneuron, .spack_nvhpc] needs: ["build:nmodl"] + variables: + 
SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=Debug # Build NEURON -build:neuron:mod2c:nvhpc:acc: +build:neuron:mod2c:intel:shared:debug: + extends: [.build_neuron, .spack_intel] + needs: ["build:coreneuron:mod2c:intel:shared:debug"] + +build:neuron:nmodl:intel:debug:legacy: + extends: [.build_neuron, .spack_intel] + needs: ["build:coreneuron:nmodl:intel:debug:legacy"] + +build:neuron:nmodl:intel:shared:debug: + extends: [.build_neuron, .spack_intel] + needs: ["build:coreneuron:nmodl:intel:shared:debug"] + +build:neuron:mod2c:nvhpc:acc:shared: extends: [.build_neuron, .spack_nvhpc] - needs: ["build:coreneuron:mod2c:nvhpc:acc"] + needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"] -build:neuron:nmodl:nvhpc:omp: +build:neuron:nmodl:nvhpc:acc:debug:legacy: extends: [.build_neuron, .spack_nvhpc] - needs: ["build:coreneuron:nmodl:nvhpc:omp"] + needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"] -build:neuron:nmodl:nvhpc:acc: +build:neuron:nmodl:nvhpc:acc:shared: extends: [.build_neuron, .spack_nvhpc] - needs: ["build:coreneuron:nmodl:nvhpc:acc"] + needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"] -build:neuron:mod2c:intel: - extends: [.build_neuron, .spack_intel] - needs: ["build:coreneuron:mod2c:intel"] +build:neuron:nmodl:nvhpc:omp:legacy: + extends: [.build_neuron, .spack_nvhpc] + needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"] -build:neuron:nmodl:intel: - extends: [.build_neuron, .spack_intel] - needs: ["build:coreneuron:nmodl:intel"] +build:neuron:nmodl:nvhpc:omp:debug: + extends: [.build_neuron, .spack_nvhpc] + needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"] # Test CoreNEURON -test:coreneuron:mod2c:nvhpc:acc: +test:coreneuron:mod2c:intel:shared:debug: + extends: [.ctest] + needs: ["build:coreneuron:mod2c:intel:shared:debug"] + +test:coreneuron:nmodl:intel:debug:legacy: + extends: [.ctest] + needs: ["build:coreneuron:nmodl:intel:debug:legacy"] + +test:coreneuron:nmodl:intel:shared:debug: + extends: 
[.ctest] + needs: ["build:coreneuron:nmodl:intel:shared:debug"] + +test:coreneuron:mod2c:nvhpc:acc:debug:unified: extends: [.ctest, .gpu_node] - needs: ["build:coreneuron:mod2c:nvhpc:acc"] + needs: ["build:coreneuron:mod2c:nvhpc:acc:debug:unified"] -test:coreneuron:mod2c:nvhpc:acc:unified: +test:coreneuron:mod2c:nvhpc:acc:shared: extends: [.ctest, .gpu_node] - needs: ["build:coreneuron:mod2c:nvhpc:acc:unified"] + needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"] -test:coreneuron:nmodl:nvhpc:omp: +test:coreneuron:nmodl:nvhpc:acc:debug:legacy: extends: [.ctest, .gpu_node] - needs: ["build:coreneuron:nmodl:nvhpc:omp"] + needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"] -test:coreneuron:nmodl:nvhpc:acc: +test:coreneuron:nmodl:nvhpc:acc:shared: extends: [.ctest, .gpu_node] - needs: ["build:coreneuron:nmodl:nvhpc:acc"] + needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"] -test:coreneuron:mod2c:intel: - extends: [.ctest] - needs: ["build:coreneuron:mod2c:intel"] +test:coreneuron:nmodl:nvhpc:omp:legacy: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"] -test:coreneuron:nmodl:intel: - extends: [.ctest] - needs: ["build:coreneuron:nmodl:intel"] +test:coreneuron:nmodl:nvhpc:omp:debug: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"] # Test NEURON -test:neuron:mod2c:nvhpc:acc: +test:neuron:mod2c:intel:shared:debug: + extends: [.test_neuron] + needs: ["build:neuron:mod2c:intel:shared:debug"] + +test:neuron:nmodl:intel:debug:legacy: + extends: [.test_neuron] + needs: ["build:neuron:nmodl:intel:debug:legacy"] + +test:neuron:nmodl:intel:shared:debug: + extends: [.test_neuron] + needs: ["build:neuron:nmodl:intel:shared:debug"] + +test:neuron:mod2c:nvhpc:acc:shared: extends: [.test_neuron, .gpu_node] - needs: ["build:neuron:mod2c:nvhpc:acc"] + needs: ["build:neuron:mod2c:nvhpc:acc:shared"] -test:neuron:nmodl:nvhpc:omp: +test:neuron:nmodl:nvhpc:acc:debug:legacy: extends: [.test_neuron, .gpu_node] - needs: 
["build:neuron:nmodl:nvhpc:omp"] + needs: ["build:neuron:nmodl:nvhpc:acc:debug:legacy"] -test:neuron:nmodl:nvhpc:acc: +test:neuron:nmodl:nvhpc:acc:shared: extends: [.test_neuron, .gpu_node] - needs: ["build:neuron:nmodl:nvhpc:acc"] + needs: ["build:neuron:nmodl:nvhpc:acc:shared"] -test:neuron:mod2c:intel: - extends: [.test_neuron] - needs: ["build:neuron:mod2c:intel"] +test:neuron:nmodl:nvhpc:omp:legacy: + extends: [.test_neuron, .gpu_node] + needs: ["build:neuron:nmodl:nvhpc:omp:legacy"] -test:neuron:nmodl:intel: - extends: [.test_neuron] - needs: ["build:neuron:nmodl:intel"] +test:neuron:nmodl:nvhpc:omp:debug: + extends: [.test_neuron, .gpu_node] + needs: ["build:neuron:nmodl:nvhpc:omp:debug"] diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake index 7aef0c549..e4c658349 100644 --- a/CMake/MakefileBuildOptions.cmake +++ b/CMake/MakefileBuildOptions.cmake @@ -21,7 +21,7 @@ set(CMAKE_ISPC_FLAGS "${CMAKE_ISPC_FLAGS} --pic") set(NMODL_COMMON_ARGS "passes --inline") if(NOT "${CORENRN_NMODL_FLAGS}" STREQUAL "") - set(NMODL_COMMON_ARGS "${NMODL_COMMON_ARGS} ${CORENRN_NMODL_FLAGS}") + string(APPEND NMODL_COMMON_ARGS " ${CORENRN_NMODL_FLAGS}") endif() set(NMODL_CPU_BACKEND_ARGS "host --c") @@ -29,66 +29,135 @@ set(NMODL_ISPC_BACKEND_ARGS "host --ispc") set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc") # ============================================================================= -# Extract Compile definitions : common to all backend +# Construct the linker arguments that are used inside nrnivmodl-core (to build libcorenrnmech from +# libcoreneuron-core, libcoreneuron-cuda and mechanism object files) and inside nrnivmodl (to link +# NEURON's special against CoreNEURON's libcorenrnmech). These are stored in two global properties: +# CORENRN_LIB_LINK_FLAGS (used by NEURON/nrnivmodl to link special against CoreNEURON) and +# CORENRN_LIB_LINK_DEP_FLAGS (used by CoreNEURON/nrnivmodl-core to link libcorenrnmech.so). 
+# Conceptually: CORENRN_LIB_LINK_FLAGS = -lcorenrnmech $CORENRN_LIB_LINK_DEP_FLAGS # ============================================================================= -get_directory_property(COMPILE_DEFS COMPILE_DEFINITIONS) -if(COMPILE_DEFS) - set(CORENRN_COMMON_COMPILE_DEFS "") - foreach(flag ${COMPILE_DEFS}) - set(CORENRN_COMMON_COMPILE_DEFS "${CORENRN_COMMON_COMPILE_DEFS} -D${flag}") - endforeach() +if(NOT CORENRN_ENABLE_SHARED) + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--whole-archive") endif() +set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -lcorenrnmech") +if(NOT CORENRN_ENABLE_SHARED) + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--no-whole-archive") +endif() +# Essentially we "just" want to unpack the CMake dependencies of the `coreneuron-core` target into a +# plain string that we can bake into the Makefiles in both NEURON and CoreNEURON. +function(coreneuron_process_library_path library) + get_filename_component(library_dir "${library}" DIRECTORY) + if(NOT library_dir) + # In case target is not a target but is just the name of a library, e.g. "dl" + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${library}") + elseif("${library_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$") + # e.g. 
/usr/lib64/libpthread.so -> -lpthread TODO: consider using + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_IMPLICIT_LINK_DIRECTORIES.html, or + # dropping this special case entirely + get_filename_component(libname ${library} NAME_WE) + string(REGEX REPLACE "^lib" "" libname ${libname}) + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${libname}") + else() + # It's a full path, include that on the line + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS + " -Wl,-rpath,${library_dir} ${library}") + endif() +endfunction() +function(coreneuron_process_target target) + if(TARGET ${target}) + if(NOT target STREQUAL "coreneuron-core") + # This is a special case: libcoreneuron-core.a is manually unpacked into .o files by the + # nrnivmodl-core Makefile, so we do not want to also emit an -lcoreneuron-core argument. + get_target_property(target_inc_dirs ${target} INTERFACE_INCLUDE_DIRECTORIES) + if(target_inc_dirs) + foreach(inc_dir_genex ${target_inc_dirs}) + string(GENEX_STRIP "${inc_dir_genex}" inc_dir) + if(inc_dir) + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_EXTRA_COMPILE_FLAGS " -I${inc_dir}") + endif() + endforeach() + endif() + get_target_property(target_imported ${target} IMPORTED) + if(target_imported) + # In this case we can extract the full path to the library + get_target_property(target_location ${target} LOCATION) + coreneuron_process_library_path(${target_location}) + else() + # This is probably another of our libraries, like -lcoreneuron-cuda. We might need to add -L + # and an RPATH later. 
+ set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${target}") + endif() + endif() + get_target_property(target_libraries ${target} LINK_LIBRARIES) + if(target_libraries) + foreach(child_target ${target_libraries}) + coreneuron_process_target(${child_target}) + endforeach() + endif() + return() + endif() + coreneuron_process_library_path("${target}") +endfunction() +coreneuron_process_target(coreneuron-core) +get_property(CORENRN_LIB_LINK_DEP_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_DEP_FLAGS) +set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${CORENRN_LIB_LINK_DEP_FLAGS}") +# In static builds then NEURON uses dlopen(nullptr, ...) to look for the corenrn_embedded_run +# symbol, which comes from libcoreneuron-core.a and gets included in libcorenrnmech. +if(NOT CORENRN_ENABLE_SHARED) + set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -rdynamic") +endif() +get_property(CORENRN_EXTRA_COMPILE_FLAGS GLOBAL PROPERTY CORENRN_EXTRA_COMPILE_FLAGS) +get_property(CORENRN_LIB_LINK_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS) + +# Detect if --start-group and --end-group are valid linker arguments. These are typically needed +# when linking mutually-dependent .o files (or where we don't know the correct order) on Linux, but +# they are not needed *or* recognised by the macOS linker. 
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + include(CheckLinkerFlag) + check_linker_flag(CXX -Wl,--start-group CORENRN_CXX_LINKER_SUPPORTS_START_GROUP) +elseif(CMAKE_SYSTEM_NAME MATCHES Linux) + # Assume that --start-group and --end-group are only supported on Linux + set(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP ON) +endif() +if(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP) + set(CORENEURON_LINKER_START_GROUP -Wl,--start-group) + set(CORENEURON_LINKER_END_GROUP -Wl,--end-group) +endif() + +# Things that used to be in CORENRN_LIB_LINK_FLAGS: -lrt -L${CMAKE_HOST_SYSTEM_PROCESSOR} +# -L${caliper_LIB_DIR} -l${CALIPER_LIB} # ============================================================================= -# link flags : common to all backend +# Turn CORENRN_COMPILE_DEFS into a list of -DFOO[=BAR] options. # ============================================================================= -# ~~~ -# find_cuda uses FindThreads that adds below imported target we -# shouldn't add imported target to link line -# ~~~ -list(REMOVE_ITEM CORENRN_LINK_LIBS "Threads::Threads") +list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS) -string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_EXTRA_LINK_FLAGS}) +# ============================================================================= +# Extra link flags that we need to include when linking libcorenrnmech.{a,so} in CoreNEURON but that +# do not need to be passed to NEURON to use when linking nrniv/special (why?) 
+# ============================================================================= +string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_LIB_LINK_DEP_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS}) if(CORENRN_SANITIZER_LIBRARY_DIR) string(APPEND CORENRN_COMMON_LDFLAGS " -Wl,-rpath,${CORENRN_SANITIZER_LIBRARY_DIR}") endif() string(JOIN " " CORENRN_SANITIZER_ENABLE_ENVIRONMENT_STRING ${CORENRN_SANITIZER_ENABLE_ENVIRONMENT}) -# replicate CMake magic to transform system libs to -l -foreach(link_lib ${CORENRN_LINK_LIBS}) - if(${link_lib} MATCHES "\-l.*") - string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}") - continue() - endif() - get_filename_component(path ${link_lib} DIRECTORY) - if(NOT path) - string(APPEND CORENRN_COMMON_LDFLAGS " -l${link_lib}") - elseif("${path}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$") - get_filename_component(libname ${link_lib} NAME_WE) - string(REGEX REPLACE "^lib" "" libname ${libname}) - string(APPEND CORENRN_COMMON_LDFLAGS " -l${libname}") - else() - string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}") - endif() -endforeach() - # ============================================================================= # compile flags : common to all backend # ============================================================================= -string(JOIN " " CMAKE_CXX17_STANDARD_COMPILE_OPTION_STRING ${CMAKE_CXX17_STANDARD_COMPILE_OPTION}) string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE) -list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS) string( JOIN " " CORENRN_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} - ${CMAKE_CXX17_STANDARD_COMPILE_OPTION_STRING} + ${CMAKE_CXX17_STANDARD_COMPILE_OPTION} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS} ${CORENRN_COMPILE_DEF_FLAGS} - ${CORENRN_EXTRA_MECH_CXX_FLAGS}) + ${CORENRN_EXTRA_MECH_CXX_FLAGS} + ${CORENRN_EXTRA_COMPILE_FLAGS}) # ============================================================================= # nmodl/mod2c related options : TODO diff --git 
a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 1c18225b6..a21f8b523 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -34,13 +34,13 @@ if(CORENRN_ENABLE_GPU) cnrn_parse_version(${CMAKE_CXX_COMPILER_VERSION} OUTPUT_MAJOR_MINOR CORENRN_NVHPC_MAJOR_MINOR_VERSION) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs - add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) + list(APPEND CORENRN_COMPILE_DEFS CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and # stopping profiling. This makes sure those headers can be found. include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) - add_compile_definitions(CORENEURON_UNIFIED_MEMORY) + list(APPEND CORENRN_COMPILE_DEFS CORENEURON_UNIFIED_MEMORY) endif() if(${CMAKE_VERSION} VERSION_LESS 3.17) # Hopefully we can drop this soon. Parse ${CMAKE_CUDA_COMPILER_VERSION} into a shorter X.Y @@ -67,7 +67,14 @@ if(CORENRN_ENABLE_GPU) # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP # code is compiled with the same CUDA version as the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") + set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}") + # Combining -gpu=lineinfo with -O0 -g gives a warning: Conflicting options --device-debug and + # --generate-line-info specified, ignoring --generate-line-info option + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + string(APPEND NVHPC_ACC_COMP_FLAGS ",debug") + else() + string(APPEND NVHPC_ACC_COMP_FLAGS ",lineinfo") + endif() # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. 
Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. @@ -77,7 +84,7 @@ if(CORENRN_ENABLE_GPU) if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available # for a region then prefer OpenMP. - add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + list(APPEND CORENRN_COMPILE_DEFS CORENEURON_PREFER_OPENMP_OFFLOAD) string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") # Only enable OpenACC offload for GPU @@ -94,19 +101,26 @@ if(CORENRN_ENABLE_GPU) endif() # ============================================================================= -# Set global property that will be used by NEURON to link with CoreNEURON +# Initialise global properties that will be used by NEURON to link with CoreNEURON # ============================================================================= if(CORENRN_ENABLE_GPU) - set_property( - GLOBAL - PROPERTY - CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L$(libdir) -lcoreneuron -Wl,--no-whole-archive" - ) -else() - set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "-L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech") -endif(CORENRN_ENABLE_GPU) + # CORENRN_LIB_LINK_FLAGS is the full set of flags needed to link against libcorenrnmech.so: + # something like `-acc -lcorenrnmech ...`. CORENRN_NEURON_LINK_FLAGS only contains flags that need + # to be used when linking the NEURON Python module to make sure it is able to dynamically load + # libcorenrnmech.so. 
+ set_property(GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}") + if(CORENRN_ENABLE_SHARED) + # Because of + # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968 + # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an + # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds. + set_property(GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}") + endif() +endif() + +# NEURON needs to have access to this when CoreNEURON is built as a submodule. If CoreNEURON is +# installed externally then this is set via coreneuron-config.cmake +set_property(GLOBAL PROPERTY CORENRN_ENABLE_SHARED ${CORENRN_ENABLE_SHARED}) if(CORENRN_HAVE_NVHPC_COMPILER) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 20.7) diff --git a/CMake/coreneuron-config.cmake.in b/CMake/coreneuron-config.cmake.in index 29f67c92f..9f7ac4997 100644 --- a/CMake/coreneuron-config.cmake.in +++ b/CMake/coreneuron-config.cmake.in @@ -14,12 +14,14 @@ set(CORENRN_VERSION_PATCH @PROJECT_VERSION_PATCH@) set(CORENRN_ENABLE_GPU @CORENRN_ENABLE_GPU@) set(CORENRN_ENABLE_NMODL @CORENRN_ENABLE_NMODL@) set(CORENRN_ENABLE_REPORTING @CORENRN_ENABLE_REPORTING@) -set(CORENEURON_LIB_LINK_FLAGS "@CORENEURON_LIB_LINK_FLAGS@") +set(CORENRN_ENABLE_SHARED @CORENRN_ENABLE_SHARED@) +set(CORENRN_LIB_LINK_FLAGS "@CORENRN_LIB_LINK_FLAGS@") +set(CORENRN_NEURON_LINK_FLAGS "@CORENRN_NEURON_LINK_FLAGS@") find_path(CORENEURON_INCLUDE_DIR "coreneuron/coreneuron.h" HINTS "${CONFIG_PATH}/../../include") find_path( CORENEURON_LIB_DIR - NAMES libcoreneuron.a libcoreneuron.so libcoreneuron.dylib + NAMES libcorenrnmech.a libcorenrnmech.so libcorenrnmech.dylib HINTS "${CONFIG_PATH}/../../lib") include(${CONFIG_PATH}/coreneuron.cmake) diff --git a/CMakeLists.txt b/CMakeLists.txt index eda8f2feb..183639480 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -160,9 +160,6 @@ set(CORENRN_ACCELERATOR_OFFLOAD "Disabled") if(CORENRN_ENABLE_GPU) # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after # https://github.com/BlueBrain/CoreNeuron/pull/609. - # https://cmake.org/cmake/help/latest/release/3.14.html#properties suggests there would be - # problems because of expressions like set_target_properties(lfp_test_bin PROPERTIES - # CUDA_RESOLVE_DEVICE_SYMBOLS OFF) # Fail hard and early if we don't have the PGI/NVHPC compiler. if(NOT CORENRN_HAVE_NVHPC_COMPILER) @@ -263,17 +260,21 @@ find_package(Perl REQUIRED) # Common build options # ============================================================================= # build mod files for coreneuron -add_definitions(-DCORENEURON_BUILD) - +list(APPEND CORENRN_COMPILE_DEFS CORENEURON_BUILD) set(CMAKE_REQUIRED_QUIET TRUE) check_include_files(malloc.h have_malloc_h) if(have_malloc_h) - add_definitions("-DHAVE_MALLOC_H") + list(APPEND CORENRN_COMPILE_DEFS HAVE_MALLOC_H) endif() # ============================================================================= # Build option specific compiler flags # ============================================================================= +if(CORENRN_ENABLE_NMODL) + # We use Eigen for "small" matrices with thread-level parallelism handled at a higher level; tell + # Eigen not to try to multithread internally + list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_PARALLELIZE) +endif() if(CORENRN_HAVE_NVHPC_COMPILER) # PGI with llvm code generation doesn't have necessary assembly intrinsic headers list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_VECTORIZE=1) @@ -292,14 +293,6 @@ if(CORENRN_HAVE_NVHPC_COMPILER) endif() endif() -# ~~~ -# OpenACC needs to build static library in order to have global/routines working. 
-# See https://www.pgroup.com/userforum/viewtopic.php?t=5350 -# ~~~ -if(CORENRN_ENABLE_GPU) - set(CORENRN_ENABLE_SHARED OFF) -endif() - if(CORENRN_ENABLE_SHARED) set(COMPILE_LIBRARY_TYPE "SHARED") else() @@ -313,14 +306,14 @@ endif() if(CORENRN_ENABLE_MPI) find_package(MPI REQUIRED) - add_definitions("-DNRNMPI=1") + list(APPEND CORENRN_COMPILE_DEFS NRNMPI=1) # avoid linking to C++ bindings - add_definitions("-DMPI_NO_CPPBIND=1") - add_definitions("-DOMPI_SKIP_MPICXX=1") - add_definitions("-DMPICH_SKIP_MPICXX=1") + list(APPEND CORENRN_COMPILE_DEFS MPI_NO_CPPBIND=1) + list(APPEND CORENRN_COMPILE_DEFS OMPI_SKIP_MPICXX=1) + list(APPEND CORENRN_COMPILE_DEFS MPICH_SKIP_MPICXX=1) else() - add_definitions("-DNRNMPI=0") - add_definitions("-DNRN_MULTISEND=0") + list(APPEND CORENRN_COMPILE_DEFS NRNMPI=0) + list(APPEND CORENRN_COMPILE_DEFS NRN_MULTISEND=0) endif() if(CORENRN_ENABLE_OPENMP) @@ -331,23 +324,23 @@ if(CORENRN_ENABLE_OPENMP) endif() endif() -add_definitions("-DLAYOUT=0") +list(APPEND CORENRN_COMPILE_DEFS LAYOUT=0) if(NOT CORENRN_ENABLE_HOC_EXP) - add_definitions("-DDISABLE_HOC_EXP") + list(APPEND CORENRN_COMPILE_DEFS DISABLE_HOC_EXP) endif() # splay tree required for net_move if(CORENRN_ENABLE_SPLAYTREE_QUEUING) - add_definitions("-DENABLE_SPLAYTREE_QUEUING") + list(APPEND CORENRN_COMPILE_DEFS ENABLE_SPLAYTREE_QUEUING) endif() if(NOT CORENRN_ENABLE_NET_RECEIVE_BUFFER) - add_definitions("-DNET_RECEIVE_BUFFERING=0") + list(APPEND CORENRN_COMPILE_DEFS NET_RECEIVE_BUFFERING=0) endif() if(NOT CORENRN_ENABLE_TIMEOUT) - add_definitions("-DDISABLE_TIMEOUT") + list(APPEND CORENRN_COMPILE_DEFS DISABLE_TIMEOUT) endif() if(CORENRN_ENABLE_REPORTING) @@ -356,7 +349,7 @@ if(CORENRN_ENABLE_REPORTING) find_program(H5DUMP_EXECUTABLE h5dump) if(reportinglib_FOUND) - add_definitions("-DENABLE_BIN_REPORTS") + list(APPEND CORENRN_COMPILE_DEFS ENABLE_BIN_REPORTS) set(ENABLE_BIN_REPORTS_TESTS ON) else() set(reportinglib_INCLUDE_DIR "") @@ -364,7 +357,7 @@ if(CORENRN_ENABLE_REPORTING) 
endif() if(sonata_FOUND) if(TARGET sonata::sonata_report) - add_definitions("-DENABLE_SONATA_REPORTS") + list(APPEND CORENRN_COMPILE_DEFS ENABLE_SONATA_REPORTS) set(ENABLE_SONATA_REPORTS_TESTS ON) else() message(SEND_ERROR "SONATA library was found but without reporting support") @@ -384,6 +377,7 @@ if(CORENRN_ENABLE_LEGACY_UNITS) else() set(CORENRN_USE_LEGACY_UNITS 0) endif() +list(APPEND CORENRN_COMPILE_DEFS CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS}) # Propagate Legacy Units flag to backends. set(MOD2C_ENABLE_LEGACY_UNITS ${CORENRN_ENABLE_LEGACY_UNITS} @@ -396,7 +390,7 @@ if(CORENRN_ENABLE_MPI_DYNAMIC) if(NOT CORENRN_ENABLE_MPI) message(FATAL_ERROR "Cannot enable dynamic mpi without mpi") endif() - add_compile_definitions(CORENRN_ENABLE_MPI_DYNAMIC) + list(APPEND CORENRN_COMPILE_DEFS CORENEURON_ENABLE_MPI_DYNAMIC) endif() if(CORENRN_ENABLE_PRCELLSTATE) @@ -405,7 +399,7 @@ else() set(CORENRN_NRN_PRCELLSTATE 0) endif() if(MINGW) - add_definitions("-DMINGW") + list(APPEND CORENRN_COMPILE_DEFS MINGW) endif() # ============================================================================= @@ -448,22 +442,20 @@ endif() # ============================================================================= if(CORENRN_ENABLE_CALIPER_PROFILING) find_package(caliper REQUIRED) - include_directories(${caliper_INCLUDE_DIR}) - add_definitions("-DCORENEURON_CALIPER") - set(CALIPER_LIB "caliper") - set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS - " -L${caliper_LIB_DIR} -l${CALIPER_LIB}") + list(APPEND CORENRN_COMPILE_DEFS CORENEURON_CALIPER) + set(CORENRN_CALIPER_LIB caliper) endif() if(CORENRN_ENABLE_LIKWID_PROFILING) find_package(likwid REQUIRED) + list(APPEND CORENRN_COMPILE_DEFS LIKWID_PERFMON) + # TODO: avoid this part, probably by using some likwid CMake target include_directories(${likwid_INCLUDE_DIRS}) - add_definitions("-DLIKWID_PERFMON") endif() # enable debugging code with extra logs to stdout if(CORENRN_ENABLE_DEBUG_CODE) - 
add_definitions(-DCORENRN_DEBUG -DCHKPNTDEBUG -DCORENRN_DEBUG_QUEUE -DINTERLEAVE_DEBUG) + list(APPEND CORENRN_COMPILE_DEFS CORENRN_DEBUG CHKPNTDEBUG CORENRN_DEBUG_QUEUE INTERLEAVE_DEBUG) endif() # ============================================================================= @@ -473,38 +465,18 @@ endif() # compiler will be invoked with these flags, so we have to use flags that are as generic as # possible. if(NOT DEFINED NRN_WHEEL_BUILD OR NOT NRN_WHEEL_BUILD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}") + list(APPEND CORENRN_EXTRA_CXX_FLAGS "${IGNORE_UNKNOWN_PRAGMA_FLAGS}") endif() -# ============================================================================= -# Add main directories -# ============================================================================= +# Add the main source directory add_subdirectory(coreneuron) -if(CORENRN_ENABLE_GPU) - get_target_property(CORENRN_LINK_LIBRARIES coreneuron INTERFACE_LINK_LIBRARIES) - if(CORENRN_LINK_LIBRARIES) - foreach(LIB ${CORENRN_LINK_LIBRARIES}) - get_filename_component(dir_path ${LIB} DIRECTORY) - if(TARGET ${LIB}) - # See, for example, caliper where the coreneuron target depends on the caliper target (so we - # get LIB=caliper in this loop), but -l and -L are already added manually here: - # https://github.com/BlueBrain/CoreNeuron/blob/856cea4aa647c8f2b0d5bda6d0fc32144c5942e3/CMakeLists.txt#L411-L412 - message( - NOTICE - "Ignoring dependency '${LIB}' of 'coreneuron' and assuming relevant flags have already been added to CORENEURON_LIB_LINK_FLAGS." - ) - elseif(NOT dir_path) - # In case LIB is not a target but is just the name of a library, e.g. "dl" - set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " -l${LIB}") - else() - set_property(GLOBAL APPEND_STRING PROPERTY CORENEURON_LIB_LINK_FLAGS " ${LIB}") - endif() - endforeach() - endif() -endif() - +# Extract the various compiler option strings to use inside nrnivmodl-core. 
Sets the global property +# CORENRN_LIB_LINK_FLAGS, which contains the arguments that must be added to the link line for +# `special` to link against `libcorenrnmech.{a,so}` include(MakefileBuildOptions) + +# Generate the nrnivmodl-core script and makefile using the options from MakefileBuildOptions add_subdirectory(extra) if(CORENRN_ENABLE_UNIT_TESTS) @@ -514,7 +486,7 @@ endif() # ============================================================================= # Install cmake modules # ============================================================================= -get_property(CORENEURON_LIB_LINK_FLAGS GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS) +get_property(CORENRN_NEURON_LINK_FLAGS GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS) configure_file(CMake/coreneuron-config.cmake.in CMake/coreneuron-config.cmake @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/CMake/coreneuron-config.cmake" DESTINATION share/cmake) install(EXPORT coreneuron DESTINATION share/cmake) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 0dc648628..97d12e613 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -16,29 +16,28 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) # ============================================================================= # gather various source files # ============================================================================= -file(GLOB_RECURSE CORENEURON_HEADER_FILES "*.h*") -file(GLOB_RECURSE CORENEURON_TEMPLATE_FILES "*.ipp") file( GLOB CORENEURON_CODE_FILES "apps/main1.cpp" "apps/corenrn_parameters.cpp" - "gpu/*.cpp" + "gpu/nrn_acc_manager.cpp" "io/*.cpp" "io/reports/*.cpp" "mechanism/*.cpp" + "mpi/core/nrnmpi_def_cinc.cpp" "network/*.cpp" "permute/*.cpp" "sim/*.cpp" + "sim/scopmath/abort.cpp" + "sim/scopmath/newton_thread.cpp" "utils/*.cpp" "utils/*/*.c" "utils/*/*.cpp") -set(SCOPMATH_CODE_FILES "sim/scopmath/abort.cpp" "sim/scopmath/newton_thread.cpp") set(MPI_LIB_FILES "mpi/lib/mpispike.cpp" 
"mpi/lib/nrnmpi.cpp") -set(MPI_CORE_FILES "mpi/core/nrnmpi_def_cinc.cpp") if(CORENRN_ENABLE_MPI) # Building these requires -ldl, which is only added if MPI is enabled. - list(APPEND MPI_CORE_FILES "mpi/core/nrnmpi.cpp" "mpi/core/nrnmpidec.cpp") + list(APPEND CORENEURON_CODE_FILES "mpi/core/resolve.cpp" "mpi/core/nrnmpidec.cpp") endif() file(COPY ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include/Random123 DESTINATION ${CMAKE_BINARY_DIR}/include) @@ -107,30 +106,31 @@ if(CORENRN_ENABLE_GPU) set_source_files_properties(${OPENACC_EXCLUDED_FILES} PROPERTIES COMPILE_FLAGS "-DDISABLE_OPENACC") - # Only compile the explicit CUDA implementation of the Hines solver in GPU builds. - list(APPEND CORENEURON_CODE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu) - - # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from - # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with - # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling - # these functions from CUDA kernels presents no issue ... - if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu) - list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu) + # Only compile the explicit CUDA implementation of the Hines solver in GPU builds. Because of + # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972 + # this cannot be included in the same shared library as the rest of the OpenACC code. + set(CORENEURON_CUDA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu) + + # Eigen functions cannot be called directly from OpenACC regions, but Eigen is sort-of compatible + # with being compiled as CUDA code. 
Because of + # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972 + # this has to mean `nvc++ -cuda` rather than `nvcc`. We explicitly instantiate Eigen functions for + # different matrix sizes in partial_piv_lu.cpp (with CUDA attributes but without OpenACC or OpenMP + # annotations) and dispatch to these from a wrapper in partial_piv_lu.h that does have + # OpenACC/OpenMP annotations. + if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp) + list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp) + if(CORENRN_ENABLE_GPU + AND CORENRN_HAVE_NVHPC_COMPILER + AND CMAKE_BUILD_TYPE STREQUAL "Debug") + # In this case OpenAccHelper.cmake passes -gpu=debug, which makes these Eigen functions + # extremely slow. Downgrade that to -gpu=lineinfo for this file. + set_source_files_properties(${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp + PROPERTIES COMPILE_FLAGS "-gpu=lineinfo,nodebug -O1") + endif() endif() endif() -# ============================================================================= -# eion.cpp depends on CORENRN_USE_LEGACY_UNITS -# ============================================================================= -set(LegacyFR_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/mechanism/eion.cpp ${CMAKE_CURRENT_SOURCE_DIR}/apps/main1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/io/global_vars.cpp) - -set_property( - SOURCE ${LegacyFR_FILES} - APPEND - PROPERTY COMPILE_DEFINITIONS "CORENRN_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS}") - # ============================================================================= # create libraries # ============================================================================= @@ -146,27 +146,42 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC) target_include_directories( ${CORENRN_MPI_LIB_NAME} PRIVATE ${MPI_INCLUDE_PATH} ${CORENEURON_PROJECT_SOURCE_DIR} 
${CORENEURON_PROJECT_BINARY_DIR}/generated) + target_link_libraries(${CORENRN_MPI_LIB_NAME} ${CORENRN_CALIPER_LIB}) set_property(TARGET ${CORENRN_MPI_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set(CORENRN_MPI_OBJ $) endif() -# main coreneuron library -add_library( - coreneuron - ${COMPILE_LIBRARY_TYPE} - ${CORENEURON_HEADER_FILES} - ${CORENEURON_TEMPLATE_FILES} - ${CORENEURON_CODE_FILES} - ${cudacorenrn_objs} - ${NMODL_INBUILT_MOD_OUTPUTS} - ${MPI_CORE_FILES} - ${CORENRN_MPI_OBJ}) - -target_include_directories(coreneuron PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR} - ${CORENEURON_PROJECT_BINARY_DIR}/generated) +# Library containing the bulk of the non-mechanism CoreNEURON code. This is always created and +# installed as a static library, and then the nrnivmodl-core workflow extracts the object files from +# it and does one of the following: +# +# * shared build: creates libcorenrnmech.so from these objects plus those from the translated MOD +# files +# * static build: creates a (temporary, does not get installed) libcorenrnmech.a from these objects +# plus those from the translated MOD files, then statically links that into special-core +# (nrniv-core) +# +# This scheme means that both core and mechanism .o files are linked in a single step, which is +# important for GPU linking. It does, however, mean that the core code is installed twice, once in +# libcoreneuron-core.a and once in libcorenrnmech.so (shared) or nrniv-core (static). In a GPU +# build, libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of the Hines +# solver. 
This cannot be included in coreneuron-core because of this issue: +# https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972 +add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ}) +if(CORENRN_ENABLE_GPU) + set(coreneuron_cuda_target coreneuron-cuda) + add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES}) + target_link_libraries(coreneuron-core PUBLIC coreneuron-cuda) +endif() + +foreach(target coreneuron-core ${coreneuron_cuda_target}) + target_include_directories(${target} PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR} + ${CORENEURON_PROJECT_BINARY_DIR}/generated) +endforeach() + # we can link to MPI libraries in non-dynamic-mpi build if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC) - target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES}) + target_link_libraries(coreneuron-core PUBLIC ${MPI_CXX_LIBRARIES}) endif() # this is where we handle dynamic mpi library build @@ -175,7 +190,7 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC) # main coreneuron library needs to be linked to libdl.so and # and should be aware of shared library suffix on different platforms. # ~~~ - target_link_libraries(coreneuron ${CMAKE_DL_LIBS}) + target_link_libraries(coreneuron-core PUBLIC ${CMAKE_DL_LIBS}) # store mpi library targets that will be built list(APPEND corenrn_mpi_targets "") @@ -212,6 +227,7 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC) list(GET NRN_MPI_LIBNAME_LIST ${val} libname) add_library(core${libname}_lib SHARED ${MPI_LIB_FILES}) + target_link_libraries(core${libname}_lib ${CORENRN_CALIPER_LIB}) target_include_directories( core${libname}_lib PUBLIC ${include} @@ -222,9 +238,8 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC) # when we will test coreneuron on windows. 
# ~~~ if(MINGW) # type msmpi only - add_dependencies(core${libname}_lib coreneuron) - target_link_libraries(core${libname}_lib ${MPI_C_LIBRARIES}) - target_link_libraries(core${libname}_lib coreneuron) + add_dependencies(core${libname}_lib coreneuron-core) + target_link_libraries(core${libname}_lib ${MPI_C_LIBRARIES} coreneuron-core) endif() set_property(TARGET core${libname}_lib PROPERTY OUTPUT_NAME core${libname}) list(APPEND corenrn_mpi_targets "core${libname}_lib") @@ -239,48 +254,39 @@ if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC) install(TARGETS ${corenrn_mpi_targets} DESTINATION lib) endif() -# Prevent CMake from running a device code link step when assembling libcoreneuron.a in GPU builds. -# The device code linking needs to be deferred to the final step, where it is done by `nvc++ -cuda`. -set_target_properties(coreneuron PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -# Suppress some compiler warnings. Note in GPU builds this library includes CUDA files. -target_compile_options(coreneuron - PRIVATE $<$:${CORENEURON_CXX_WARNING_SUPPRESSIONS}>) -add_dependencies(coreneuron nrnivmodl-core) - -# scopmath is created separately for nrnivmodl-core workflow -add_library(scopmath STATIC ${CORENEURON_HEADER_FILES} ${SCOPMATH_CODE_FILES}) -target_include_directories(scopmath PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR} - ${CORENEURON_PROJECT_BINARY_DIR}/generated) +# Suppress some compiler warnings. 
+target_compile_options(coreneuron-core PRIVATE ${CORENEURON_CXX_WARNING_SUPPRESSIONS}) +target_link_libraries(coreneuron-core PUBLIC ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} + ${CORENRN_CALIPER_LIB} ${likwid_LIBRARIES}) -target_link_libraries(coreneuron ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} ${CALIPER_LIB} - ${likwid_LIBRARIES}) - -target_include_directories(coreneuron SYSTEM - PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include) -target_include_directories(coreneuron SYSTEM - PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) +# TODO: fix adding a dependency of coreneuron-core on CLI11::CLI11 when CLI11 is a submodule. Right +# now this doesn't work because the CLI11 targets are not exported/installed but coreneuron-core is. +get_target_property(CLI11_HEADER_DIRECTORY CLI11::CLI11 INTERFACE_INCLUDE_DIRECTORIES) +target_include_directories( + coreneuron-core SYSTEM PRIVATE ${CLI11_HEADER_DIRECTORY} + ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include) # See: https://en.cppreference.com/w/cpp/filesystem#Notes if(CMAKE_CXX_COMPILER_IS_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1) - target_link_libraries(coreneuron stdc++fs) + target_link_libraries(coreneuron-core PUBLIC stdc++fs) endif() if(CORENRN_ENABLE_GPU) - # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. + # nrnran123.cpp uses Boost.Pool in GPU builds if it's available. 
find_package(Boost QUIET) if(Boost_FOUND) message(STATUS "Boost found, enabling use of memory pools for Random123...") - target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) - target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) + target_include_directories(coreneuron-core SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) + target_compile_definitions(coreneuron-core PRIVATE CORENEURON_USE_BOOST_POOL) endif() endif() set_target_properties( - coreneuron scopmath + coreneuron-core ${coreneuron_cuda_target} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - POSITION_INDEPENDENT_CODE ON) -cpp_cc_configure_sanitizers(TARGET coreneuron scopmath ${corenrn_mpi_targets}) + POSITION_INDEPENDENT_CODE ${CORENRN_ENABLE_SHARED}) +cpp_cc_configure_sanitizers(TARGET coreneuron-core ${coreneuron_cuda_target} ${corenrn_mpi_targets}) # ============================================================================= # create special-core with halfgap.mod for tests @@ -288,44 +294,47 @@ cpp_cc_configure_sanitizers(TARGET coreneuron scopmath ${corenrn_mpi_targets}) set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod files") file(GLOB modfiles "${modfile_directory}/*.mod") -if(CORENRN_ENABLE_SHARED) - set(corenrn_mech_library - "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE INTERNAL "coreneuron mechanism library") -else() - set(corenrn_mech_library - "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech${CMAKE_STATIC_LIBRARY_SUFFIX}" - CACHE INTERNAL "coreneuron mechanism library") -endif() - -set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-core" - "${corenrn_mech_library}") +# We have to link things like unit tests against this because some "core" .cpp files refer to +# symbols in the translated versions of default .mod files +set(nrniv_core_prefix 
"${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}") +set(corenrn_mech_library + "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}corenrnmech${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}" +) +set(output_binaries "${nrniv_core_prefix}/special-core" "${corenrn_mech_library}") add_custom_command( OUTPUT ${output_binaries} - DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} + DEPENDS coreneuron-core ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m - ${CORENRN_MOD2CPP_BINARY} -p 1 "${modfile_directory}" + ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin COMMENT "Running nrnivmodl-core with halfgap.mod") add_custom_target(nrniv-core ALL DEPENDS ${output_binaries}) if(CORENRN_ENABLE_GPU) separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}") - target_compile_options(coreneuron BEFORE PRIVATE $<$:${CORENRN_ACC_FLAGS}>) - target_compile_options(scopmath BEFORE PRIVATE $<$:${CORENRN_ACC_FLAGS}>) + target_compile_options(coreneuron-core PRIVATE ${CORENRN_ACC_FLAGS}) endif() -# ============================================================================= -# Extract link definitions to be used with nrnivmodl-core -# ============================================================================= -get_target_property(CORENRN_LINK_LIBS coreneuron LINK_LIBRARIES) -if(NOT CORENRN_LINK_LIBS) - set(CORENRN_LINK_LIBS "") +# Create an extra target for use by NEURON when CoreNEURON is being built as a submodule. NEURON +# tests will depend on this, so it must in turn depend on everything that is needed to run nrnivmodl +# -coreneuron. +add_custom_target(coreneuron-for-tests) +add_dependencies(coreneuron-for-tests coreneuron-core ${NMODL_TARGET_TO_DEPEND}) +# Create an extra target for internal use that unit tests and so on can depend on. 
+# ${corenrn_mech_library} is libcorenrnmech.{a,so}, which contains both the compiled default +# mechanisms and the content of libcoreneuron-core.a. +add_library(coreneuron-all INTERFACE) +target_link_libraries(coreneuron-all INTERFACE "${corenrn_mech_library}") +# Also copy the dependencies of libcoreneuron-core as interface dependencies of this new target +# (example: ${corenrn_mech_library} will probably depend on MPI, so when the unit tests link against +# ${corenrn_mech_library} they need to know to link against MPI too). +get_target_property(coreneuron_core_deps coreneuron-core LINK_LIBRARIES) +if(coreneuron_core_deps) + foreach(dep ${coreneuron_core_deps}) + target_link_libraries(coreneuron-all INTERFACE ${dep}) + endforeach() endif() -set(CORENRN_LINK_LIBS - "${CORENRN_LINK_LIBS}" - PARENT_SCOPE) # Make headers avail to build tree configure_file(engine.h.in ${CMAKE_BINARY_DIR}/include/coreneuron/engine.h @ONLY) @@ -353,19 +362,13 @@ file(COPY apps/coreneuron.cpp DESTINATION ${CMAKE_BINARY_DIR}/share/coreneuron) # coreneuron main libraries install( - TARGETS coreneuron + TARGETS coreneuron-core ${coreneuron_cuda_target} EXPORT coreneuron LIBRARY DESTINATION lib ARCHIVE DESTINATION lib INCLUDES DESTINATION $) -# scopemath into share for nrnivmodl-core -install( - TARGETS scopmath - EXPORT coreneuron - DESTINATION lib) - # headers and some standalone code files for nrnivmodl-core install( DIRECTORY ${CMAKE_BINARY_DIR}/include/coreneuron @@ -390,8 +393,11 @@ install( RENAME nrniv-core) install(FILES apps/coreneuron.cpp DESTINATION share/coreneuron) -# install mechanism library -install(FILES ${corenrn_mech_library} DESTINATION lib) +# install mechanism library in shared library builds, if we're linking statically then there is no +# need +if(CORENRN_ENABLE_SHARED) + install(FILES ${corenrn_mech_library} DESTINATION lib) +endif() # install random123 and nmodl headers install(DIRECTORY ${CMAKE_BINARY_DIR}/include/ DESTINATION include) diff --git 
a/coreneuron/apps/corenrn_parameters.cpp b/coreneuron/apps/corenrn_parameters.cpp
index 40c322b18..6ee920d1f 100644
--- a/coreneuron/apps/corenrn_parameters.cpp
+++ b/coreneuron/apps/corenrn_parameters.cpp
@@ -5,15 +5,17 @@
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
-
 #include "coreneuron/apps/corenrn_parameters.hpp"
+#include <memory>
 
 namespace coreneuron {
 
 extern std::string cnrn_version();
 
-corenrn_parameters::corenrn_parameters() {
+corenrn_parameters::corenrn_parameters()
+    : m_app{std::make_unique<CLI::App>("CoreNeuron - Optimised Simulator Engine for NEURON.")} {
+    auto& app = *m_app;
     app.set_config("--read-config", "", "Read parameters from ini file", false)
         ->check(CLI::ExistingFile);
     app.add_option("--write-config",
@@ -167,14 +169,21 @@ corenrn_parameters::corenrn_parameters() {
     CLI::retire_option(app, "--show");
 }
 
+// Implementation in .cpp file where CLI types are complete.
+corenrn_parameters::~corenrn_parameters() = default;
+
+std::string corenrn_parameters::config_to_str(bool default_also, bool write_description) const {
+    return m_app->config_to_str(default_also, write_description);
+}
+
 void corenrn_parameters::reset() {
     static_cast<corenrn_parameters_data&>(*this) = corenrn_parameters_data{};
-    app.clear();
+    m_app->clear();
 }
 
 void corenrn_parameters::parse(int argc, char** argv) {
     try {
-        app.parse(argc, argv);
+        m_app->parse(argc, argv);
         if (verbose == verbose_level::NONE) {
             nrn_nobanner_ = 1;
         }
@@ -182,11 +191,11 @@ void corenrn_parameters::parse(int argc, char** argv) {
         // in case of parsing errors, show message with exception
         std::cerr << "CLI parsing error, see nrniv-core --help for more information. 
\n" << std::endl; - app.exit(e); + m_app->exit(e); throw e; } catch (const CLI::ParseError& e) { // use --help is also ParseError; in this case exit by showing all options - app.exit(e); + m_app->exit(e); exit(0); } diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index bfe646622..8db8ce06c 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -1,18 +1,14 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. */ - #pragma once - +#include +#include #include -#include -#include -#include -#include /** * \class corenrn_parameters @@ -32,6 +28,10 @@ * Also single dash long options are not supported anymore (-mpi -> --mpi). */ +namespace CLI { +struct App; +} + namespace coreneuron { struct corenrn_parameters_data { @@ -94,10 +94,8 @@ struct corenrn_parameters_data { }; struct corenrn_parameters: corenrn_parameters_data { - CLI::App app{"CoreNeuron - Optimised Simulator Engine for NEURON."}; /// CLI app that performs - /// CLI parsing - - corenrn_parameters(); /// Constructor that initializes the CLI11 app. + corenrn_parameters(); /// Constructor that initializes the CLI11 app. + ~corenrn_parameters(); /// Destructor defined in .cpp where CLI11 types are complete. void parse(int argc, char* argv[]); /// Runs the CLI11_PARSE macro. @@ -111,6 +109,22 @@ struct corenrn_parameters: corenrn_parameters_data { inline bool is_quiet() { return verbose == verbose_level::NONE; } + + /** @brief Return a string summarising the current parameter values. + * + * This forwards to the CLI11 method of the same name. Returns a string that + * could be read in as a config of the current values of the App. 
+     *
+     * @param default_also Include any defaulted arguments.
+     * @param write_description Include option descriptions and the App description.
+     */
+    std::string config_to_str(bool default_also = false, bool write_description = false) const;
+
+  private:
+    // CLI app that performs CLI parsing. std::unique_ptr avoids having to
+    // include CLI11 headers from CoreNEURON headers, and therefore avoids
+    // CoreNEURON having to install CLI11 when using it from a submodule.
+    std::unique_ptr<CLI::App> m_app;
 };
 
 std::ostream& operator<<(std::ostream& os,
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index fb74df7d0..a429b04f7 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
@@ -51,16 +51,13 @@ const char* corenrn_version() {
     return coreneuron::bbcore_write_version;
 }
 
-// the CORENRN_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS
+// the CORENEURON_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS
 bool corenrn_units_use_legacy() {
-    return CORENRN_USE_LEGACY_UNITS;
+    return CORENEURON_USE_LEGACY_UNITS;
 }
 
 void (*nrn2core_part2_clean_)();
 
-// cf. utils/ispc_globals.c
-extern double ispc_celsius;
-
 /**
  * If "export OMP_NUM_THREADS=n" is not set then omp by default sets
  * the number of threads equal to the number of cores on this node.
@@ -244,9 +241,6 @@ void nrn_init_and_load_data(int argc, corenrn_param.celsius = celsius; - // for ispc backend - ispc_celsius = celsius; - // create net_cvode instance mk_netcvode(); @@ -456,7 +450,7 @@ std::unique_ptr create_report_handler(ReportConfiguration& config using namespace coreneuron; -#if NRNMPI && defined CORENRN_ENABLE_MPI_DYNAMIC +#if NRNMPI && defined(CORENEURON_ENABLE_MPI_DYNAMIC) static void* load_dynamic_mpi(const std::string& libname) { dlerror(); void* handle = dlopen(libname.c_str(), RTLD_NOW | RTLD_GLOBAL); @@ -478,7 +472,7 @@ extern "C" void mk_mech_init(int argc, char** argv) { #if NRNMPI if (corenrn_param.mpi_enable) { -#ifdef CORENRN_ENABLE_MPI_DYNAMIC +#ifdef CORENEURON_ENABLE_MPI_DYNAMIC // coreneuron rely on neuron to detect mpi library distribution and // the name of the library itself. Make sure the library name is specified // via CLI option. @@ -506,12 +500,16 @@ extern "C" void mk_mech_init(int argc, char** argv) { #ifdef CORENEURON_ENABLE_GPU if (corenrn_param.gpu) { init_gpu(); + cnrn_target_copyin(&celsius); + cnrn_target_copyin(&pi); + cnrn_target_copyin(&secondorder); + nrnran123_initialise_global_state_on_device(); } #endif if (!corenrn_param.writeParametersFilepath.empty()) { std::ofstream out(corenrn_param.writeParametersFilepath, std::ios::trunc); - out << corenrn_param.app.config_to_str(false, false); + out << corenrn_param.config_to_str(false, false); out.close(); } @@ -685,6 +683,10 @@ extern "C" int run_solve_core(int argc, char** argv) { if (nrn_have_gaps) { nrn_partrans::delete_gap_indices_from_device(); } + nrnran123_destroy_global_state_on_device(); + cnrn_target_delete(&secondorder); + cnrn_target_delete(&pi); + cnrn_target_delete(&celsius); } // Cleaning the memory diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 3eff82fe1..1fcc59478 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -10,6 +10,7 @@ #include #include 
"coreneuron/apps/corenrn_parameters.hpp" +#include "coreneuron/gpu/nrn_acc_manager.hpp" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/network/netcon.hpp" #include "coreneuron/nrniv/nrniv_decl.h" @@ -31,15 +32,198 @@ #include #endif +#if __has_include() +#define USE_CXXABI +#include +#include +#include +#endif + +#ifdef CORENEURON_ENABLE_PRESENT_TABLE +#include +#include +#include +#include +#include +namespace { +struct present_table_value { + std::size_t ref_count{}, size{}; + std::byte* dev_ptr{}; +}; +std::map present_table; +std::shared_mutex present_table_mutex; +} // namespace +#endif + +namespace { +/** @brief Try to demangle a type name, return the mangled name on failure. + */ +std::string cxx_demangle(const char* mangled) { +#ifdef USE_CXXABI + int status{}; + // Note that the third argument to abi::__cxa_demangle returns the length of + // the allocated buffer, which may be larger than strlen(demangled) + 1. + std::unique_ptr demangled{ + abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free}; + return status ? 
mangled : demangled.get(); +#else + return mangled; +#endif +} +bool cnrn_target_debug_output_enabled() { + const char* env = std::getenv("CORENEURON_GPU_DEBUG"); + if (!env) { + return false; + } + std::string env_s{env}; + if (env_s == "1") { + return true; + } else if (env_s == "0") { + return false; + } else { + throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s + ")"); + } +} +bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()}; +} // namespace + namespace coreneuron { extern InterleaveInfo* interleave_info; -void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div); -void delete_ivoc_vect_from_device(IvocVect&); void nrn_ion_global_map_copyto_device(); void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); +void cnrn_target_copyin_debug(std::string_view file, + int line, + std::size_t sizeof_T, + std::type_info const& typeid_T, + void const* h_ptr, + std::size_t len, + void* d_ptr) { + if (!cnrn_target_enable_debug) { + return; + } + std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name()) + << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T + << ") -> " << d_ptr << std::endl; +} +void cnrn_target_delete_debug(std::string_view file, + int line, + std::size_t sizeof_T, + std::type_info const& typeid_T, + void const* h_ptr, + std::size_t len) { + if (!cnrn_target_enable_debug) { + return; + } + std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name()) + << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')' + << std::endl; +} +void cnrn_target_deviceptr_debug(std::string_view file, + int line, + std::type_info const& typeid_T, + void const* h_ptr, + void* d_ptr) { + if (!cnrn_target_enable_debug) { + return; + } + std::cerr << file << ':' << line << ": 
cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name()) + << ">(" << h_ptr << ") -> " << d_ptr << std::endl; +} +void cnrn_target_is_present_debug(std::string_view file, + int line, + std::type_info const& typeid_T, + void const* h_ptr, + void* d_ptr) { + if (!cnrn_target_enable_debug) { + return; + } + std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name()) + << ">(" << h_ptr << ") -> " << d_ptr << std::endl; +} +void cnrn_target_memcpy_to_device_debug(std::string_view file, + int line, + std::size_t sizeof_T, + std::type_info const& typeid_T, + void const* h_ptr, + std::size_t len, + void* d_ptr) { + if (!cnrn_target_enable_debug) { + return; + } + std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<" + << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len + << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl; +} + +#ifdef CORENEURON_ENABLE_PRESENT_TABLE +std::pair cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) { + if (!h_ptr) { + return {nullptr, false}; + } + // Concurrent calls to this method are safe, but they must be serialised + // w.r.t. calls to the cnrn_target_*_update_present_table methods. 
+ std::shared_lock _{present_table_mutex}; + if (present_table.empty()) { + return {nullptr, must_be_present_or_null}; + } + // prev(first iterator greater than h_ptr or last if not found) gives the first iterator less + // than or equal to h_ptr + auto const iter = std::prev(std::upper_bound( + present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) { + return hp < entry.first; + })); + if (iter == present_table.end()) { + return {nullptr, must_be_present_or_null}; + } + std::byte const* const h_byte_ptr{static_cast(h_ptr)}; + std::byte const* const h_start_of_block{iter->first}; + std::size_t const block_size{iter->second.size}; + std::byte* const d_start_of_block{iter->second.dev_ptr}; + bool const is_present{h_byte_ptr < h_start_of_block + block_size}; + if (!is_present) { + return {nullptr, must_be_present_or_null}; + } + return {d_start_of_block + (h_byte_ptr - h_start_of_block), false}; +} + +void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) { + if (!h_ptr) { + assert(!d_ptr); + return; + } + std::lock_guard _{present_table_mutex}; + // TODO include more pedantic overlap checking? 
+ present_table_value new_val{}; + new_val.size = len; + new_val.ref_count = 1; + new_val.dev_ptr = static_cast(d_ptr); + auto const [iter, inserted] = present_table.emplace(static_cast(h_ptr), + std::move(new_val)); + if (!inserted) { + // Insertion didn't occur because h_ptr was already in the present table + assert(iter->second.size == len); + assert(iter->second.dev_ptr == new_val.dev_ptr); + ++(iter->second.ref_count); + } +} +void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) { + if (!h_ptr) { + return; + } + std::lock_guard _{present_table_mutex}; + auto const iter = present_table.find(static_cast(h_ptr)); + assert(iter != present_table.end()); + assert(iter->second.size == len); + --(iter->second.ref_count); + if (iter->second.ref_count == 0) { + present_table.erase(iter); + } +} +#endif + int cnrn_target_get_num_devices() { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) @@ -76,7 +260,7 @@ void cnrn_target_set_default_device(int device_num) { } #ifdef CORENEURON_ENABLE_GPU - +#ifndef CORENEURON_UNIFIED_MEMORY static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { // As we never run code for artificial cell inside GPU we don't copy it. 
int is_art = corenrn.get_is_artificial()[type]; @@ -86,6 +270,14 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { auto d_ml = cnrn_target_copyin(ml); + if (ml->global_variables) { + assert(ml->global_variables_size); + void* d_inst = cnrn_target_copyin(static_cast(ml->global_variables), + ml->global_variables_size); + cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_inst); + } + + int n = ml->nodecount; int szp = corenrn.get_prop_param_size()[type]; int szdp = corenrn.get_prop_dparam_size()[type]; @@ -168,6 +360,7 @@ static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { return d_ml; } +#endif static void update_ml_on_host(const Memb_list* ml, int type) { int is_art = corenrn.get_is_artificial()[type]; @@ -257,6 +450,13 @@ static void delete_ml_from_device(Memb_list* ml, int type) { cnrn_target_delete(ml->pdata, pcnt); } cnrn_target_delete(ml->nodeindices, n); + + if (ml->global_variables) { + assert(ml->global_variables_size); + cnrn_target_delete(static_cast(ml->global_variables), + ml->global_variables_size); + } + cnrn_target_delete(ml); } @@ -603,9 +803,8 @@ void delete_ivoc_vect_from_device(IvocVect& vec) { if (n) { cnrn_target_delete(vec.data(), n); } - cnrn_target_delete(&vec); #else - (void) vec; + static_cast(vec); #endif } @@ -1108,7 +1307,7 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { } void nrn_sparseobj_copyto_device(SparseObj* so) { -#ifdef CORENEURON_ENABLE_GPU +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY) // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. 
if (nrn_threads[0].compute_gpu == 0) { @@ -1191,7 +1390,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } void nrn_sparseobj_delete_from_device(SparseObj* so) { -#ifdef CORENEURON_ENABLE_GPU +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY) // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1312,7 +1511,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { - auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); + auto* vecplay_instance = static_cast(nt->_vecplay[i]); cnrn_target_delete(vecplay_instance->e_); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 72d222cdd..5a2a6f544 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -1,17 +1,16 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. 
# ============================================================================= */ - -#ifndef _nrn_device_manager_ -#define _nrn_device_manager_ - -#include "coreneuron/sim/multicore.hpp" +#pragma once namespace coreneuron { +struct Memb_list; +struct NrnThread; +struct NetSendBuffer_t; void setup_nrnthreads_on_device(NrnThread* threads, int nthreads); void delete_nrnthreads_on_device(NrnThread* threads, int nthreads); void update_nrnthreads_on_host(NrnThread* threads, int nthreads); @@ -24,6 +23,4 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); void update_weights_from_gpu(NrnThread* threads, int nthreads); void init_gpu(); - } // namespace coreneuron -#endif // _nrn_device_manager_ diff --git a/coreneuron/io/core2nrn_data_return.cpp b/coreneuron/io/core2nrn_data_return.cpp index 6a12c197f..87a549ac6 100644 --- a/coreneuron/io/core2nrn_data_return.cpp +++ b/coreneuron/io/core2nrn_data_return.cpp @@ -137,7 +137,7 @@ static void core2nrn_corepointer(int tid, NrnThreadMembList* tml) { d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout); pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout); (*corenrn.get_bbcore_write()[type])( - nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0); + nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0); } std::unique_ptr iArray; @@ -159,8 +159,18 @@ static void core2nrn_corepointer(int tid, NrnThreadMembList* tml) { d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout); pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout); - (*corenrn.get_bbcore_write()[type])( - dArray.get(), iArray.get(), &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0); + (*corenrn.get_bbcore_write()[type])(dArray.get(), + iArray.get(), + &dcnt, + &icnt, + 0, + aln_cntml, + d, + pd, + ml->_thread, + &nt, + ml, + 0.0); } (*core2nrn_corepointer_mech_)(tid, type, icnt, dcnt, iArray.get(), dArray.get()); diff --git 
a/coreneuron/io/global_vars.cpp b/coreneuron/io/global_vars.cpp index 128a1cdb9..815423ea9 100644 --- a/coreneuron/io/global_vars.cpp +++ b/coreneuron/io/global_vars.cpp @@ -142,7 +142,7 @@ void set_globals(const char* path, bool cli_global_seed, int cli_global_seed_val } else if (strcmp(name, "Random123_globalindex") == 0) { nrnran123_set_globalindex((uint32_t) n); } else if (strcmp(name, "_nrnunit_use_legacy_") == 0) { - if (n != CORENRN_USE_LEGACY_UNITS) { + if (n != CORENEURON_USE_LEGACY_UNITS) { hoc_execerror( "CORENRN_ENABLE_LEGACY_UNITS not" " consistent with NEURON value of" diff --git a/coreneuron/io/nrn2core_data_init.cpp b/coreneuron/io/nrn2core_data_init.cpp index e732dec11..ad7106f6e 100644 --- a/coreneuron/io/nrn2core_data_init.cpp +++ b/coreneuron/io/nrn2core_data_init.cpp @@ -407,6 +407,7 @@ extern void** pattern_stim_info_ref(int icnt, Datum* _ppvar, ThreadDatum* _thread, NrnThread* _nt, + Memb_list* ml, double v); extern "C" { @@ -437,7 +438,7 @@ void nrn2core_patstim_share_info() { assert(0); } - void** info = pattern_stim_info_ref(_iml, _cntml, _p, _ppvar, nullptr, nt, 0.0); + void** info = pattern_stim_info_ref(_iml, _cntml, _p, _ppvar, nullptr, nt, ml, 0.0); (*nrn2core_patternstim_)(info); } } diff --git a/coreneuron/io/nrn_checkpoint.cpp b/coreneuron/io/nrn_checkpoint.cpp index 955848901..ecf432422 100644 --- a/coreneuron/io/nrn_checkpoint.cpp +++ b/coreneuron/io/nrn_checkpoint.cpp @@ -449,7 +449,7 @@ void CheckPoints::write_phase2(NrnThread& nt) const { d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout); pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout); (*corenrn.get_bbcore_write()[type])( - nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0); + nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0); } fh << icnt << "\n"; fh << dcnt << "\n"; @@ -478,7 +478,7 @@ void CheckPoints::write_phase2(NrnThread& nt) const { pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, 
pdsz, layout); (*corenrn.get_bbcore_write()[type])( - dArray, iArray, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0); + dArray, iArray, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0); } if (icnt) { @@ -592,6 +592,7 @@ bool CheckPoints::initialize() { ml->pdata, ml->_thread, nrn_threads, + ml, 0.0); break; } @@ -802,6 +803,7 @@ void CheckPoints::write_tqueue(NrnThread& nt, FileHandler& fh) const { ml->pdata, ml->_thread, nrn_threads, + ml, 0.0); break; } diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp index c22ffc0ce..703e853d8 100644 --- a/coreneuron/io/nrn_setup.cpp +++ b/coreneuron/io/nrn_setup.cpp @@ -754,6 +754,14 @@ void nrn_cleanup() { ml->_thread = nullptr; } + // Destroy the global variables struct allocated in nrn_init + if (auto* const priv_dtor = corenrn.get_memb_func(tml->index).private_destructor) { + (*priv_dtor)(nt, ml, tml->index); + assert(!ml->instance); + assert(!ml->global_variables); + assert(ml->global_variables_size == 0); + } + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; if (nrb) { if (nrb->_size) { diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp index a97b335bb..ad5748ad7 100644 --- a/coreneuron/io/phase2.cpp +++ b/coreneuron/io/phase2.cpp @@ -337,7 +337,7 @@ void Phase2::read_direct(int thread_id, const NrnThread& nt) { offset += nrn_soa_padded_size(nodecounts[i], layout) * param_sizes[type]; if (nodeindices_) { std::copy(nodeindices_, nodeindices_ + nodecounts[i], tml.nodeindices.data()); - free_memory(nodeindices_); + free(nodeindices_); // not free_memory because this is allocated by NEURON? 
} if (corenrn.get_is_artificial()[type]) { assert(nodeindices_ == nullptr); @@ -867,6 +867,7 @@ void Phase2::get_info_from_bbcore(NrnThread& nt, pd, ml->_thread, &nt, + ml, 0.0); } assert(dk == static_cast(tmls[i].dArray.size())); @@ -958,7 +959,8 @@ void Phase2::populate(NrnThread& nt, const UserParams& userParams) { NrnThreadMembList* tml_last = nullptr; for (int i = 0; i < n_mech; ++i) { - auto tml = create_tml(i, memb_func[mech_types[i]], shadow_rhs_cnt, mech_types, nodecounts); + auto tml = + create_tml(nt, i, memb_func[mech_types[i]], shadow_rhs_cnt, mech_types, nodecounts); nt._ml_list[tml->index] = tml->ml; diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index 42c65cb18..f47a4ebd7 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -32,10 +32,12 @@ void capacitance_reg(void) { /* all methods deal with capacitance in special ways */ register_mech(mechanism, nrn_alloc_capacitance, - (mod_f_t) 0, - (mod_f_t) 0, - (mod_f_t) 0, - (mod_f_t) nrn_init_capacitance, + nullptr, + nullptr, + nullptr, + nrn_init_capacitance, + nullptr, + nullptr, -1, 1); int mechtype = nrn_get_mechtype(mechanism[1]); diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 4bc077880..ec1fd665e 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -1,6 +1,6 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. 
@@ -94,9 +94,11 @@ void ion_reg(const char* name, double valence) { register_mech((const char**) mechanism, nrn_alloc_ion, nrn_cur_ion, - (mod_f_t) 0, - (mod_f_t) 0, - (mod_f_t) nrn_init_ion, + nullptr, + nullptr, + nrn_init_ion, + nullptr, + nullptr, -1, 1); mechtype = nrn_get_mechtype(mechanism[1]); @@ -154,70 +156,6 @@ the USEION statement of any model using this ion\n", } } -#ifndef CORENRN_USE_LEGACY_UNITS -#define CORENRN_USE_LEGACY_UNITS 0 -#endif - -#if CORENRN_USE_LEGACY_UNITS == 1 -#define FARADAY 96485.309 -#define gasconstant 8.3134 -#else -#include "coreneuron/nrnoc/nrnunits_modern.h" -#define FARADAY _faraday_codata2018 -#define gasconstant _gasconstant_codata2018 -#endif - -#define ktf (1000. * gasconstant * (celsius + 273.15) / FARADAY) - -double nrn_nernst(double ci, double co, double z, double celsius) { - /*printf("nrn_nernst %g %g %g\n", ci, co, z);*/ - if (z == 0) { - return 0.; - } - if (ci <= 0.) { - return 1e6; - } else if (co <= 0.) { - return -1e6; - } else { - return ktf / z * log(co / ci); - } -} - -nrn_pragma_omp(declare target) -void nrn_wrote_conc(int type, - double* p1, - int p2, - int it, - double** gimap, - double celsius, - int _cntml_padded) { - if (it & 040) { - int _iml = 0; - /* passing _nt to this function causes cray compiler to segfault during compilation - * hence passing _cntml_padded - */ - double* pe = p1 - p2 * _STRIDE; - pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); - } -} - -static double efun(double x) { - if (fabs(x) < 1e-4) { - return 1. 
- x / 2.; - } else { - return x / (exp(x) - 1); - } -} - -nrn_pragma_omp(end declare target) - -double nrn_ghk(double v, double ci, double co, double z) { - double temp = z * v / ktf; - double eco = co * efun(temp); - double eci = ci * efun(-temp); - return (.001) * z * FARADAY * (eci - eco); -} - #if VECTORIZE #define erev pd[0 * _STRIDE] /* From Eion */ #define conci pd[1 * _STRIDE] @@ -257,7 +195,7 @@ ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit]) double nrn_nernst_coef(int type) { /* for computing jacobian element dconc'/dconc */ - return ktf / charge; + return ktf(celsius) / charge; } /* Must be called prior to any channels which update the currents */ diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp index 85ed348f6..d18160f3a 100644 --- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp +++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp @@ -16,14 +16,14 @@ namespace coreneuron { #define _STRIDE _cntml_padded + _iml -#define _threadargscomma_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _v, +#define _threadargscomma_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _ml, _v, #define _threadargsprotocomma_ \ int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \ - double _v, -#define _threadargs_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _v + Memb_list *_ml, double _v, +#define _threadargs_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _ml, _v #define _threadargsproto_ \ int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \ - double _v + Memb_list *_ml, double _v struct Elm { unsigned row; /* Row location */ diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index ab78ad502..9427423df 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -143,6 +143,12 @@ struct Memb_list { NetSendBuffer_t* _net_send_buffer = nullptr; int 
nodecount; /* actual node count */ int _nodecount_padded; - void* instance = nullptr; /* mechanism instance */ + void* instance{nullptr}; /* mechanism instance struct */ + // nrn_acc_manager.cpp handles data movement to/from the accelerator as the + // "private constructor" in the translated MOD file code is called before + // the main nrn_acc_manager methods that copy thread/mechanism data to the + // device + void* global_variables{nullptr}; + std::size_t global_variables_size{}; }; } // namespace coreneuron diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp index 2556f0f87..ac650595c 100644 --- a/coreneuron/mechanism/membfunc.hpp +++ b/coreneuron/mechanism/membfunc.hpp @@ -1,17 +1,19 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. 
*/ - #pragma once -#include - #include "coreneuron/mechanism/mechanism.hpp" #include "coreneuron/utils/offload.hpp" +#include "coreneuron/utils/units.hpp" + +#include +#include + namespace coreneuron { using Pfrpdat = Datum* (*) (void); @@ -21,6 +23,8 @@ struct NrnThread; using mod_alloc_t = void (*)(double*, Datum*, int); using mod_f_t = void (*)(NrnThread*, Memb_list*, int); using pnt_receive_t = void (*)(Point_process*, int, double); +using thread_table_check_t = + void (*)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int); /* * Memb_func structure contains all related informations of a mechanism @@ -33,12 +37,17 @@ struct Memb_func { mod_f_t initialize; mod_f_t constructor; mod_f_t destructor; /* only for point processes */ + // These are used for CoreNEURON-internal allocation/cleanup; they are kept + // separate from the CONSTRUCTOR/DESTRUCTOR functions just above (one of + // which is apparently only for point processes) for simplicity. + mod_f_t private_constructor; + mod_f_t private_destructor; Symbol* sym; int vectorized; int thread_size_; /* how many Datum needed in Memb_list if vectorized */ void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */ void (*thread_cleanup_)(ThreadDatum*); /* before Memb_list._thread is freed */ - void (*thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, int); + thread_table_check_t thread_table_check_; int is_point; void (*setdata_)(double*, Datum*); int* dparam_semantics; /* for nrncore writing. 
*/ @@ -87,6 +96,8 @@ extern int register_mech(const char** m, mod_f_t jacob, mod_f_t stat, mod_f_t initialize, + mod_f_t private_constructor, + mod_f_t private_destructor, int nrnpointerindex, int vectorized); extern int point_register_mech(const char**, @@ -95,6 +106,8 @@ extern int point_register_mech(const char**, mod_f_t jacob, mod_f_t stat, mod_f_t initialize, + mod_f_t private_constructor, + mod_f_t private_destructor, int nrnpointerindex, mod_f_t constructor, mod_f_t destructor, @@ -110,14 +123,53 @@ extern void hoc_register_watch_check(nrn_watch_check_t, int); extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); -nrn_pragma_omp(declare target) -nrn_pragma_acc(routine seq) -extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); -nrn_pragma_acc(routine seq) -double nrn_nernst(double ci, double co, double z, double celsius); -nrn_pragma_acc(routine seq) -extern double nrn_ghk(double v, double ci, double co, double z); -nrn_pragma_omp(end declare target) +constexpr double ktf(double celsius) { + return 1000. * units::gasconstant * (celsius + 273.15) / units::faraday; +} +// std::log isn't constexpr, but there are argument values for which nrn_nernst +// is a constant expression +constexpr double nrn_nernst(double ci, double co, double z, double celsius) { + if (z == 0) { + return 0.; + } + if (ci <= 0.) { + return 1e6; + } else if (co <= 0.) 
{ + return -1e6; + } else { + return ktf(celsius) / z * std::log(co / ci); + } +} +constexpr void nrn_wrote_conc(int type, + double* p1, + int p2, + int it, + double** gimap, + double celsius, + int _cntml_padded) { + if (it & 040) { + constexpr int _iml = 0; + int const STRIDE{_cntml_padded + _iml}; + /* passing _nt to this function causes cray compiler to segfault during compilation + * hence passing _cntml_padded + */ + double* pe = p1 - p2 * STRIDE; + pe[0] = nrn_nernst(pe[1 * STRIDE], pe[2 * STRIDE], gimap[type][2], celsius); + } +} +inline double nrn_ghk(double v, double ci, double co, double z, double celsius) { + auto const efun = [](double x) { + if (std::abs(x) < 1e-4) { + return 1. - x / 2.; + } else { + return x / (std::exp(x) - 1.); + } + }; + double const temp{z * v / ktf(celsius)}; + double const eco{co * efun(+temp)}; + double const eci{ci * efun(-temp)}; + return .001 * z * units::faraday * (eci - eco); +} extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); extern void hoc_reg_ba(int, mod_f_t, int); @@ -151,6 +203,7 @@ using bbcore_read_t = void (*)(double*, Datum*, ThreadDatum*, NrnThread*, + Memb_list*, double); using bbcore_write_t = void (*)(double*, @@ -163,6 +216,7 @@ using bbcore_write_t = void (*)(double*, Datum*, ThreadDatum*, NrnThread*, + Memb_list*, double); extern int nrn_mech_depend(int type, int* dependencies); @@ -179,10 +233,6 @@ extern void artcell_net_move(void**, Point_process*, double); extern void nrn2ncs_outputevent(int netcon_output_index, double firetime); extern bool nrn_use_localgid_; extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double); -nrn_pragma_acc(routine seq) -nrn_pragma_omp(declare target) -extern int at_time(NrnThread*, double); -nrn_pragma_omp(end declare target) // _OPENACC and/or NET_RECEIVE_BUFFERING extern void net_sem_from_gpu(int, int, int, int, int, double, double); diff --git 
a/coreneuron/mechanism/nrnoc_ml.ispc b/coreneuron/mechanism/nrnoc_ml.ispc index 6b196eaf3..2c28a745f 100644 --- a/coreneuron/mechanism/nrnoc_ml.ispc +++ b/coreneuron/mechanism/nrnoc_ml.ispc @@ -57,6 +57,8 @@ struct Memb_list { uniform int nodecount; uniform int _nodecount_padded; void* uniform instance; + void* uniform global_variables; + uniform size_t global_variables_size; }; struct Point_process { @@ -153,8 +155,6 @@ struct NrnThread { void* mapping; }; -extern uniform double ispc_celsius; - // NOTE : this implementation is duplicated from "coreneuron/network/cvodestb.cpp" // If changes are required, make sure to change CPP as well. static inline int at_time(uniform NrnThread* nt, varying double te) { diff --git a/coreneuron/mechanism/patternstim.cpp b/coreneuron/mechanism/patternstim.cpp index e22b19e98..e680a6187 100644 --- a/coreneuron/mechanism/patternstim.cpp +++ b/coreneuron/mechanism/patternstim.cpp @@ -38,6 +38,7 @@ extern void pattern_stim_setup_helper(int size, Datum* _ppvar, ThreadDatum* _thread, NrnThread* _nt, + Memb_list* ml, double v); static size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double tstop); @@ -93,7 +94,7 @@ void nrn_mkPatternStim(const char* fname, double tstop) { } else { assert(0); } - pattern_stim_setup_helper(size, tvec, gidvec, _iml, _cntml, _p, _ppvar, nullptr, nt, 0.0); + pattern_stim_setup_helper(size, tvec, gidvec, _iml, _cntml, _p, _ppvar, nullptr, nt, ml, 0.0); } size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double tstop) { @@ -136,8 +137,8 @@ size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double t } // see nrn_setup.cpp:read_phase2 for how it creates NrnThreadMembList instances. 
-static NrnThreadMembList* alloc_nrn_thread_memb(int type) { - NrnThreadMembList* tml = (NrnThreadMembList*) emalloc(sizeof(NrnThreadMembList)); +static NrnThreadMembList* alloc_nrn_thread_memb(NrnThread* nt, int type) { + NrnThreadMembList* tml = (NrnThreadMembList*) ecalloc(1, sizeof(NrnThreadMembList)); tml->dependencies = nullptr; tml->ndependencies = 0; tml->index = type; @@ -148,7 +149,7 @@ static NrnThreadMembList* alloc_nrn_thread_memb(int type) { int psize = corenrn.get_prop_param_size()[type]; int dsize = corenrn.get_prop_dparam_size()[type]; int layout = corenrn.get_mech_data_layout()[type]; - tml->ml = (Memb_list*) emalloc(sizeof(Memb_list)); + tml->ml = (Memb_list*) ecalloc(1, sizeof(Memb_list)); tml->ml->nodecount = 1; tml->ml->_nodecount_padded = tml->ml->nodecount; tml->ml->nodeindices = nullptr; @@ -160,6 +161,10 @@ static NrnThreadMembList* alloc_nrn_thread_memb(int type) { tml->ml->_net_send_buffer = nullptr; tml->ml->_permute = nullptr; + if (auto* const priv_ctor = corenrn.get_memb_func(tml->index).private_constructor) { + priv_ctor(nt, tml->ml, tml->index); + } + return tml; } @@ -177,7 +182,7 @@ Point_process* nrn_artcell_instantiate(const char* mechname) { // printf("nrn_artcell_instantiate %s type=%d\n", mechname, type); // create and append to nt.tml - auto tml = alloc_nrn_thread_memb(type); + auto tml = alloc_nrn_thread_memb(nt, type); assert(nt->_ml_list[type] == nullptr); // FIXME nt->_ml_list[type] = tml->ml; diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index 01b82814c..498754d80 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -19,9 +19,7 @@ namespace coreneuron { int secondorder = 0; -nrn_pragma_omp(declare target) double t, dt, celsius, pi; -nrn_pragma_omp(end declare target) int rev_dt; using Pfrv = void (*)(); @@ -117,6 +115,8 @@ int register_mech(const char** m, mod_f_t jacob, mod_f_t stat, mod_f_t initialize, + mod_f_t 
private_constructor, + mod_f_t private_destructor, int /* nrnpointerindex */, int vectorized) { auto& memb_func = corenrn.get_memb_funcs(); @@ -144,6 +144,8 @@ int register_mech(const char** m, memb_func[type].initialize = initialize; memb_func[type].constructor = nullptr; memb_func[type].destructor = nullptr; + memb_func[type].private_constructor = private_constructor; + memb_func[type].private_destructor = private_destructor; #if VECTORIZE memb_func[type].vectorized = vectorized ? 1 : 0; memb_func[type].thread_size_ = vectorized ? (vectorized - 1) : 0; @@ -340,12 +342,23 @@ int point_register_mech(const char** m, mod_f_t jacob, mod_f_t stat, mod_f_t initialize, + mod_f_t private_constructor, + mod_f_t private_destructor, int nrnpointerindex, mod_f_t constructor, mod_f_t destructor, int vectorized) { const Symbol* s = m[1]; - register_mech(m, alloc, cur, jacob, stat, initialize, nrnpointerindex, vectorized); + register_mech(m, + alloc, + cur, + jacob, + stat, + initialize, + private_constructor, + private_destructor, + nrnpointerindex, + vectorized); register_constructor(constructor); register_destructor(destructor); return point_reg_helper(s); @@ -417,8 +430,7 @@ void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)) { corenrn.get_memb_func(i).thread_mem_init_ = f; } -void _nrn_thread_table_reg(int i, - void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, int)) { +void _nrn_thread_table_reg(int i, thread_table_check_t f) { if (i == -1) return; diff --git a/coreneuron/mechanism/register_mech.hpp b/coreneuron/mechanism/register_mech.hpp index df80d958a..07fa1ca5c 100644 --- a/coreneuron/mechanism/register_mech.hpp +++ b/coreneuron/mechanism/register_mech.hpp @@ -18,7 +18,7 @@ extern void hoc_reg_bbcore_read(int type, bbcore_read_t f); extern void hoc_reg_bbcore_write(int type, bbcore_write_t f); extern void _nrn_thread_table_reg( int i, - void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, int)); + void (*f)(int, int, double*, Datum*, 
ThreadDatum*, NrnThread*, Memb_list*, int)); extern void alloc_mech(int); } // namespace coreneuron diff --git a/coreneuron/mpi/core/nrnmpi.cpp b/coreneuron/mpi/core/resolve.cpp similarity index 100% rename from coreneuron/mpi/core/nrnmpi.cpp rename to coreneuron/mpi/core/resolve.cpp diff --git a/coreneuron/mpi/nrnmpi.h b/coreneuron/mpi/nrnmpi.h index 04df699ff..03a1d2461 100644 --- a/coreneuron/mpi/nrnmpi.h +++ b/coreneuron/mpi/nrnmpi.h @@ -81,7 +81,7 @@ struct mpi_function>: mpi_function_ba using mpi_function_base::mpi_function_base; template // in principle deducible from `function_ptr` auto operator()(Args&&... args) const { -#ifdef CORENRN_ENABLE_MPI_DYNAMIC +#ifdef CORENEURON_ENABLE_MPI_DYNAMIC // Dynamic MPI, m_fptr should have been initialised via dlsym. assert(m_fptr); return (*reinterpret_cast(m_fptr))(std::forward(args)...); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 31c18807e..bd3de5f4c 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -84,16 +84,4 @@ void fixed_play_continuous(NrnThread* nt) { } } -// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc" -// for the ISPC backend. If changes are required, make sure to change ISPC as well. -nrn_pragma_omp(declare target) -int at_time(NrnThread* nt, double te) { - double x = te - 1e-11; - if (x <= nt->_t && x > (nt->_t - nt->_dt)) { - return 1; - } - return 0; -} -nrn_pragma_omp(end declare target) - } // namespace coreneuron diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index ddfb49421..28fee5d86 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -133,12 +133,9 @@ void nrnthread_v_transfer(NrnThread* _nt) { void nrn_partrans::copy_gap_indices_to_device() { // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu. 
if (insrcdspl_) { - int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; - static_cast(n_insrc_buf); - nrn_pragma_acc(enter data create(insrc_buf_[:n_insrc_buf])) - // clang-format off - nrn_pragma_omp(target enter data map(alloc: insrc_buf_[:n_insrc_buf])) - // clang-format off + // TODO: we don't actually need to copy here, just allocate + associate + // storage on the device + cnrn_target_copyin(insrc_buf_, insrcdspl_[nrnmpi_numprocs]); } for (int tid = 0; tid < nrn_nthread; ++tid) { const NrnThread* nt = nrn_threads + tid; @@ -150,13 +147,9 @@ void nrn_partrans::copy_gap_indices_to_device() { if (!ttd.src_indices.empty()) { cnrn_target_copyin(ttd.src_indices.data(), ttd.src_indices.size()); - - size_t n_src_gather = ttd.src_gather.size(); - const double* src_gather = ttd.src_gather.data(); - static_cast(n_src_gather); - static_cast(src_gather); - nrn_pragma_acc(enter data create(src_gather[:n_src_gather])) - nrn_pragma_omp(target enter data map(alloc: src_gather[:n_src_gather])) + // TODO: we don't actually need to copy here, just allocate + + // associate storage on the device. 
+ cnrn_target_copyin(ttd.src_gather.data(), ttd.src_gather.size()); } if (ttd.insrc_indices.size()) { diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index b25a2764a..7e4cb6d4e 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -32,17 +32,9 @@ using Symbol = char; #define VEC_AREA(i) (_nt->_actual_area[(i)]) #define VECTORIZE 1 -// extern variables require acc declare -nrn_pragma_omp(declare target) extern double celsius; -nrn_pragma_acc(declare create(celsius)) - extern double pi; -nrn_pragma_acc(declare create(pi)) - extern int secondorder; -nrn_pragma_acc(declare create(secondorder)) -nrn_pragma_omp(end declare target) extern double t, dt; extern int rev_dt; diff --git a/coreneuron/nrnoc/nrnunits_modern.h b/coreneuron/nrnoc/nrnunits_modern.h deleted file mode 100644 index d93638841..000000000 --- a/coreneuron/nrnoc/nrnunits_modern.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# ============================================================================= -*/ - -#pragma once - -/** - NMODL translated MOD files get unit constants typically from - share/lib/nrnunits.lib.in. But there were other source files that - hardcode some of the constants. Here we gather a few modern units into - a single place (but, unfortunately, also in nrnunits.lib.in). Legacy units - cannot be gathered here because they can differ slightly from place to place. - - These come from https://physics.nist.gov/cuu/Constants/index.html. - Termed the "2018 CODATA recommended values", they became available - on 20 May 2019 and replace the 2014 CODATA set. 
- - See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h -**/ - - -#define _electron_charge_codata2018 1.602176634e-19 /* coulomb exact*/ -#define _avogadro_number_codata2018 6.02214076e+23 /* exact */ -#define _boltzmann_codata2018 1.380649e-23 /* joule/K exact */ -#define _faraday_codata2018 \ - (_electron_charge_codata2018 * _avogadro_number_codata2018) /* 96485.33212... coulomb/mol */ -#define _gasconstant_codata2018 \ - (_boltzmann_codata2018 * _avogadro_number_codata2018) /* 8.314462618... joule/mol-K */ - -/* e/k in K/millivolt */ -#define _e_over_k_codata2018 \ - (.001 * _electron_charge_codata2018 / _boltzmann_codata2018) /* 11.604518... K/mV */ diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index 1218b7967..d3b463a48 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -21,9 +21,9 @@ bool nrn_use_fast_imem; void fast_imem_free() { for (auto nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) { if (nt->nrn_fast_imem) { - free(nt->nrn_fast_imem->nrn_sav_rhs); - free(nt->nrn_fast_imem->nrn_sav_d); - free(nt->nrn_fast_imem); + free_memory(nt->nrn_fast_imem->nrn_sav_rhs); + free_memory(nt->nrn_fast_imem->nrn_sav_d); + free_memory(nt->nrn_fast_imem); nt->nrn_fast_imem = nullptr; } } @@ -34,7 +34,7 @@ void nrn_fast_imem_alloc() { fast_imem_free(); for (auto nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) { int n = nt->end; - nt->nrn_fast_imem = (NrnFastImem*) ecalloc(1, sizeof(NrnFastImem)); + nt->nrn_fast_imem = (NrnFastImem*) ecalloc_align(1, sizeof(NrnFastImem)); nt->nrn_fast_imem->nrn_sav_rhs = (double*) ecalloc_align(n, sizeof(double)); nt->nrn_fast_imem->nrn_sav_d = (double*) ecalloc_align(n, sizeof(double)); } diff --git a/coreneuron/sim/multicore.cpp b/coreneuron/sim/multicore.cpp index d5368a29c..b8dd293d2 100644 --- a/coreneuron/sim/multicore.cpp +++ b/coreneuron/sim/multicore.cpp @@ -61,7 +61,8 @@ static int table_check_cnt_; static ThreadDatum* table_check_; -NrnThreadMembList* 
create_tml(int mech_id, +NrnThreadMembList* create_tml(NrnThread& nt, + int mech_id, Memb_func& memb_func, int& shadow_rhs_cnt, const std::vector& mech_types, @@ -91,6 +92,10 @@ NrnThreadMembList* create_tml(int mech_id, } } + if (auto* const priv_ctor = corenrn.get_memb_func(tml->index).private_constructor) { + priv_ctor(&nt, tml->ml, tml->index); + } + return tml; } @@ -166,7 +171,7 @@ void nrn_thread_table_check() { auto tml = static_cast(table_check_[i + 1]._pvoid); Memb_list* ml = tml->ml; (*corenrn.get_memb_func(tml->index).thread_table_check_)( - 0, ml->_nodecount_padded, ml->data, ml->pdata, ml->_thread, &nt, tml->index); + 0, ml->_nodecount_padded, ml->data, ml->pdata, ml->_thread, &nt, ml, tml->index); } } } // namespace coreneuron diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp index c9b3cb58e..a6ac50be0 100644 --- a/coreneuron/sim/multicore.hpp +++ b/coreneuron/sim/multicore.hpp @@ -36,7 +36,8 @@ struct NrnThreadMembList { /* patterned after CvMembList in cvodeobj.h */ int* dependencies; /* list of mechanism types that this mechanism depends on*/ int ndependencies; /* for scheduling we need to know the dependency count */ }; -NrnThreadMembList* create_tml(int mech_id, +NrnThreadMembList* create_tml(NrnThread& nt, + int mech_id, Memb_func& memb_func, int& shadow_rhs_cnt, const std::vector& mech_types, @@ -163,6 +164,7 @@ void nrn_multithread_job(F&& job, Args&&... args) { #pragma omp parallel for private(i) shared(nrn_threads, job, nrn_nthread, \ nrnmpi_myid) schedule(static, 1) + // FIXME: multiple forwarding of the same arguments... for (i = 0; i < nrn_nthread; ++i) { job(nrn_threads + i, std::forward(args)...); } @@ -192,6 +194,13 @@ extern void direct_mode_initialize(); extern void nrn_mk_table_check(void); extern void nonvint(NrnThread* _nt); extern void update(NrnThread*); - - +// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc" +// for the ISPC backend. 
If changes are required, make sure to change ISPC as well. +constexpr int at_time(NrnThread* nt, double te) { + double x = te - 1e-11; + if (x <= nt->_t && x > (nt->_t - nt->_dt)) { + return 1; + } + return 0; +} } // namespace coreneuron diff --git a/coreneuron/sim/scopmath/errcodes.h b/coreneuron/sim/scopmath/errcodes.h index 5f32c5785..94d08f73c 100644 --- a/coreneuron/sim/scopmath/errcodes.h +++ b/coreneuron/sim/scopmath/errcodes.h @@ -1,16 +1,33 @@ -/****************************************************************************** - * - * File: errcodes.h - * - * Copyright (c) 1984, 1985, 1986, 1987, 1988, 1989, 1990 - * Duke University - * - * errcodes.h,v 1.1.1.1 1994/10/12 17:22:18 hines Exp - * - ******************************************************************************/ +/* +# ============================================================================= +# Originally errcodes.h from SCoP library, Copyright (c) 1984-90 Duke University +# ============================================================================= +# Subsequent extensive prototype and memory layout changes for CoreNEURON +# +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# =============================================================================. +*/ +#pragma once namespace coreneuron { extern int abort_run(int); -} +namespace scopmath { +/** @brief Flag to disable some code sections at compile time. + * + * Some methods, such as coreneuron::scopmath::sparse::getelm(...), decide at + * runtime whether they are simply accessors, or if they dynamically modify the + * matrix in question, possibly allocating new memory. Typically the second + * mode will be used during model initialisation, while the first will be used + * during computation/simulation. Compiling the more complicated code for the + * second mode can be problematic for targets such as GPU, where dynamic + * allocation and global state are complex. 
This enum is intended to be used as + * a template parameter to flag (at compile time) when this code can be + * omitted. + */ +enum struct enabled_code { all, compute_only }; +} // namespace scopmath +} // namespace coreneuron #define ROUNDOFF 1.e-20 #define ZERO 1.e-8 #define STEP 1.e-6 diff --git a/coreneuron/sim/scopmath/sparse_thread.hpp b/coreneuron/sim/scopmath/sparse_thread.hpp index 6614a0a1b..8d84cbb0e 100644 --- a/coreneuron/sim/scopmath/sparse_thread.hpp +++ b/coreneuron/sim/scopmath/sparse_thread.hpp @@ -67,7 +67,8 @@ inline void increase_order(SparseObj* so, unsigned row) { * biggest difference is that elements are no longer removed and this saves much * time allocating and freeing during the solve phase. */ -inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) { +template +Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) { Elm *el, *elnext; unsigned vrow = so->varord[row]; @@ -90,10 +91,14 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) { } /* insert below el */ if (!new_elem) { - new_elem = new Elm{}; - // Using array-new here causes problems in GPU compilation. - new_elem->value = static_cast(std::malloc(so->_cntml_padded * sizeof(double))); - increase_order(so, row); + if constexpr (code_to_enable == enabled_code::compute_only) { + // Dynamic allocation should not happen during the compute phase. 
+ assert(false); + } else { + new_elem = new Elm{}; + new_elem->value = new double[so->_cntml_padded]; + increase_order(so, row); + } } new_elem->r_down = el->r_down; el->r_down = new_elem; @@ -133,9 +138,13 @@ inline Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) { } /* insert above el */ if (!new_elem) { - new_elem = new Elm{}; - new_elem->value = static_cast(std::malloc(so->_cntml_padded * sizeof(double))); - increase_order(so, row); + if constexpr (code_to_enable == enabled_code::compute_only) { + assert(false); + } else { + new_elem = new Elm{}; + new_elem->value = new double[so->_cntml_padded]; + increase_order(so, row); + } } new_elem->r_up = el->r_up; el->r_up = new_elem; @@ -491,16 +500,13 @@ void create_coef_list(SparseObj* so, int n, SPFUN fun, _threadargsproto_) { fun(so, so->rhs, _threadargs_); // std::invoke in C++17 so->phase = 0; } -} // namespace sparse -} // namespace scopmath -// Methods that may be called from translated MOD files are kept outside the -// scopmath::sparse namespace. -inline double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml) { +template +double* thread_getelm(SparseObj* so, int row, int col, int _iml) { if (!so->phase) { return so->coef_list[so->ngetcall[_iml]++]; } - Elm* el = scopmath::sparse::getelm(so, (unsigned) row, (unsigned) col, nullptr); + Elm* el = scopmath::sparse::getelm(so, (unsigned) row, (unsigned) col, nullptr); if (so->phase == 1) { so->ngetcall[_iml]++; } else { @@ -508,7 +514,11 @@ inline double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml) { } return el->value; } +} // namespace sparse +} // namespace scopmath +// Methods that may be called from translated MOD files are kept outside the +// scopmath::sparse namespace. 
#define scopmath_sparse_s(arg) _p[scopmath_sparse_ix(s[arg])] #define scopmath_sparse_d(arg) _p[scopmath_sparse_ix(d[arg])] diff --git a/coreneuron/utils/ispc/globals.cpp b/coreneuron/utils/ispc/globals.cpp deleted file mode 100644 index 0344bf1b8..000000000 --- a/coreneuron/utils/ispc/globals.cpp +++ /dev/null @@ -1,17 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# ============================================================================= -*/ - -/* - * Coreneuron global variables are declared at least in the coreneuron namespace. In ispc it is, - * however, not possible to access variables within C++ namespaces. To be able to access these - * variables from ispc kernels, we declare them in global namespace and a C linkage file. - */ - -extern "C" { -double ispc_celsius; -} diff --git a/coreneuron/utils/memory.cpp b/coreneuron/utils/memory.cpp index 70d928b63..8f45487dc 100644 --- a/coreneuron/utils/memory.cpp +++ b/coreneuron/utils/memory.cpp @@ -15,7 +15,7 @@ #include namespace coreneuron { -bool unified_memory_enabled() { +bool gpu_enabled() { #ifdef CORENEURON_ENABLE_GPU return corenrn_param.gpu; #else diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h index 9a2e65645..9e612680c 100644 --- a/coreneuron/utils/memory.h +++ b/coreneuron/utils/memory.h @@ -1,6 +1,6 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. @@ -22,13 +22,13 @@ #endif namespace coreneuron { -/** @brief Check if allocate_unified will return a unified memory address. +/** + * @brief Check if GPU support is enabled. 
* - * If false, [de]allocate_unified simply forward to new/delete. It is - * convenient to include this method here to avoid having to access - * corenrn_param directly. + * This returns true if GPU support was enabled at compile time and at runtime + * via coreneuron.gpu = True and/or --gpu, otherwise it returns false. */ -bool unified_memory_enabled(); +bool gpu_enabled(); /** @brief Allocate unified memory in GPU builds iff GPU enabled, otherwise new */ diff --git a/coreneuron/utils/nrnoc_aux.hpp b/coreneuron/utils/nrnoc_aux.hpp index 3c2f23326..10b5880ea 100644 --- a/coreneuron/utils/nrnoc_aux.hpp +++ b/coreneuron/utils/nrnoc_aux.hpp @@ -34,9 +34,4 @@ extern void hoc_execerror(const char*, const char*); /* print and abort */ extern void hoc_warning(const char*, const char*); extern double hoc_Exp(double x); - -// defined in eion.cpp and this file included in translated mod files. -extern double nrn_nernst(double ci, double co, double z, double celsius); -extern double nrn_ghk(double v, double ci, double co, double z); - } // namespace coreneuron diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index 078990107..6297221e8 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -1,6 +1,6 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. 
# ============================================================================= @@ -19,48 +19,121 @@ #else #define nrn_pragma_acc(x) #define nrn_pragma_omp(x) -#include #endif #include +#include +#include namespace coreneuron { +void cnrn_target_copyin_debug(std::string_view file, + int line, + std::size_t sizeof_T, + std::type_info const& typeid_T, + void const* h_ptr, + std::size_t len, + void* d_ptr); +void cnrn_target_delete_debug(std::string_view file, + int line, + std::size_t sizeof_T, + std::type_info const& typeid_T, + void const* h_ptr, + std::size_t len); +void cnrn_target_deviceptr_debug(std::string_view file, + int line, + std::type_info const& typeid_T, + void const* h_ptr, + void* d_ptr); +void cnrn_target_is_present_debug(std::string_view file, + int line, + std::type_info const& typeid_T, + void const* h_ptr, + void* d_ptr); +void cnrn_target_memcpy_to_device_debug(std::string_view file, + int line, + std::size_t sizeof_T, + std::type_info const& typeid_T, + void const* h_ptr, + std::size_t len, + void* d_ptr); +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY) && \ + defined(__NVCOMPILER_MAJOR__) && defined(__NVCOMPILER_MINOR__) && \ + (__NVCOMPILER_MAJOR__ <= 22) && (__NVCOMPILER_MINOR__ <= 3) +// Homegrown implementation for buggy NVHPC versions (<=22.3), see +// https://forums.developer.nvidia.com/t/acc-deviceptr-does-not-work-in-openacc-code-dynamically-loaded-from-a-shared-library/211599 +#define CORENEURON_ENABLE_PRESENT_TABLE +std::pair cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr); +void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len); +void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len); +#endif + template -T* cnrn_target_deviceptr(const T* h_ptr) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ +T* cnrn_target_deviceptr_or_present(std::string_view file, + int line, + bool 
must_be_present_or_null, + const T* h_ptr) { + T* d_ptr{}; + bool error{false}; +#ifdef CORENEURON_ENABLE_PRESENT_TABLE + auto const d_ptr_and_error = cnrn_target_deviceptr_impl(must_be_present_or_null, h_ptr); + d_ptr = static_cast(d_ptr_and_error.first); + error = d_ptr_and_error.second; +#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) - return static_cast(acc_deviceptr(const_cast(h_ptr))); + d_ptr = static_cast(acc_deviceptr(const_cast(h_ptr))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENMP) - T const* d_ptr{}; - - nrn_pragma_omp(target data use_device_ptr(h_ptr)) - { d_ptr = h_ptr; } - - return const_cast(d_ptr); + if (must_be_present_or_null || omp_target_is_present(h_ptr, omp_get_default_device())) { + nrn_pragma_omp(target data use_device_ptr(h_ptr)) + { d_ptr = const_cast(h_ptr); } + } #else - throw std::runtime_error( - "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); + if (must_be_present_or_null && h_ptr) { + throw std::runtime_error( + "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); + } #endif + if (must_be_present_or_null) { + cnrn_target_deviceptr_debug(file, line, typeid(T), h_ptr, d_ptr); + } else { + cnrn_target_is_present_debug(file, line, typeid(T), h_ptr, d_ptr); + } + if (error) { + throw std::runtime_error( + "cnrn_target_deviceptr() encountered an error, you may want to try setting " + "CORENEURON_GPU_DEBUG=1"); + } + return d_ptr; } template -T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { +T* cnrn_target_copyin(std::string_view file, int line, const T* h_ptr, std::size_t len = 1) { + T* d_ptr{}; #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) - return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); + d_ptr = static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); #elif 
defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENMP) nrn_pragma_omp(target enter data map(to : h_ptr[:len])) - return cnrn_target_deviceptr(h_ptr); + nrn_pragma_omp(target data use_device_ptr(h_ptr)) + { d_ptr = const_cast(h_ptr); } #else throw std::runtime_error( "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif +#ifdef CORENEURON_ENABLE_PRESENT_TABLE + cnrn_target_copyin_update_present_table(h_ptr, d_ptr, len * sizeof(T)); +#endif + cnrn_target_copyin_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr); + return d_ptr; } template -void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { +void cnrn_target_delete(std::string_view file, int line, T* h_ptr, std::size_t len = 1) { + cnrn_target_delete_debug(file, line, sizeof(T), typeid(T), h_ptr, len); +#ifdef CORENEURON_ENABLE_PRESENT_TABLE + cnrn_target_delete_update_present_table(h_ptr, len * sizeof(T)); +#endif #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) acc_delete(h_ptr, len * sizeof(T)); @@ -74,7 +147,12 @@ void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { } template -void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { +void cnrn_target_memcpy_to_device(std::string_view file, + int line, + T* d_ptr, + const T* h_ptr, + std::size_t len = 1) { + cnrn_target_memcpy_to_device_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr); #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); @@ -93,4 +171,25 @@ void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) #endif } +template +void cnrn_target_update_on_device(std::string_view file, + int line, + const T* h_ptr, + std::size_t len = 1) { + auto* d_ptr = cnrn_target_deviceptr_or_present(file, line, true, h_ptr); + 
cnrn_target_memcpy_to_device(file, line, d_ptr, h_ptr); +} + +// Replace with std::source_location once we have C++20 +#define cnrn_target_copyin(...) cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__) +#define cnrn_target_delete(...) cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__) +#define cnrn_target_is_present(...) \ + cnrn_target_deviceptr_or_present(__FILE__, __LINE__, false, __VA_ARGS__) +#define cnrn_target_deviceptr(...) \ + cnrn_target_deviceptr_or_present(__FILE__, __LINE__, true, __VA_ARGS__) +#define cnrn_target_memcpy_to_device(...) \ + cnrn_target_memcpy_to_device(__FILE__, __LINE__, __VA_ARGS__) +#define cnrn_target_update_on_device(...) \ + cnrn_target_update_on_device(__FILE__, __LINE__, __VA_ARGS__) + } // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp index 77ff88fb3..14e2b15df 100644 --- a/coreneuron/utils/randoms/nrnran123.cpp +++ b/coreneuron/utils/randoms/nrnran123.cpp @@ -5,21 +5,22 @@ # See top-level LICENSE file for details. # =============================================================================. */ +#include "coreneuron/gpu/nrn_acc_manager.hpp" #include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/utils/memory.h" #include "coreneuron/utils/nrnmutdec.hpp" #include "coreneuron/utils/randoms/nrnran123.h" -#include -#include -#include -#include - #ifdef CORENEURON_USE_BOOST_POOL #include #include #endif +#include +#include +#include +#include + // Defining these attributes seems to help nvc++ in OpenMP target offload mode. 
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENMP) && defined(__CUDACC__) @@ -76,25 +77,28 @@ using random123_allocator = coreneuron::unified_allocatorc, g_k); +#ifdef __CUDACC__ +#define g_k_qualifiers __device__ __constant__ +#else +#define g_k_qualifiers +#endif +g_k_qualifiers philox4x32_key_t g_k{}; +// Cannot refer to g_k directly from a nrn_pragma_acc(routine seq) method like +// coreneuron_random123_philox4x32_helper, and cannot have this inlined there at +// higher optimisation levels +__attribute__((noinline)) philox4x32_key_t& global_state() { + return g_k; } } // namespace +CORENRN_HOST_DEVICE philox4x32_ctr_t +coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) { + return philox4x32(s->c, global_state()); +} + namespace coreneuron { std::size_t nrnran123_instance_count() { return g_instance_count; @@ -102,84 +106,13 @@ std::size_t nrnran123_instance_count() { /* if one sets the global, one should reset all the stream sequences. 
*/ uint32_t nrnran123_get_globalindex() { - return g_k.v[0]; -} - -void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { - *seq = s->c.v[0]; - *which = s->which_; -} - -void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { - if (which > 3) { - s->which_ = 0; - } else { - s->which_ = which; - } - s->c.v[0] = seq; - s->r = philox4x32_helper(s); -} - -void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; -} - -void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) { - *id3 = s->c.v[1]; - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; -} - -uint32_t nrnran123_ipick(nrnran123_State* s) { - uint32_t rval; - char which = s->which_; - rval = s->r.v[int{which++}]; - if (which > 3) { - which = 0; - s->c.v[0]++; - s->r = philox4x32_helper(s); - } - s->which_ = which; - return rval; -} - -double nrnran123_dblpick(nrnran123_State* s) { - return nrnran123_uint2dbl(nrnran123_ipick(s)); -} - -double nrnran123_negexp(nrnran123_State* s) { - /* min 2.3283064e-10 to max 22.18071 */ - return -std::log(nrnran123_dblpick(s)); -} - -/* at cost of a cached value we could compute two at a time. */ -double nrnran123_normal(nrnran123_State* s) { - double w, x, y; - double u1, u2; - - do { - u1 = nrnran123_dblpick(s); - u2 = nrnran123_dblpick(s); - u1 = 2. * u1 - 1.; - u2 = 2. * u2 - 1.; - w = (u1 * u1) + (u2 * u2); - } while (w > 1); - - y = std::sqrt((-2. * log(w)) / w); - x = u1 * y; - return x; -} - -double nrnran123_uint2dbl(uint32_t u) { - /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ - /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ - return ((double) u + 1.0) * SHIFT32; + return global_state().v[0]; } /* nrn123 streams are created from cpu launcher routine */ void nrnran123_set_globalindex(uint32_t gix) { // If the global seed is changing then we shouldn't have any active streams. 
+ auto& g_k = global_state(); { std::lock_guard _{g_instance_count_mutex}; if (g_instance_count != 0 && nrnmpi_myid == 0) { @@ -190,9 +123,40 @@ void nrnran123_set_globalindex(uint32_t gix) { << g_k.v[0] << ')' << std::endl; } } - g_k.v[0] = gix; - nrn_pragma_acc(update device(g_k)) - nrn_pragma_omp(target update to(g_k)) + if (g_k.v[0] != gix) { + g_k.v[0] = gix; + if (coreneuron::gpu_enabled()) { +#ifdef __CUDACC__ + { + auto const code = cudaMemcpyToSymbol(g_k, &g_k, sizeof(g_k)); + assert(code == cudaSuccess); + } + { + auto const code = cudaDeviceSynchronize(); + assert(code == cudaSuccess); + } +#else + nrn_pragma_acc(update device(g_k)) + nrn_pragma_omp(target update to(g_k)) +#endif + } + } +} + +void nrnran123_initialise_global_state_on_device() { + if (coreneuron::gpu_enabled()) { +#ifndef __CUDACC__ + nrn_pragma_acc(enter data copyin(g_k)) +#endif + } +} + +void nrnran123_destroy_global_state_on_device() { + if (coreneuron::gpu_enabled()) { +#ifndef __CUDACC__ + nrn_pragma_acc(exit data delete (g_k)) +#endif + } } /** @brief Allocate a new Random123 stream. diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index ccd3fa5db..d4108612d 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -39,6 +39,8 @@ of the full distribution available from #include #include +#include + // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU // support at all. In these two cases, request that the random123 state is // allocated using new/delete instead of CUDA unified memory. 
@@ -56,9 +58,19 @@ struct nrnran123_State { char which_; }; -struct nrnran123_array4x32 { - uint32_t v[4]; -}; +} // namespace coreneuron + +/** @brief Provide a helper function in global namespace that is declared target for OpenMP + * offloading to function correctly with NVHPC + */ +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) +philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s); +nrn_pragma_omp(end declare target) + +namespace coreneuron { +void nrnran123_initialise_global_state_on_device(); +void nrnran123_destroy_global_state_on_device(); /* global index. eg. run number */ /* all generator instances share this global index */ @@ -86,38 +98,75 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ -nrn_pragma_omp(declare target) -nrn_pragma_acc(routine seq) -void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); -nrn_pragma_acc(routine seq) -void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); -nrn_pragma_acc(routine seq) -void nrnran123_getids3(nrnran123_State*, uint32_t* id1, uint32_t* id2, uint32_t* id3); -nrn_pragma_acc(routine seq) -uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ +constexpr void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { + *seq = s->c.v[0]; + *which = s->which_; +} +constexpr void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { + *id1 = s->c.v[2]; + *id2 = s->c.v[3]; +} +constexpr void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) { + *id3 = s->c.v[1]; + *id1 = s->c.v[2]; + *id2 = s->c.v[3]; +} -/* this could be called from openacc parallel construct */ -nrn_pragma_acc(routine seq) -double nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ -/* nrnran123_dblpick minimum value is 2.3283064e-10 and max value is 1-min */ +// Uniform 0 to 2^32-1 +inline 
uint32_t nrnran123_ipick(nrnran123_State* s) { + char which = s->which_; + uint32_t rval{s->r.v[int{which++}]}; + if (which > 3) { + which = 0; + s->c.v[0]++; + s->r = coreneuron_random123_philox4x32_helper(s); + } + s->which_ = which; + return rval; +} + +constexpr double nrnran123_uint2dbl(uint32_t u) { + constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ + /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ + /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ + return (static_cast(u) + 1.0) * SHIFT32; +} + +// Uniform open interval (0,1), minimum value is 2.3283064e-10 and max value is 1-min +inline double nrnran123_dblpick(nrnran123_State* s) { + return nrnran123_uint2dbl(nrnran123_ipick(s)); +} /* this could be called from openacc parallel construct (in INITIAL block) */ -nrn_pragma_acc(routine seq) -void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which); -nrn_pragma_acc(routine seq) -double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ -/* nrnran123_negexp min value is 2.3283064e-10, max is 22.18071 */ +inline void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { + if (which > 3) { + s->which_ = 0; + } else { + s->which_ = which; + } + s->c.v[0] = seq; + s->r = coreneuron_random123_philox4x32_helper(s); +} -/* missing declaration in coreneuron */ -nrn_pragma_acc(routine seq) -double nrnran123_normal(nrnran123_State*); -nrn_pragma_acc(routine seq) -double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ +// nrnran123_negexp min value is 2.3283064e-10, max is 22.18071, mean 1.0 +inline double nrnran123_negexp(nrnran123_State* s) { + return -std::log(nrnran123_dblpick(s)); +} -/* more fundamental (stateless) (though the global index is still used) */ -nrn_pragma_acc(routine seq) -nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); -nrn_pragma_acc(routine seq) -double nrnran123_uint2dbl(uint32_t); -nrn_pragma_omp(end declare target) +/* at cost of a cached value we 
could compute two at a time. */ +inline double nrnran123_normal(nrnran123_State* s) { + double w, u1; + do { + u1 = nrnran123_dblpick(s); + double u2{nrnran123_dblpick(s)}; + u1 = 2. * u1 - 1.; + u2 = 2. * u2 - 1.; + w = (u1 * u1) + (u2 * u2); + } while (w > 1); + double y{std::sqrt((-2. * std::log(w)) / w)}; + return u1 * y; +} + +// nrnran123_gauss, nrnran123_iran were declared but not defined in CoreNEURON +// nrnran123_array4x32 was declared but not used in CoreNEURON } // namespace coreneuron diff --git a/coreneuron/utils/units.hpp b/coreneuron/utils/units.hpp new file mode 100644 index 000000000..de44343fe --- /dev/null +++ b/coreneuron/utils/units.hpp @@ -0,0 +1,38 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# ============================================================================= +*/ +#pragma once +namespace coreneuron { +namespace units { +#if CORENEURON_USE_LEGACY_UNITS == 1 +constexpr double faraday{96485.309}; +constexpr double gasconstant{8.3134}; +#else +/* NMODL translated MOD files get unit constants typically from + * share/lib/nrnunits.lib.in. But there were other source files that hardcode + * some of the constants. Here we gather a few modern units into a single place + * (but, unfortunately, also in nrnunits.lib.in). Legacy units cannot be + * gathered here because they can differ slightly from place to place. + * + * These come from https://physics.nist.gov/cuu/Constants/index.html. + * Termed the "2018 CODATA recommended values", they became available + * on 20 May 2019 and replace the 2014 CODATA set. 
+ * + * See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h + */ +namespace detail { +constexpr double electron_charge{1.602176634e-19}; // coulomb exact +constexpr double avogadro_number{6.02214076e+23}; // exact +constexpr double boltzmann{1.380649e-23}; // joule/K exact +} // namespace detail +constexpr double faraday{detail::electron_charge * detail::avogadro_number}; // 96485.33212... + // coulomb/mol +constexpr double gasconstant{detail::boltzmann * detail::avogadro_number}; // 8.314462618... + // joule/mol-K +#endif +} // namespace units +} // namespace coreneuron diff --git a/external/mod2c b/external/mod2c index 8565d3c17..469c74dc7 160000 --- a/external/mod2c +++ b/external/mod2c @@ -1 +1 @@ -Subproject commit 8565d3c178a195a489fae0623d6338c2e92cd1e5 +Subproject commit 469c74dc7d96bbc5a06a42696422154b4cd2ce28 diff --git a/external/nmodl b/external/nmodl index 7000ff612..4f45a1c8a 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 7000ff612208ed8b27837438731903c58d1786e3 +Subproject commit 4f45a1c8a9b99c64127ea795eb12952e754b775c diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index 6601f7123..135a9e722 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -38,11 +38,6 @@ MOD_OBJS_DIR = $(OUTPUT_DIR)/corenrn/build # Linked libraries gathered by CMake LDFLAGS = $(LINKFLAGS) @CORENRN_COMMON_LDFLAGS@ -CORENRNLIB_FLAGS = -L$(CORENRN_LIB_DIR) -lcoreneuron -CORENRNLIB_FLAGS += $(if @reportinglib_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@reportinglib_LIB_DIR@),) -CORENRNLIB_FLAGS += $(if @sonatareport_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@sonatareport_LIB_DIR@),) -CORENRNLIB_FLAGS += $(if @caliper_LIB_DIR@, -W$(subst ;, -W,l,-rpath,@caliper_LIB_DIR@),) -CORENRNLIB_FLAGS += $(if @caliper_LIB_DIR@,-L@caliper_LIB_DIR@,) # Includes paths gathered by CMake # coreneuron/utils/randoms goes first because it needs to override the NEURON @@ -82,7 +77,7 @@ ifeq ($(wildcard 
$(CORENRN_PERLEXE)),) endif CXXFLAGS = @CORENRN_CXX_FLAGS@ -CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) +CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ $(INCLUDES) CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ @@ -207,34 +202,39 @@ endif # main target to build binary -$(SPECIAL_EXE): coremech_lib_target +$(SPECIAL_EXE): $(corenrnmech_lib_target) @printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n" $(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \ -I$(CORENRN_INC_DIR) $(INCFLAGS) \ - -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(CORENRNLIB_FLAGS) $(LDFLAGS) \ + -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(LDFLAGS) \ + -L$(CORENRN_LIB_DIR) \ -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)' -coremech_lib_target: $(corenrnmech_lib_target) - rm -rf $(OUTPUT_DIR)/.libs/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX); \ - mkdir -p $(OUTPUT_DIR)/.libs; \ - ln -s ../lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX) $(OUTPUT_DIR)/.libs/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX) - $(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR) $(CXX_COMPILE_CMD) -c -DADDITIONAL_MECHS $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp -o $(ENGINEMECH_OBJ) # build shared library of mechanisms coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always + # extract the object files from libcoreneuron-core.a + mkdir -p $(MOD_OBJS_DIR)/libcoreneuron-core + rm -f $(MOD_OBJS_DIR)/libcoreneuron-core/*.o + # --output is only supported by modern versions of ar + (cd $(MOD_OBJS_DIR)/libcoreneuron-core && ar x $(CORENRN_LIB_DIR)/libcoreneuron-core.a) $(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \ -I$(CORENRN_INC_DIR) $(INCFLAGS) \ - $(LDFLAGS) 
$(CORENRN_LIB_DIR)/libscopmath.a \ - ${SONAME_OPTION} $(CORENRNLIB_FLAGS) -Wl,-rpath,$(CORENRN_LIB_DIR); + @CORENEURON_LINKER_START_GROUP@ \ + $(MOD_OBJS_DIR)/libcoreneuron-core/*.o @CORENEURON_LINKER_END_GROUP@ \ + $(LDFLAGS) ${SONAME_OPTION} \ + -Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR) + # cleanup + rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o # build static library of mechanisms coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always - mkdir -p $(MOD_OBJS_DIR)/scopmath; \ - cd $(MOD_OBJS_DIR)/scopmath && ar -x $(CORENRN_LIB_DIR)/libscopmath.a && cd -;\ - rm -f ${COREMECH_LIB_PATH}; \ - ar cq ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS) $(MOD_OBJS_DIR)/scopmath/*.o; + # make a libcorenrnmech.a by copying libcoreneuron-core.a and then appending + # the newly compiled objects + cp $(CORENRN_LIB_DIR)/libcoreneuron-core.a ${COREMECH_LIB_PATH} + ar r ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS) # compile cpp files to .o $(MOD_OBJS_DIR)/%.o: $(MOD_TO_CPP_DIR)/%.cpp | $(MOD_OBJS_DIR) @@ -273,7 +273,7 @@ $(MOD_OBJS_DIR): mkdir -p $(MOD_OBJS_DIR) # install binary and libraries -install: $(SPECIAL_EXE) coremech_lib_target +install: $(SPECIAL_EXE) install -d $(DESTDIR)/bin $(DESTDIR)/lib install ${COREMECH_LIB_PATH} $(DESTDIR)/lib install $(SPECIAL_EXE) $(DESTDIR)/bin diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e9cacd422..d6b334ca3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,7 +27,11 @@ find_package(Boost 1.59 QUIET COMPONENTS filesystem system atomic unit_test_fram if(Boost_FOUND) if(CORENRN_ENABLE_UNIT_TESTS) - include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) + add_library(coreneuron-unit-test INTERFACE) + target_compile_options(coreneuron-unit-test + INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) + target_include_directories(coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS}) + target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all) 
add_subdirectory(unit/cmdline_interface) add_subdirectory(unit/interleave_info) add_subdirectory(unit/alignment) diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 7b7e1e1a5..17b57084b 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -12,7 +12,7 @@ if(CORENRN_ENABLE_MPI_DYNAMIC) # building single generic mpi library libcorenrn_mpi. # ~~~ if(CORENEURON_AS_SUBPROJECT) - message(INFO "CoreNEURON integration tests are disabled with dynamic MPI") + message(STATUS "CoreNEURON integration tests are disabled with dynamic MPI") return() else() set(CORENRN_MPI_LIB_ARG @@ -31,6 +31,9 @@ set(PERMUTE2_ARGS "--cell-permute 2") set(CUDA_INTERFACE "--cuda-interface") if(CORENRN_ENABLE_GPU) set(GPU_ARGS "--gpu") + set(permutation_modes 1 2) +else() + set(permutation_modes 0 1) endif() # List of tests with arguments @@ -39,32 +42,37 @@ set(TEST_CASES_WITH_ARGS "ring_binqueue!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_binqueue --binqueue" "ring_multisend!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_multisend --multisend" "ring_spike_buffer!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_spike_buffer --spikebuf 1" - "ring_permute1!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute1 ${PERMUTE1_ARGS}" - "ring_permute2!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2 ${PERMUTE2_ARGS}" "ring_gap!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap" "ring_gap_binqueue!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_binqueue --binqueue" "ring_gap_multisend!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_multisend --multisend" - "ring_gap_permute1!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute1 ${PERMUTE1_ARGS}" - 
"ring_gap_permute2!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2 ${PERMUTE2_ARGS}" ) - -if(CORENRN_ENABLE_GPU) +set(test_suffixes "" "_binqueue" "_multisend") +foreach(cell_permute ${permutation_modes}) + list(APPEND test_suffixes "_permute${cell_permute}") list( APPEND TEST_CASES_WITH_ARGS - "ring_permute2_cudaInterface!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}" - "ring_gap_permute2_cudaInterface!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}" + "ring_permute${cell_permute}!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute${cell_permute} --cell-permute=${cell_permute}" + "ring_gap_permute${cell_permute}!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute${cell_permute} --cell-permute=${cell_permute}" ) -endif() + # As reports require MPI, do not add test if report is enabled. + if(NOT CORENRN_ENABLE_REPORTING) + list(APPEND test_suffixes "_serial_permute${cell_permute}") + list( + APPEND + TEST_CASES_WITH_ARGS + "ring_serial_permute${cell_permute}!${GPU_ARGS} --cell-permute=${cell_permute} --tstop 100. --celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial_permute${cell_permute}" + ) + endif() +endforeach() -# ~~~ -# As reports require MPI, do not add test if report is enabled. -# ~~~ -if(NOT CORENRN_ENABLE_REPORTING) +if(CORENRN_ENABLE_GPU) + list(APPEND test_suffixes "_permute2_cudaInterface") list( APPEND TEST_CASES_WITH_ARGS - "ring_serial!--tstop 100. 
--celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial" + "ring_permute2_cudaInterface!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}" + "ring_gap_permute2_cudaInterface!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}" ) endif() @@ -73,18 +81,11 @@ endif() # create them and copy reference spikes # ~~~ foreach(data_dir "ring" "ring_gap") - foreach( - test_suffix - "" - "_serial" - "_multisend" - "_binqueue" - "_savestate_permute0" - "_savestate_permute1" - "_savestate_permute2" - "_permute1" - "_permute2" - "_permute2_cudaInterface") + # Naïve foreach(test_suffix ${test_suffixes}) does not seem to handle empty suffixes correctly. + list(LENGTH test_suffixes num_suffixes) + math(EXPR num_suffixes_m1 "${num_suffixes} - 1") + foreach(suffix_index RANGE 0 ${num_suffixes_m1}) + list(GET test_suffixes ${suffix_index} test_suffix) file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/${data_dir}/out.dat.ref" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/${data_dir}${test_suffix}/") endforeach() diff --git a/tests/unit/alignment/CMakeLists.txt b/tests/unit/alignment/CMakeLists.txt index 92464350e..89da4da14 100644 --- a/tests/unit/alignment/CMakeLists.txt +++ b/tests/unit/alignment/CMakeLists.txt @@ -3,9 +3,7 @@ # # See top-level LICENSE file for details. 
# ============================================================================= -include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS}) - add_executable(alignment_test_bin alignment.cpp) -target_compile_options(alignment_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) -add_test(NAME alignment_test COMMAND ${TEST_EXEC_PREFIX} $) +target_link_libraries(alignment_test_bin coreneuron-unit-test) +add_test(NAME alignment_test COMMAND $) cpp_cc_configure_sanitizers(TARGET alignment_test_bin TEST alignment_test) diff --git a/tests/unit/cmdline_interface/CMakeLists.txt b/tests/unit/cmdline_interface/CMakeLists.txt index cd177c521..cc98ad78d 100644 --- a/tests/unit/cmdline_interface/CMakeLists.txt +++ b/tests/unit/cmdline_interface/CMakeLists.txt @@ -4,14 +4,6 @@ # See top-level LICENSE file for details. # ============================================================================= add_executable(cmd_interface_test_bin test_cmdline_interface.cpp) -target_link_libraries(cmd_interface_test_bin ${MPI_CXX_LIBRARIES} coreneuron - ${corenrn_mech_library} ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}) -target_include_directories(cmd_interface_test_bin SYSTEM - PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) -add_dependencies(cmd_interface_test_bin nrniv-core) -# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the -# NVHPC C++ compiler handle this implicitly. 
-set_target_properties(cmd_interface_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) -target_compile_options(cmd_interface_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) -add_test(NAME cmd_interface_test COMMAND ${TEST_EXEC_PREFIX} $) +target_link_libraries(cmd_interface_test_bin coreneuron-unit-test) +add_test(NAME cmd_interface_test COMMAND $) cpp_cc_configure_sanitizers(TARGET cmd_interface_test_bin TEST cmd_interface_test) diff --git a/tests/unit/cmdline_interface/test_cmdline_interface.cpp b/tests/unit/cmdline_interface/test_cmdline_interface.cpp index caef6ca14..ccd9e1f66 100644 --- a/tests/unit/cmdline_interface/test_cmdline_interface.cpp +++ b/tests/unit/cmdline_interface/test_cmdline_interface.cpp @@ -130,5 +130,5 @@ BOOST_AUTO_TEST_CASE(cmdline_interface) { // Everything has its default value, and the first `false` says not to // include default values in the output, so this should be empty - BOOST_CHECK(corenrn_param_test.app.config_to_str(false, false).empty()); + BOOST_CHECK(corenrn_param_test.config_to_str(false, false).empty()); } diff --git a/tests/unit/interleave_info/CMakeLists.txt b/tests/unit/interleave_info/CMakeLists.txt index ce69b097e..cda875eae 100644 --- a/tests/unit/interleave_info/CMakeLists.txt +++ b/tests/unit/interleave_info/CMakeLists.txt @@ -4,13 +4,6 @@ # See top-level LICENSE file for details. # ============================================================================= add_executable(interleave_info_bin check_constructors.cpp) -target_link_libraries(interleave_info_bin ${MPI_CXX_LIBRARIES} coreneuron ${corenrn_mech_library} - ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}) -add_dependencies(interleave_info_bin nrniv-core) -# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the -# NVHPC C++ compiler handle this implicitly. 
-set_target_properties(interleave_info_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) -target_compile_options(interleave_info_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) -add_test(NAME interleave_info_constructor_test COMMAND ${TEST_EXEC_PREFIX} - $) +target_link_libraries(interleave_info_bin coreneuron-unit-test) +add_test(NAME interleave_info_constructor_test COMMAND $) cpp_cc_configure_sanitizers(TARGET interleave_info_bin TEST interleave_info_constructor_test) diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt index 61d749aa9..34231b9f9 100644 --- a/tests/unit/lfp/CMakeLists.txt +++ b/tests/unit/lfp/CMakeLists.txt @@ -3,19 +3,9 @@ # # See top-level LICENSE file for details. # ============================================================================= - -include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS}) -file(GLOB lfp_test_src "*.cpp") - -add_executable(lfp_test_bin ${lfp_test_src}) -target_link_libraries(lfp_test_bin ${MPI_CXX_LIBRARIES} coreneuron ${corenrn_mech_library} - ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}) -# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the -# NVHPC C++ compiler handle this implicitly. 
-set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) -target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) -add_dependencies(lfp_test_bin nrniv-core) -add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $) +add_executable(lfp_test_bin lfp.cpp) +target_link_libraries(lfp_test_bin coreneuron-unit-test) +add_test(NAME lfp_test COMMAND $) cpp_cc_configure_sanitizers(TARGET lfp_test_bin TEST lfp_test) set_property( TEST lfp_test diff --git a/tests/unit/queueing/CMakeLists.txt b/tests/unit/queueing/CMakeLists.txt index ba3725d32..05b2a12f2 100644 --- a/tests/unit/queueing/CMakeLists.txt +++ b/tests/unit/queueing/CMakeLists.txt @@ -4,12 +4,6 @@ # See top-level LICENSE file for details. # ============================================================================= add_executable(queuing_test_bin test_queueing.cpp) -target_link_libraries(queuing_test_bin ${Boost_SYSTEM_LIBRARY} coreneuron ${corenrn_mech_library} - ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}) -add_dependencies(queuing_test_bin nrniv-core) -# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the -# NVHPC C++ compiler handle this implicitly. 
-set_target_properties(queuing_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) -target_compile_options(queuing_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) -add_test(NAME queuing_test COMMAND ${TEST_EXEC_PREFIX} $) +target_link_libraries(queuing_test_bin coreneuron-unit-test) +add_test(NAME queuing_test COMMAND $) cpp_cc_configure_sanitizers(TARGET queuing_test_bin TEST queuing_test) diff --git a/tests/unit/solver/CMakeLists.txt b/tests/unit/solver/CMakeLists.txt index 1d01ea4b1..f8bc52287 100644 --- a/tests/unit/solver/CMakeLists.txt +++ b/tests/unit/solver/CMakeLists.txt @@ -1,19 +1,9 @@ # ============================================================================= -# Copyright (C) 2022 Blue Brain Project +# Copyright (c) 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # ============================================================================= - -include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS}) add_executable(test-solver test_solver.cpp) -target_link_libraries(test-solver coreneuron ${corenrn_mech_library}) -target_include_directories(test-solver SYSTEM - PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) - -# Tell CMake *not* to run an explicit device code linker step (which will produce errors); let the -# NVHPC C++ compiler handle this implicitly. 
-set_target_properties(test-solver PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) -target_compile_options(test-solver PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) -add_dependencies(test-solver nrniv-core) +target_link_libraries(test-solver coreneuron-unit-test) add_test(NAME test-solver COMMAND $) cpp_cc_configure_sanitizers(TARGET test-solver TEST test-solver) diff --git a/tests/unit/solver/test_solver.cpp b/tests/unit/solver/test_solver.cpp index 6511f03e1..c1021bcb7 100644 --- a/tests/unit/solver/test_solver.cpp +++ b/tests/unit/solver/test_solver.cpp @@ -196,7 +196,9 @@ struct SetupThreads { } ~SetupThreads() { - delete_nrnthreads_on_device(nrn_threads, nrn_nthread); + if (corenrn_param.gpu) { + delete_nrnthreads_on_device(nrn_threads, nrn_nthread); + } for (auto& nt: *this) { free_memory(std::exchange(nt._data, nullptr)); delete[] std::exchange(nt._permute, nullptr); From 8290df28aead01ee058df4ecc8d5726f91582dca Mon Sep 17 00:00:00 2001 From: Jorge Blanco Alonso Date: Wed, 27 Oct 2021 16:07:58 +0200 Subject: [PATCH 4/9] Read lfp factors from neurodamus --- coreneuron/io/nrn_filehandler.hpp | 8 +++++++- coreneuron/io/nrn_setup.cpp | 2 +- coreneuron/io/nrnsection_mapping.hpp | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/coreneuron/io/nrn_filehandler.hpp b/coreneuron/io/nrn_filehandler.hpp index 65cab4b97..26d960ef0 100644 --- a/coreneuron/io/nrn_filehandler.hpp +++ b/coreneuron/io/nrn_filehandler.hpp @@ -14,6 +14,7 @@ #include #include "coreneuron/utils/nrn_assert.h" +#include "coreneuron/io/nrnsection_mapping.hpp" namespace coreneuron { /** Encapsulate low-level reading of coreneuron input data files. 
@@ -110,7 +111,7 @@ class FileHandler { * Read count no of mappings for section to segment */ template - int read_mapping_info(T* mapinfo) { + int read_mapping_info(T* mapinfo, NrnThreadMappingInfo* ntmapping) { int nsec, nseg, n_scan; char line_buf[max_line_length], name[max_line_length]; @@ -123,14 +124,19 @@ class FileHandler { if (nseg) { std::vector sec, seg; + std::vector lfp_factors; sec.reserve(nseg); seg.reserve(nseg); + lfp_factors.reserve(nseg); read_array(&sec[0], nseg); read_array(&seg[0], nseg); + read_array(&lfp_factors[0], nseg); for (int i = 0; i < nseg; i++) { mapinfo->add_segment(sec[i], seg[i]); + ntmapping->add_segment_id(seg[i]); + ntmapping->add_segment_lfp_factor(seg[i], lfp_factors[i]); } } return nseg; diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp index 703e853d8..e24f1d424 100644 --- a/coreneuron/io/nrn_setup.cpp +++ b/coreneuron/io/nrn_setup.cpp @@ -959,7 +959,7 @@ void read_phase3(NrnThread& nt, UserParams& userParams) { // read section-segment mapping for every section list for (int j = 0; j < nseclist; j++) { SecMapping* smap = new SecMapping(); - F.read_mapping_info(smap); + F.read_mapping_info(smap, ntmapping); cmap->add_sec_map(smap); } diff --git a/coreneuron/io/nrnsection_mapping.hpp b/coreneuron/io/nrnsection_mapping.hpp index d7524fc42..9756fca8d 100644 --- a/coreneuron/io/nrnsection_mapping.hpp +++ b/coreneuron/io/nrnsection_mapping.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace coreneuron { @@ -153,6 +154,12 @@ struct NrnThreadMappingInfo { /** list of cells mapping */ std::vector mappingvec; + /** list of segment ids */ + std::vector segment_ids; + + /** map containing segment ids an its respective lfp factors */ + std::unordered_map lfp_factors; + /** @brief number of cells */ size_t size() const { return mappingvec.size(); @@ -181,5 +188,15 @@ struct NrnThreadMappingInfo { void add_cell_mapping(CellMapping* c) { mappingvec.push_back(c); } + + /** @brief add a new segment */ + 
void add_segment_id(const int segment_id) { + segment_ids.push_back(segment_id); + } + + /** @brief add the lfp factor of a segment_id */ + void add_segment_lfp_factor(const int segment_id, double factor) { + lfp_factors.insert({segment_id, factor}); + } }; } // namespace coreneuron From 99d94963fb6e610aac1369b52a9d6d3ab9325448 Mon Sep 17 00:00:00 2001 From: Jorge Blanco Alonso Date: Fri, 5 Nov 2021 17:32:51 +0100 Subject: [PATCH 5/9] Add lfp report type --- coreneuron/io/nrn_filehandler.hpp | 3 ++ coreneuron/io/nrnsection_mapping.hpp | 2 ++ coreneuron/io/reports/nrnreport.hpp | 3 +- .../reports/report_configuration_parser.cpp | 3 ++ coreneuron/io/reports/report_event.cpp | 21 +++++++++++ coreneuron/io/reports/report_event.hpp | 1 + coreneuron/io/reports/report_handler.cpp | 36 +++++++++++++++++++ coreneuron/io/reports/report_handler.hpp | 3 ++ .../io/reports/sonata_report_handler.cpp | 1 + 9 files changed, 72 insertions(+), 1 deletion(-) diff --git a/coreneuron/io/nrn_filehandler.hpp b/coreneuron/io/nrn_filehandler.hpp index 26d960ef0..ad3ed2eca 100644 --- a/coreneuron/io/nrn_filehandler.hpp +++ b/coreneuron/io/nrn_filehandler.hpp @@ -133,6 +133,9 @@ class FileHandler { read_array(&seg[0], nseg); read_array(&lfp_factors[0], nseg); + std::cout << "=====> NEW CoreNEURON!" 
<< std::endl; + std::cout << "nseg = " << nseg << std::endl; + for (int i = 0; i < nseg; i++) { mapinfo->add_segment(sec[i], seg[i]); ntmapping->add_segment_id(seg[i]); diff --git a/coreneuron/io/nrnsection_mapping.hpp b/coreneuron/io/nrnsection_mapping.hpp index 9756fca8d..29836a24b 100644 --- a/coreneuron/io/nrnsection_mapping.hpp +++ b/coreneuron/io/nrnsection_mapping.hpp @@ -160,6 +160,8 @@ struct NrnThreadMappingInfo { /** map containing segment ids an its respective lfp factors */ std::unordered_map lfp_factors; + std::vector _lfp; + /** @brief number of cells */ size_t size() const { return mappingvec.size(); diff --git a/coreneuron/io/reports/nrnreport.hpp b/coreneuron/io/reports/nrnreport.hpp index 7cdc05806..ac85d75be 100644 --- a/coreneuron/io/reports/nrnreport.hpp +++ b/coreneuron/io/reports/nrnreport.hpp @@ -76,7 +76,8 @@ enum ReportType { SynapseReport, IMembraneReport, SectionReport, - SummationReport + SummationReport, + LFPReport }; // enumerate that defines the section type for a Section report diff --git a/coreneuron/io/reports/report_configuration_parser.cpp b/coreneuron/io/reports/report_configuration_parser.cpp index 6c69662b7..528c269a6 100644 --- a/coreneuron/io/reports/report_configuration_parser.cpp +++ b/coreneuron/io/reports/report_configuration_parser.cpp @@ -138,6 +138,9 @@ std::vector create_report_configurations(const std::string& report_type = SynapseReport; } else if (report.type_str == "summation") { report_type = SummationReport; + } else if (report.type_str == "lfp") { + nrn_use_fast_imem = true; + report_type = LFPReport; } else { std::cerr << "Report error: unsupported type " << report.type_str << std::endl; nrn_abort(1); diff --git a/coreneuron/io/reports/report_event.cpp b/coreneuron/io/reports/report_event.cpp index 78eb6b268..139fc88b0 100644 --- a/coreneuron/io/reports/report_event.cpp +++ b/coreneuron/io/reports/report_event.cpp @@ -10,6 +10,7 @@ #include "coreneuron/sim/multicore.hpp" #include 
"coreneuron/io/reports/nrnreport.hpp" #include "coreneuron/utils/nrn_assert.h" +#include "coreneuron/io/nrnsection_mapping.hpp" #ifdef ENABLE_BIN_REPORTS #include "reportinglib/Records.h" #endif // ENABLE_BIN_REPORTS @@ -72,12 +73,32 @@ void ReportEvent::summation_alu(NrnThread* nt) { } } +void ReportEvent::lfp_calc(NrnThread* nt) { + // Calculate lfp only on reporting steps + if (step > 0 && (static_cast(step) % reporting_period) == 0) { + auto* mapinfo = static_cast(nt->mapping); + double sum = 0.0; + double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; + for (const auto& kv: mapinfo->lfp_factors) { + int segment_id = kv.first; + double factor = kv.second; + if(std::isnan(factor)) { + factor = 0.0; + } + std::cout << segment_id << " - " << factor << std::endl; + sum += fast_imem_rhs[segment_id] * factor; + } + mapinfo->_lfp[0] = sum; + }mdi +} + /** on deliver, call ReportingLib and setup next event */ void ReportEvent::deliver(double t, NetCvode* nc, NrnThread* nt) { /* reportinglib is not thread safe */ #pragma omp critical { summation_alu(nt); + lfp_calc(nt); // each thread needs to know its own step #ifdef ENABLE_BIN_REPORTS records_nrec(step, gids_to_report.size(), gids_to_report.data(), report_path.data()); diff --git a/coreneuron/io/reports/report_event.hpp b/coreneuron/io/reports/report_event.hpp index 20d325614..cf2d7bd19 100644 --- a/coreneuron/io/reports/report_event.hpp +++ b/coreneuron/io/reports/report_event.hpp @@ -42,6 +42,7 @@ class ReportEvent: public DiscreteEvent { void deliver(double t, NetCvode* nc, NrnThread* nt) override; bool require_checkpoint() override; void summation_alu(NrnThread* nt); + void lfp_calc(NrnThread* nt); private: double dt; diff --git a/coreneuron/io/reports/report_handler.cpp b/coreneuron/io/reports/report_handler.cpp index 84341e7a4..fd385cd68 100644 --- a/coreneuron/io/reports/report_handler.cpp +++ b/coreneuron/io/reports/report_handler.cpp @@ -35,6 +35,7 @@ void ReportHandler::create_report(double dt, double 
tstop, double delay) { continue; } const std::vector& nodes_to_gid = map_gids(nt); + auto* mapinfo = static_cast(nt.mapping); VarsToReport vars_to_report; bool is_soma_target; switch (m_report_config.type) { @@ -58,6 +59,13 @@ void ReportHandler::create_report(double dt, double tstop, double delay) { nodes_to_gid); register_custom_report(nt, m_report_config, vars_to_report); break; + case LFPReport: + mapinfo->_lfp.resize(12); + vars_to_report = get_lfp_vars_to_report(nt, m_report_config, mapinfo->_lfp.data()); + is_soma_target = m_report_config.section_type == SectionType::Soma || + m_report_config.section_type == SectionType::Cell; + register_section_report(nt, m_report_config, vars_to_report, is_soma_target); + break; default: vars_to_report = get_synapse_vars_to_report(nt, m_report_config, nodes_to_gid); register_custom_report(nt, m_report_config, vars_to_report); @@ -341,6 +349,34 @@ VarsToReport ReportHandler::get_synapse_vars_to_report( return vars_to_report; } +VarsToReport ReportHandler::get_lfp_vars_to_report(const NrnThread& nt, + ReportConfiguration& report, + double* report_variable) const { + VarsToReport vars_to_report; + /*const auto* mapinfo = static_cast(nt.mapping); + if (!mapinfo) { + std::cerr << "[LFP] Error : mapping information is missing for a Cell group " + << nt.ncell << '\n'; + nrn_abort(1); + }*/ + for (int i = 0; i < nt.ncell; i++) { + int gid = nt.presyns[i].gid_; + if (report.target.find(gid) == report.target.end()) { + continue; + } + + std::vector to_report; + // Add all electrodes to the first gid for now + std::vector electrode_ids = {0}; + for (const auto& electrode_id : electrode_ids) { + double* variable = report_variable + electrode_id; + to_report.push_back(VarWithMapping(electrode_id, variable)); + } + vars_to_report[gid] = to_report; + } + return vars_to_report; +} + // map GIDs of every compartment, it consist in a backward sweep then forward sweep algorithm std::vector ReportHandler::map_gids(const NrnThread& nt) const { 
std::vector nodes_gid(nt.end, -1); diff --git a/coreneuron/io/reports/report_handler.hpp b/coreneuron/io/reports/report_handler.hpp index 084886c96..db62bf993 100644 --- a/coreneuron/io/reports/report_handler.hpp +++ b/coreneuron/io/reports/report_handler.hpp @@ -44,6 +44,9 @@ class ReportHandler { VarsToReport get_synapse_vars_to_report(const NrnThread& nt, ReportConfiguration& report, const std::vector& nodes_to_gids) const; + VarsToReport get_lfp_vars_to_report(const NrnThread& nt, + ReportConfiguration& report, + double* report_variable) const; std::vector map_gids(const NrnThread& nt) const; #endif // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS) protected: diff --git a/coreneuron/io/reports/sonata_report_handler.cpp b/coreneuron/io/reports/sonata_report_handler.cpp index 99f5709d5..bc38ef27b 100644 --- a/coreneuron/io/reports/sonata_report_handler.cpp +++ b/coreneuron/io/reports/sonata_report_handler.cpp @@ -57,6 +57,7 @@ std::pair SonataReportHandler::get_population_info(int gid) { void SonataReportHandler::register_report(const NrnThread& nt, ReportConfiguration& config, const VarsToReport& vars_to_report) { + std::cout << "Registering report " << config.output_path.data() << std::endl; sonata_create_report(config.output_path.data(), config.start, config.stop, From c1b29ccab43b2cfc3b06b3bacabf19501bbfb280 Mon Sep 17 00:00:00 2001 From: Jorge Blanco Alonso Date: Fri, 3 Dec 2021 11:52:28 +0100 Subject: [PATCH 6/9] Calculate 1 lfp value per gid/timestep --- coreneuron/io/nrn_filehandler.hpp | 7 ++--- coreneuron/io/nrn_setup.cpp | 2 +- coreneuron/io/nrnsection_mapping.hpp | 16 +++++----- coreneuron/io/reports/report_event.cpp | 38 ++++++++++++++++-------- coreneuron/io/reports/report_event.hpp | 4 ++- coreneuron/io/reports/report_handler.cpp | 20 ++++--------- 6 files changed, 45 insertions(+), 42 deletions(-) diff --git a/coreneuron/io/nrn_filehandler.hpp b/coreneuron/io/nrn_filehandler.hpp index ad3ed2eca..9688f9575 100644 --- 
a/coreneuron/io/nrn_filehandler.hpp +++ b/coreneuron/io/nrn_filehandler.hpp @@ -111,7 +111,7 @@ class FileHandler { * Read count no of mappings for section to segment */ template - int read_mapping_info(T* mapinfo, NrnThreadMappingInfo* ntmapping) { + int read_mapping_info(T* mapinfo, NrnThreadMappingInfo* ntmapping, CellMapping* cmap) { int nsec, nseg, n_scan; char line_buf[max_line_length], name[max_line_length]; @@ -133,13 +133,10 @@ class FileHandler { read_array(&seg[0], nseg); read_array(&lfp_factors[0], nseg); - std::cout << "=====> NEW CoreNEURON!" << std::endl; - std::cout << "nseg = " << nseg << std::endl; - for (int i = 0; i < nseg; i++) { mapinfo->add_segment(sec[i], seg[i]); ntmapping->add_segment_id(seg[i]); - ntmapping->add_segment_lfp_factor(seg[i], lfp_factors[i]); + cmap->add_segment_lfp_factor(seg[i], lfp_factors[i]); } } return nseg; diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp index e24f1d424..19d5b5297 100644 --- a/coreneuron/io/nrn_setup.cpp +++ b/coreneuron/io/nrn_setup.cpp @@ -959,7 +959,7 @@ void read_phase3(NrnThread& nt, UserParams& userParams) { // read section-segment mapping for every section list for (int j = 0; j < nseclist; j++) { SecMapping* smap = new SecMapping(); - F.read_mapping_info(smap, ntmapping); + F.read_mapping_info(smap, ntmapping, cmap); cmap->add_sec_map(smap); } diff --git a/coreneuron/io/nrnsection_mapping.hpp b/coreneuron/io/nrnsection_mapping.hpp index 29836a24b..5b9c7bd57 100644 --- a/coreneuron/io/nrnsection_mapping.hpp +++ b/coreneuron/io/nrnsection_mapping.hpp @@ -74,6 +74,9 @@ struct CellMapping { /** list of section lists (like soma, axon, apic) */ std::vector secmapvec; + /** map containing segment ids an its respective lfp factors */ + std::unordered_map lfp_factors; + CellMapping(int g) : gid(g) {} @@ -138,6 +141,11 @@ struct CellMapping { return count; } + /** @brief add the lfp factor of a segment_id */ + void add_segment_lfp_factor(const int segment_id, double factor) { + 
lfp_factors.insert({segment_id, factor}); + } + ~CellMapping() { for (size_t i = 0; i < secmapvec.size(); i++) { delete secmapvec[i]; @@ -157,9 +165,6 @@ struct NrnThreadMappingInfo { /** list of segment ids */ std::vector segment_ids; - /** map containing segment ids an its respective lfp factors */ - std::unordered_map lfp_factors; - std::vector _lfp; /** @brief number of cells */ @@ -195,10 +200,5 @@ struct NrnThreadMappingInfo { void add_segment_id(const int segment_id) { segment_ids.push_back(segment_id); } - - /** @brief add the lfp factor of a segment_id */ - void add_segment_lfp_factor(const int segment_id, double factor) { - lfp_factors.insert({segment_id, factor}); - } }; } // namespace coreneuron diff --git a/coreneuron/io/reports/report_event.cpp b/coreneuron/io/reports/report_event.cpp index 139fc88b0..9deaff30c 100644 --- a/coreneuron/io/reports/report_event.cpp +++ b/coreneuron/io/reports/report_event.cpp @@ -25,11 +25,13 @@ ReportEvent::ReportEvent(double dt, double tstart, const VarsToReport& filtered_gids, const char* name, - double report_dt) + double report_dt, + ReportType type) : dt(dt) , tstart(tstart) , report_path(name) , report_dt(report_dt) + , report_type(type) , vars_to_report(filtered_gids) { nrn_assert(filtered_gids.size()); step = tstart / dt; @@ -77,19 +79,27 @@ void ReportEvent::lfp_calc(NrnThread* nt) { // Calculate lfp only on reporting steps if (step > 0 && (static_cast(step) % reporting_period) == 0) { auto* mapinfo = static_cast(nt->mapping); - double sum = 0.0; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; - for (const auto& kv: mapinfo->lfp_factors) { - int segment_id = kv.first; - double factor = kv.second; - if(std::isnan(factor)) { - factor = 0.0; - } - std::cout << segment_id << " - " << factor << std::endl; - sum += fast_imem_rhs[segment_id] * factor; + + for (const auto& kv: vars_to_report) { + int gid = kv.first; + const auto& to_report = kv.second; + const auto& cell_mapping = 
mapinfo->get_cell_mapping(gid); + + int count = 0; + double sum = 0.0; + for (const auto& kv: cell_mapping->lfp_factors) { + int segment_id = kv.first; + double factor = kv.second; + if(std::isnan(factor)) { + factor = 0.0; + } + sum += fast_imem_rhs[segment_id] * factor; + count++; + } + *(to_report.front().var_value) = sum; } - mapinfo->_lfp[0] = sum; - }mdi + } } /** on deliver, call ReportingLib and setup next event */ @@ -98,7 +108,9 @@ void ReportEvent::deliver(double t, NetCvode* nc, NrnThread* nt) { #pragma omp critical { summation_alu(nt); - lfp_calc(nt); + if (report_type == ReportType::LFPReport) { + lfp_calc(nt); + } // each thread needs to know its own step #ifdef ENABLE_BIN_REPORTS records_nrec(step, gids_to_report.size(), gids_to_report.data(), report_path.data()); diff --git a/coreneuron/io/reports/report_event.hpp b/coreneuron/io/reports/report_event.hpp index cf2d7bd19..0f1a07358 100644 --- a/coreneuron/io/reports/report_event.hpp +++ b/coreneuron/io/reports/report_event.hpp @@ -36,7 +36,8 @@ class ReportEvent: public DiscreteEvent { double tstart, const VarsToReport& filtered_gids, const char* name, - double report_dt); + double report_dt, + ReportType type); /** on deliver, call ReportingLib and setup next event */ void deliver(double t, NetCvode* nc, NrnThread* nt) override; @@ -53,6 +54,7 @@ class ReportEvent: public DiscreteEvent { std::vector gids_to_report; double tstart; VarsToReport vars_to_report; + ReportType report_type; }; #endif // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS) diff --git a/coreneuron/io/reports/report_handler.cpp b/coreneuron/io/reports/report_handler.cpp index fd385cd68..1b69cd6ff 100644 --- a/coreneuron/io/reports/report_handler.cpp +++ b/coreneuron/io/reports/report_handler.cpp @@ -60,7 +60,8 @@ void ReportHandler::create_report(double dt, double tstop, double delay) { register_custom_report(nt, m_report_config, vars_to_report); break; case LFPReport: - mapinfo->_lfp.resize(12); + // 1 lfp value 
per gid + mapinfo->_lfp.resize(nt.ncell); vars_to_report = get_lfp_vars_to_report(nt, m_report_config, mapinfo->_lfp.data()); is_soma_target = m_report_config.section_type == SectionType::Soma || m_report_config.section_type == SectionType::Cell; @@ -75,7 +76,8 @@ void ReportHandler::create_report(double dt, double tstop, double delay) { t, vars_to_report, m_report_config.output_path.data(), - m_report_config.report_dt); + m_report_config.report_dt, + m_report_config.type); report_event->send(t, net_cvode_instance, &nt); m_report_events.push_back(std::move(report_event)); } @@ -353,12 +355,6 @@ VarsToReport ReportHandler::get_lfp_vars_to_report(const NrnThread& nt, ReportConfiguration& report, double* report_variable) const { VarsToReport vars_to_report; - /*const auto* mapinfo = static_cast(nt.mapping); - if (!mapinfo) { - std::cerr << "[LFP] Error : mapping information is missing for a Cell group " - << nt.ncell << '\n'; - nrn_abort(1); - }*/ for (int i = 0; i < nt.ncell; i++) { int gid = nt.presyns[i].gid_; if (report.target.find(gid) == report.target.end()) { @@ -366,12 +362,8 @@ VarsToReport ReportHandler::get_lfp_vars_to_report(const NrnThread& nt, } std::vector to_report; - // Add all electrodes to the first gid for now - std::vector electrode_ids = {0}; - for (const auto& electrode_id : electrode_ids) { - double* variable = report_variable + electrode_id; - to_report.push_back(VarWithMapping(electrode_id, variable)); - } + double* variable = report_variable + i; + to_report.push_back(VarWithMapping(i, variable)); vars_to_report[gid] = to_report; } return vars_to_report; From 7b3787e4a6cfd3584b9d7c4ffc98bd478a3ce338 Mon Sep 17 00:00:00 2001 From: Jorge Blanco Alonso Date: Thu, 6 Jan 2022 18:05:14 +0100 Subject: [PATCH 7/9] Remove cout message --- coreneuron/io/reports/sonata_report_handler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/coreneuron/io/reports/sonata_report_handler.cpp b/coreneuron/io/reports/sonata_report_handler.cpp index 
bc38ef27b..99f5709d5 100644 --- a/coreneuron/io/reports/sonata_report_handler.cpp +++ b/coreneuron/io/reports/sonata_report_handler.cpp @@ -57,7 +57,6 @@ std::pair SonataReportHandler::get_population_info(int gid) { void SonataReportHandler::register_report(const NrnThread& nt, ReportConfiguration& config, const VarsToReport& vars_to_report) { - std::cout << "Registering report " << config.output_path.data() << std::endl; sonata_create_report(config.output_path.data(), config.start, config.stop, From be0210b26685bc272989b3981ba3a54e13641b36 Mon Sep 17 00:00:00 2001 From: Jorge Blanco Alonso Date: Tue, 5 Apr 2022 18:32:03 +0200 Subject: [PATCH 8/9] Take into account IClamp for the lfp calculation --- coreneuron/io/reports/report_event.cpp | 16 ++++++++++++---- coreneuron/io/reports/report_handler.cpp | 17 ++++++++++++++--- coreneuron/io/reports/report_handler.hpp | 3 ++- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/coreneuron/io/reports/report_event.cpp b/coreneuron/io/reports/report_event.cpp index 9deaff30c..5a943bf02 100644 --- a/coreneuron/io/reports/report_event.cpp +++ b/coreneuron/io/reports/report_event.cpp @@ -80,7 +80,7 @@ void ReportEvent::lfp_calc(NrnThread* nt) { if (step > 0 && (static_cast(step) % reporting_period) == 0) { auto* mapinfo = static_cast(nt->mapping); double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; - + auto& summation_report = nt->summation_report_handler_->summation_reports_[report_path]; for (const auto& kv: vars_to_report) { int gid = kv.first; const auto& to_report = kv.second; @@ -94,7 +94,13 @@ void ReportEvent::lfp_calc(NrnThread* nt) { if(std::isnan(factor)) { factor = 0.0; } - sum += fast_imem_rhs[segment_id] * factor; + double iclamp = 0.0; + for (const auto& value: summation_report.currents_[segment_id]) { + double current_value = *value.first; + int scale = value.second; + iclamp += current_value * scale; + } + sum += (fast_imem_rhs[segment_id] + iclamp) * factor; count++; } 
*(to_report.front().var_value) = sum; @@ -107,8 +113,10 @@ void ReportEvent::deliver(double t, NetCvode* nc, NrnThread* nt) { /* reportinglib is not thread safe */ #pragma omp critical { - summation_alu(nt); - if (report_type == ReportType::LFPReport) { + if (report_type == ReportType::SummationReport) { + summation_alu(nt); + } + else if (report_type == ReportType::LFPReport) { lfp_calc(nt); } // each thread needs to know its own step diff --git a/coreneuron/io/reports/report_handler.cpp b/coreneuron/io/reports/report_handler.cpp index 1b69cd6ff..53efb137d 100644 --- a/coreneuron/io/reports/report_handler.cpp +++ b/coreneuron/io/reports/report_handler.cpp @@ -62,7 +62,7 @@ void ReportHandler::create_report(double dt, double tstop, double delay) { case LFPReport: // 1 lfp value per gid mapinfo->_lfp.resize(nt.ncell); - vars_to_report = get_lfp_vars_to_report(nt, m_report_config, mapinfo->_lfp.data()); + vars_to_report = get_lfp_vars_to_report(nt, m_report_config, mapinfo->_lfp.data(), nodes_to_gid); is_soma_target = m_report_config.section_type == SectionType::Soma || m_report_config.section_type == SectionType::Cell; register_section_report(nt, m_report_config, vars_to_report, is_soma_target); @@ -353,14 +353,25 @@ VarsToReport ReportHandler::get_synapse_vars_to_report( VarsToReport ReportHandler::get_lfp_vars_to_report(const NrnThread& nt, ReportConfiguration& report, - double* report_variable) const { + double* report_variable, + const std::vector& nodes_to_gids) const { + auto& summation_report = nt.summation_report_handler_->summation_reports_[report.output_path]; VarsToReport vars_to_report; for (int i = 0; i < nt.ncell; i++) { int gid = nt.presyns[i].gid_; if (report.target.find(gid) == report.target.end()) { continue; } - + // IClamp is needed for the LFP calculation + auto mech_id = nrn_get_mechtype("IClamp"); + Memb_list* ml = nt._ml_list[mech_id]; + for (int j = 0; j < ml->nodecount; j++) { + auto segment_id = ml->nodeindices[j]; + if 
((nodes_to_gids[segment_id] == gid)) { + double* var_value = get_var_location_from_var_name(mech_id, "i", ml, j); + summation_report.currents_[segment_id].push_back(std::make_pair(var_value, -1)); + } + } std::vector to_report; double* variable = report_variable + i; to_report.push_back(VarWithMapping(i, variable)); diff --git a/coreneuron/io/reports/report_handler.hpp b/coreneuron/io/reports/report_handler.hpp index db62bf993..746edbaee 100644 --- a/coreneuron/io/reports/report_handler.hpp +++ b/coreneuron/io/reports/report_handler.hpp @@ -46,7 +46,8 @@ class ReportHandler { const std::vector& nodes_to_gids) const; VarsToReport get_lfp_vars_to_report(const NrnThread& nt, ReportConfiguration& report, - double* report_variable) const; + double* report_variable, + const std::vector& nodes_to_gids) const; std::vector map_gids(const NrnThread& nt) const; #endif // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS) protected: From 54b4d266ceab6399db5a4cc652b84b23247d19c5 Mon Sep 17 00:00:00 2001 From: Jorge Blanco Alonso Date: Wed, 6 Apr 2022 11:22:18 +0200 Subject: [PATCH 9/9] Clang format and other minor changes --- coreneuron/io/reports/report_event.cpp | 29 ++++++++++++------------ coreneuron/io/reports/report_handler.cpp | 3 ++- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/coreneuron/io/reports/report_event.cpp b/coreneuron/io/reports/report_event.cpp index 5a943bf02..6189c4526 100644 --- a/coreneuron/io/reports/report_event.cpp +++ b/coreneuron/io/reports/report_event.cpp @@ -89,19 +89,19 @@ void ReportEvent::lfp_calc(NrnThread* nt) { int count = 0; double sum = 0.0; for (const auto& kv: cell_mapping->lfp_factors) { - int segment_id = kv.first; - double factor = kv.second; - if(std::isnan(factor)) { - factor = 0.0; - } - double iclamp = 0.0; - for (const auto& value: summation_report.currents_[segment_id]) { - double current_value = *value.first; - int scale = value.second; - iclamp += current_value * scale; - } - sum += 
(fast_imem_rhs[segment_id] + iclamp) * factor; - count++; + int segment_id = kv.first; + double factor = kv.second; + if (std::isnan(factor)) { + factor = 0.0; + } + double iclamp = 0.0; + for (const auto& value: summation_report.currents_[segment_id]) { + double current_value = *value.first; + int scale = value.second; + iclamp += current_value * scale; + } + sum += (fast_imem_rhs[segment_id] + iclamp) * factor; + count++; } *(to_report.front().var_value) = sum; } @@ -115,8 +115,7 @@ void ReportEvent::deliver(double t, NetCvode* nc, NrnThread* nt) { { if (report_type == ReportType::SummationReport) { summation_alu(nt); - } - else if (report_type == ReportType::LFPReport) { + } else if (report_type == ReportType::LFPReport) { lfp_calc(nt); } // each thread needs to know its own step diff --git a/coreneuron/io/reports/report_handler.cpp b/coreneuron/io/reports/report_handler.cpp index 53efb137d..e720f331e 100644 --- a/coreneuron/io/reports/report_handler.cpp +++ b/coreneuron/io/reports/report_handler.cpp @@ -62,7 +62,8 @@ void ReportHandler::create_report(double dt, double tstop, double delay) { case LFPReport: // 1 lfp value per gid mapinfo->_lfp.resize(nt.ncell); - vars_to_report = get_lfp_vars_to_report(nt, m_report_config, mapinfo->_lfp.data(), nodes_to_gid); + vars_to_report = + get_lfp_vars_to_report(nt, m_report_config, mapinfo->_lfp.data(), nodes_to_gid); is_soma_target = m_report_config.section_type == SectionType::Soma || m_report_config.section_type == SectionType::Cell; register_section_report(nt, m_report_config, vars_to_report, is_soma_target);