Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Fixing race condition in cell permute 2 : OpenACC [done] / OpenMP [WIP : performance optimization]
  • Loading branch information
Christos Kotsalos committed Aug 9, 2022
commit bb0aae1d5640ce25a6640c8e7ffeb543c7a8595c
50 changes: 18 additions & 32 deletions coreneuron/permute/cellorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,25 +478,25 @@ static void bksub_interleaved(NrnThread* nt,
}

// icore ranges [0:warpsize) ; stride[ncycle]
nrn_pragma_acc(routine vector)
nrn_pragma_omp(declare target)
static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) {
int icycle = ncycle - 1;
int istride = stride[icycle];
int i = lastnode - istride + icore;
#ifndef CORENEURON_ENABLE_GPU
int ii = i;
#endif

// execute until all tree depths are executed
bool has_subtrees_to_compute = true;

// clang-format off
nrn_pragma_acc(loop seq)
for (; has_subtrees_to_compute; ) { // ncycle loop
#ifndef CORENEURON_ENABLE_GPU
// serial test, gpu does this in parallel
nrn_pragma_acc(loop vector)
nrn_pragma_omp(simd)
for (int icore = 0; icore < warpsize; ++icore) {
int i = ii + icore;
#endif
if (icore < istride) { // most efficient if istride equal warpsize
// what is the index
int ip = GPU_PARENT(i);
Expand All @@ -508,9 +508,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
nrn_pragma_omp(atomic update)
GPU_RHS(ip) -= p * GPU_RHS(i);
}
#ifndef CORENEURON_ENABLE_GPU
}
#endif
// if finished with all tree depths then ready to break
// (note that break is not allowed in OpenACC)
if (icycle == 0) {
Expand All @@ -520,53 +518,47 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
--icycle;
istride = stride[icycle];
i -= istride;
#ifndef CORENEURON_ENABLE_GPU
ii -= istride;
#endif
}
// clang-format on
}
nrn_pragma_omp(end declare target)

// icore ranges [0:warpsize) ; stride[ncycle]
nrn_pragma_acc(routine vector)
nrn_pragma_omp(declare target)
static void bksub_interleaved2(NrnThread* nt,
int root,
int lastroot,
int icore,
int ncycle,
int* stride,
int firstnode) {
#ifndef CORENEURON_ENABLE_GPU
for (int i = root; i < lastroot; i += 1) {
#else
nrn_pragma_acc(loop seq)
for (int i = root; i < lastroot; i += warpsize) {
#endif
for (int i = root; i < lastroot; i += 1) {
GPU_RHS(i) /= GPU_D(i); // the root
}

int i = firstnode + icore;
#ifndef CORENEURON_ENABLE_GPU
int ii = i;
#endif
nrn_pragma_acc(loop seq)
for (int icycle = 0; icycle < ncycle; ++icycle) {
int istride = stride[icycle];
#ifndef CORENEURON_ENABLE_GPU
// serial test, gpu does this in parallel
nrn_pragma_acc(loop vector)
nrn_pragma_omp(simd)
for (int icore = 0; icore < warpsize; ++icore) {
int i = ii + icore;
#endif
if (icore < istride) {
int ip = GPU_PARENT(i);
GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
GPU_RHS(i) /= GPU_D(i);
}
i += istride;
#ifndef CORENEURON_ENABLE_GPU
}
ii += istride;
#endif
}
}
nrn_pragma_omp(end declare target)

/**
* \brief Solve Hines matrices/cells with compartment-based granularity.
Expand Down Expand Up @@ -600,15 +592,14 @@ void solve_interleaved2(int ith) {
defined(_OPENACC)
int nstride = stridedispl[nwarp];
#endif
nrn_pragma_acc(parallel loop gang vector vector_length(
warpsize) present(nt [0:1],
nrn_pragma_acc(parallel loop gang present(nt [0:1],
strides [0:nstride],
ncycles [0:nwarp],
stridedispl [0:nwarp + 1],
rootbegin [0:nwarp + 1],
nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
for (int icore = 0; icore < ncore; ++icore) {
nrn_pragma_omp(target teams distribute if(nt->compute_gpu))
for (int icore = 0; icore < ncore; icore += warpsize) {
int iwarp = icore / warpsize; // figure out the >> value
int ic = icore & (warpsize - 1); // figure out the & mask
int ncycle = ncycles[iwarp];
Expand All @@ -617,14 +608,9 @@ void solve_interleaved2(int ith) {
int lastroot = rootbegin[iwarp + 1];
int firstnode = nodebegin[iwarp];
int lastnode = nodebegin[iwarp + 1];
#ifndef CORENEURON_ENABLE_GPU
if (ic == 0) { // serial test mode. triang and bksub do all cores in warp
#endif
triang_interleaved2(nt, ic, ncycle, stride, lastnode);
bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
#ifndef CORENEURON_ENABLE_GPU
} // serial test mode
#endif

triang_interleaved2(nt, ic, ncycle, stride, lastnode);
bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
}
nrn_pragma_acc(wait(nt->stream_id))
#ifdef _OPENACC
Expand Down