nbody: Don't require global consistency for tree build
4 files changed, 178 lines added, 83 lines removed
Changes
--- nbody/kernels/nbody_kernels.cl ab9fb943603c709a9649f227cf0cf3319c70f813
+++ nbody/kernels/nbody_kernels.cl e6b75a871824178a02a32a113419a6b81dd4591e
@@ -146,7 +146,7 @@
- uint doneCnt;
+ int doneCnt;
@@ -436,6 +436,7 @@
+ _treeStatus->doneCnt = 0;
@@ -458,16 +459,42 @@
+#define LOCK (-2)
+
+/* FIXME: should maybe have separate threadcount, but
+ Should have attributes most similar to integration */
+__attribute__ ((reqd_work_group_size(THREADS7, 1, 1)))
+__kernel void NBODY_KERNEL(cellSanitize)
+{+ const int bottom = NBODY; /* Wipe all cells */
+ int inc = get_local_size(0) * get_num_groups(0);
+ int k = (bottom & (-WARPSIZE)) + get_global_id(0); /* Align to warp size */
+ if (k < bottom)
+ k += inc;
+
+ while (k <= NNODE) /* Iterate over all cells assigned to thread */
+ {+ _posX[k] = NAN;
+ _posY[k] = NAN;
+ _posZ[k] = NAN;
+
+ k += inc;
+ }
+}
+
+
- __local volatile int deadCount;
+ __local volatile int successCount;
+ __local int doneCount;
+
- uint i = get_global_id(0);
+ int i = get_global_id(0);
@@ -477,31 +504,28 @@
- deadCount = 0;
+ doneCount = _treeStatus->doneCnt;
+ successCount = 0;
- if (i >= maxNBody)
- {- (void) atom_inc(&deadCount);
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
-
- while (deadCount != THREADS2)
- {- /* Avoid conditional barrier when some items finish earlier than others */
- if (i < maxNBody)
+ if (doneCount == NBODY)
+ return;
+
+ while (i < NBODY)
+ {+ if (i < NBODY)
+ bool posNotReady;
+ posNotReady = false;
@@ -521,7 +545,7 @@
- while (ch >= NBODY && depth <= MAXDEPTH) /* Follow path to leaf cell */
+ while (ch >= NBODY && !posNotReady && depth <= MAXDEPTH) /* Follow path to leaf cell */
@@ -530,6 +554,15 @@
+
+
+ /* Test if we don't have a consistent view. We
+ initialized these all to NAN so we can be sure we
+ have a good view once actually written.
+
+ This is in case we don't have cross-workgroup global memory consistency
+ */
+ posNotReady = isnan(pnx) || isnan(pny) || isnan(pnz);
@@ -542,10 +575,12 @@
- if (ch != -2) /* Skip if child pointer is locked and try again later */
+ /* Skip if child pointer is locked, or the same particle, and try again later */
+ if (ch != LOCK && (ch != i) && !posNotReady)
- if (ch == atom_cmpxchg(&_child[locked], ch, -2)) /* Try to lock */
+
+ if (ch == atom_cmpxchg(&_child[locked], ch, LOCK)) /* Try to lock */
@@ -580,6 +615,8 @@
+ cl_assert(_treeStatus, !isnan(nx) && !isnan(ny) && !isnan(nz));
+
@@ -632,7 +669,9 @@
- /* Repeat until the two bodies are different children */
+
+ /* Repeat until the two bodies are
+ * different children or we overflow */
@@ -640,22 +679,24 @@
- mem_fence(CLK_GLOBAL_MEM_FENCE);
+
- i += inc; /* Move on to next body */
- newParticle = true;
-
- if (i >= maxNBody)
- {- (void) atom_inc(&deadCount);
- }
+ atom_inc(&successCount);
+
+ i += inc; /* Move on to next body */
+ newParticle = true;
+ }
+
+ if (get_local_id(0) == 0)
+ {+ (void) atomic_add(&_treeStatus->doneCnt, successCount);
@@ -1133,7 +1174,7 @@
- unsigned int result;
+ uint result;
--- nbody/src/nbody_cl.c ab9fb943603c709a9649f227cf0cf3319c70f813
+++ nbody/src/nbody_cl.c e6b75a871824178a02a32a113419a6b81dd4591e
@@ -37,7 +37,7 @@
- cl_uint doneCnt;
+ cl_int doneCnt;
@@ -100,11 +100,14 @@
-static cl_uint nbFindMaxDepthForDevice(const DevInfo* di, const NBodyWorkSizes* ws, cl_bool useQuad)
+cl_uint nbFindMaxDepthForDevice(const DevInfo* di, const NBodyWorkSizes* ws, cl_bool useQuad)
+
+ /* TODO: We should be able to reduce this; this is usually quite a
+ * bit deeper than we can go before hitting precision limits */
@@ -144,14 +147,17 @@
+
- ws->global[1] = ws->threads[1];
+ //ws->global[1] = 3 * ws->threads[1] * blocks;
+ ws->global[1] = 2 * ws->threads[1] * blocks;
+
@@ -402,6 +408,7 @@
+ err |= nbSetKernelArguments(k->cellSanitize, st->nbb, exact);
@@ -434,6 +441,7 @@
+ err |= clReleaseKernel_quiet(kernels->cellSanitize);
@@ -559,7 +567,8 @@
- nbFindMaxDepthForDevice(di, st->workSizes, ctx->useQuad),
+
+ st->maxDepth,
@@ -618,6 +627,7 @@
+ kernels->cellSanitize = mwCreateKernel(program, "cellSanitize");
@@ -627,7 +637,8 @@
- return ( kernels->boundingBox
+ return ( kernels->cellSanitize
+ && kernels->boundingBox
@@ -891,9 +902,7 @@
- mw_printf("(%s (%u))\n",- showNBodyKernelError(ts.errorCode),
- nbFindMaxDepthForDevice(&ci->di, st->workSizes, st->usesQuad));
+ mw_printf("(%s (%u))\n", showNBodyKernelError(ts.errorCode), st->maxDepth);@@ -944,7 +953,6 @@
- TreeStatus treeStatus;
@@ -1010,63 +1018,98 @@
- cl_int err;
- size_t chunk;
- size_t nChunk;
- cl_int upperBound;
- size_t offset[1];
- cl_event boxEv, sumEv, sortEv, quadEv;
+ cl_int err = CL_SUCCESS;
+ TreeStatus treeStatus;
+
+ NBodyBuffers* nbb = st->nbb;
- cl_int effNBody = st->effNBody;
+ cl_uint iterations = 0;
+ cl_event sanitizeEv = NULL;
+ cl_event boxEv = NULL;
+ cl_event sumEv = NULL;
+ cl_event sortEv = NULL;
+ cl_event quadEv = NULL;
+
+ err = clEnqueueNDRangeKernel(ci->queue, kernels->cellSanitize, 1,
+ NULL, &ws->global[6], &ws->local[6],
+ 0, NULL, &sanitizeEv);
+ if (err != CL_SUCCESS)
+ goto tree_build_exit;
- return err;
-
- nChunk = st->ignoreResponsive ? 1 : mwDivRoundup((size_t) effNBody, ws->global[1]);
- upperBound = st->ignoreResponsive ? effNBody : (cl_int) ws->global[1];
- for (chunk = 0, offset[0] = 0; chunk < nChunk; ++chunk, offset[0] += ws->global[1])
+ goto tree_build_exit;
+
+ /* Repeat the tree construction kernel until all bodies have been successfully inserted */
+ do
-
- if (upperBound > effNBody)
- upperBound = effNBody;
-
- err = clSetKernelArg(kernels->buildTree, 28, sizeof(cl_int), &upperBound);
- if (err != CL_SUCCESS)
- return err;
+ cl_event readEv;
+
+ /*
+ TODO: We can save somewhat on launch overhead and extra
+ reads by enqueuing a number of iterations based on a running
+ average of how many it has taken
+ */
- offset, &ws->global[1], &ws->local[1],
+ NULL, &ws->global[1], &ws->local[1],
- return err;
-
- upperBound += (cl_int) ws->global[1];
- ws->timings[1] += waitReleaseEventWithTime(ev);
- }
-
+ goto tree_build_exit;
+
+ err = clFlush(ci->queue);
+ if (err != CL_SUCCESS)
+ {+ clReleaseEvent(ev);
+ goto tree_build_exit;
+ }
+
+
+ err = clEnqueueReadBuffer(ci->queue,
+ nbb->treeStatus,
+ CL_TRUE,
+ 0, sizeof(treeStatus), &treeStatus,
+ 0, NULL, &readEv);
+ if (err != CL_SUCCESS)
+ {+ clReleaseEvent(ev);
+ goto tree_build_exit;
+ }
+
+ ++iterations;
+
+ ws->timings[1] += mwReleaseEventWithTimingMS(ev);
+ ws->timings[1] += mwReleaseEventWithTimingMS(readEv);
+
+ if (treeStatus.maxDepth > st->maxDepth)
+ {+ mw_printf("Overflow during tree construction\n");+ err = MW_CL_ERROR;
+ goto tree_build_exit;
+ }
+ }
+ while (treeStatus.doneCnt != st->effNBody);
- return err;
-
- /* FIXME: This does not work unless ALL of the threads are
- * launched at once. This may be bad when we need
- * responsiveness. This also means it will always hang with
- * CPUs. It seems to be fast enough though in every case I've
- * tried. */
+ goto tree_build_exit;
+
- return err;
+ goto tree_build_exit;
+
+ err = clFlush(ci->queue);
+ if (err != CL_SUCCESS)
+ goto tree_build_exit;
@@ -1074,19 +1117,30 @@
- return err;
- }
-
+ goto tree_build_exit;
+ }
+
+ err = clFinish(ci->queue);
+ if (err != CL_SUCCESS)
+ goto tree_build_exit;
+
+
+tree_build_exit:
- ws->chunkTimings[1] = ws->timings[1] / (double) nChunk;
- ws->timings[2] += waitReleaseEventWithTime(sumEv);
- ws->timings[3] += waitReleaseEventWithTime(sortEv);
+ ws->chunkTimings[1] = ws->timings[1] / (double) iterations;
+
+ /* Pretend the sanitize kernel doesn't exist, include it's time as
+ * part of buildTree since it exists to support it anyway */
+ ws->timings[1] += mwReleaseEventWithTimingMS(sanitizeEv);
+
+ ws->timings[2] += mwReleaseEventWithTimingMS(sumEv);
+ ws->timings[3] += mwReleaseEventWithTimingMS(sortEv);
- ws->timings[4] += waitReleaseEventWithTime(quadEv);
- }
-
- return CL_SUCCESS;
+ ws->timings[4] += mwReleaseEventWithTimingMS(quadEv);
+ }
+
+ return err;
@@ -1128,7 +1182,7 @@
- err = clSetKernelArg(forceKern, 28, sizeof(cl_int), &upperBound);
+ err = clSetKernelArg(forceKern, 28, sizeof(cl_uint), &upperBound);