Rensselaer Center for Open Source Software

nbody: Don't require global consistency for tree build

4 files changed, 178 lines added, 83 lines removed

Changes

--- nbody/kernels/nbody_kernels.cl ab9fb943603c709a9649f227cf0cf3319c70f813
+++ nbody/kernels/nbody_kernels.cl e6b75a871824178a02a32a113419a6b81dd4591e
@@ -146,7 +146,7 @@
-    uint doneCnt;
+    int doneCnt;
@@ -436,6 +436,7 @@
+            _treeStatus->doneCnt = 0;
@@ -458,16 +459,42 @@
+#define LOCK (-2)
+
+/* FIXME: should maybe have separate threadcount, but
+   Should have attributes most similar to integration */
+__attribute__ ((reqd_work_group_size(THREADS7, 1, 1)))
+__kernel void NBODY_KERNEL(cellSanitize)
+{
+    const int bottom = NBODY; /* Wipe all cells */
+    int inc = get_local_size(0) * get_num_groups(0);
+    int k = (bottom & (-WARPSIZE)) + get_global_id(0);  /* Align to warp size */
+    if (k < bottom)
+        k += inc;
+
+    while (k <= NNODE) /* Iterate over all cells assigned to thread */
+    {
+        _posX[k] = NAN;
+        _posY[k] = NAN;
+        _posZ[k] = NAN;
+
+        k += inc;
+    }
+}
+
+
-    __local volatile int deadCount;
+    __local volatile int successCount;
+    __local int doneCount;
+
-    uint i = get_global_id(0);
+    int i = get_global_id(0);
@@ -477,31 +504,28 @@
-        deadCount = 0;
+        doneCount = _treeStatus->doneCnt;
+        successCount = 0;
-    if (i >= maxNBody)
-    {
-        (void) atom_inc(&deadCount);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-
-    while (deadCount != THREADS2)
-    {
-        /* Avoid conditional barrier when some items finish earlier than others */
-        if (i < maxNBody)
+    if (doneCount == NBODY)
+        return;
+
+    while (i < NBODY)
+    {
+        if (i < NBODY)
+            bool posNotReady;
+                posNotReady = false;
@@ -521,7 +545,7 @@
-            while (ch >= NBODY && depth <= MAXDEPTH)  /* Follow path to leaf cell */
+            while (ch >= NBODY && !posNotReady && depth <= MAXDEPTH)  /* Follow path to leaf cell */
@@ -530,6 +554,15 @@
+
+
+                /* Test if we don't have a consistent view. We
+                   initialized these all to NAN so we can be sure we
+                   have a good view once actually written.
+
+                   This is in case we don't have cross-workgroup global memory consistency
+                 */
+                posNotReady = isnan(pnx) || isnan(pny) || isnan(pnz);
@@ -542,10 +575,12 @@
-            if (ch != -2) /* Skip if child pointer is locked and try again later */
+            /* Skip if child pointer is locked, or the same particle, and try again later */
+            if (ch != LOCK && (ch != i) && !posNotReady)
-                if (ch == atom_cmpxchg(&_child[locked], ch, -2)) /* Try to lock */
+
+                if (ch == atom_cmpxchg(&_child[locked], ch, LOCK)) /* Try to lock */
@@ -580,6 +615,8 @@
+                            cl_assert(_treeStatus, !isnan(nx) && !isnan(ny) && !isnan(nz));
+
@@ -632,7 +669,9 @@
-                            /* Repeat until the two bodies are different children */
+
+                            /* Repeat until the two bodies are
+                             * different children or we overflow */
@@ -640,22 +679,24 @@
-                    mem_fence(CLK_GLOBAL_MEM_FENCE);
+
-                    i += inc;  /* Move on to next body */
-                    newParticle = true;
-
-                    if (i >= maxNBody)
-                    {
-                        (void) atom_inc(&deadCount);
-                    }
+                    atom_inc(&successCount);
+
+            i += inc;  /* Move on to next body */
+            newParticle = true;
+    }
+
+    if (get_local_id(0) == 0)
+    {
+        (void) atomic_add(&_treeStatus->doneCnt, successCount);
@@ -1133,7 +1174,7 @@
-    unsigned int result;
+    uint result;
--- nbody/src/nbody_cl.c ab9fb943603c709a9649f227cf0cf3319c70f813
+++ nbody/src/nbody_cl.c e6b75a871824178a02a32a113419a6b81dd4591e
@@ -37,7 +37,7 @@
-    cl_uint doneCnt;
+    cl_int doneCnt;
@@ -100,11 +100,14 @@
-static cl_uint nbFindMaxDepthForDevice(const DevInfo* di, const NBodyWorkSizes* ws, cl_bool useQuad)
+cl_uint nbFindMaxDepthForDevice(const DevInfo* di, const NBodyWorkSizes* ws, cl_bool useQuad)
+
+    /* TODO: We should be able to reduce this; this is usually quite a
+     * bit deeper than we can go before hitting precision limits */
@@ -144,14 +147,17 @@
+
-        ws->global[1] = ws->threads[1];
+        //ws->global[1] = 3 * ws->threads[1] * blocks;
+        ws->global[1] = 2 * ws->threads[1] * blocks;
+
@@ -402,6 +408,7 @@
+        err |= nbSetKernelArguments(k->cellSanitize, st->nbb, exact);
@@ -434,6 +441,7 @@
+    err |= clReleaseKernel_quiet(kernels->cellSanitize);
@@ -559,7 +567,8 @@
-                 nbFindMaxDepthForDevice(di, st->workSizes, ctx->useQuad),
+
+                 st->maxDepth,
@@ -618,6 +627,7 @@
+    kernels->cellSanitize = mwCreateKernel(program, "cellSanitize");
@@ -627,7 +637,8 @@
-    return (   kernels->boundingBox
+    return (   kernels->cellSanitize
+            && kernels->boundingBox
@@ -891,9 +902,7 @@
-                mw_printf("(%s (%u))\n",
-                          showNBodyKernelError(ts.errorCode),
-                          nbFindMaxDepthForDevice(&ci->di, st->workSizes, st->usesQuad));
+                mw_printf("(%s (%u))\n", showNBodyKernelError(ts.errorCode), st->maxDepth);
@@ -944,7 +953,6 @@
-    TreeStatus treeStatus;
@@ -1010,63 +1018,98 @@
-    cl_int err;
-    size_t chunk;
-    size_t nChunk;
-    cl_int upperBound;
-    size_t offset[1];
-    cl_event boxEv, sumEv, sortEv, quadEv;
+    cl_int err = CL_SUCCESS;
+    TreeStatus treeStatus;
+
+    NBodyBuffers* nbb = st->nbb;
-    cl_int effNBody = st->effNBody;
+    cl_uint iterations = 0;
+    cl_event sanitizeEv = NULL;
+    cl_event boxEv = NULL;
+    cl_event sumEv = NULL;
+    cl_event sortEv = NULL;
+    cl_event quadEv = NULL;
+
+    err = clEnqueueNDRangeKernel(ci->queue, kernels->cellSanitize, 1,
+                                 NULL, &ws->global[6], &ws->local[6],
+                                 0, NULL, &sanitizeEv);
+    if (err != CL_SUCCESS)
+        goto tree_build_exit;
-        return err;
-
-    nChunk     = st->ignoreResponsive ?        1 : mwDivRoundup((size_t) effNBody, ws->global[1]);
-    upperBound = st->ignoreResponsive ? effNBody : (cl_int) ws->global[1];
-    for (chunk = 0, offset[0] = 0; chunk < nChunk; ++chunk, offset[0] += ws->global[1])
+        goto tree_build_exit;
+
+    /* Repeat the tree construction kernel until all bodies have been successfully inserted */
+    do
-
-        if (upperBound > effNBody)
-            upperBound = effNBody;
-
-        err = clSetKernelArg(kernels->buildTree, 28, sizeof(cl_int), &upperBound);
-        if (err != CL_SUCCESS)
-            return err;
+        cl_event readEv;
+
+        /*
+          TODO: We can save somewhat on launch overhead and extra
+          reads by enqueuing a number of iterations based on a running
+          average of how many it has taken
+         */
-                                     offset, &ws->global[1], &ws->local[1],
+                                     NULL, &ws->global[1], &ws->local[1],
-            return err;
-
-        upperBound += (cl_int) ws->global[1];
-        ws->timings[1] += waitReleaseEventWithTime(ev);
-    }
-
+            goto tree_build_exit;
+
+        err = clFlush(ci->queue);
+        if (err != CL_SUCCESS)
+        {
+            clReleaseEvent(ev);
+            goto tree_build_exit;
+        }
+
+
+        err = clEnqueueReadBuffer(ci->queue,
+                                  nbb->treeStatus,
+                                  CL_TRUE,
+                                  0, sizeof(treeStatus), &treeStatus,
+                                  0, NULL, &readEv);
+        if (err != CL_SUCCESS)
+        {
+            clReleaseEvent(ev);
+            goto tree_build_exit;
+        }
+
+        ++iterations;
+
+        ws->timings[1] += mwReleaseEventWithTimingMS(ev);
+        ws->timings[1] += mwReleaseEventWithTimingMS(readEv);
+
+        if (treeStatus.maxDepth > st->maxDepth)
+        {
+            mw_printf("Overflow during tree construction\n");
+            err = MW_CL_ERROR;
+            goto tree_build_exit;
+        }
+    }
+    while (treeStatus.doneCnt != st->effNBody);
-        return err;
-
-    /* FIXME: This does not work unless ALL of the threads are
-     * launched at once. This may be bad when we need
-     * responsiveness. This also means it will always hang with
-     * CPUs. It seems to be fast enough though in every case I've
-     * tried. */
+        goto tree_build_exit;
+
-        return err;
+        goto tree_build_exit;
+
+    err = clFlush(ci->queue);
+    if (err != CL_SUCCESS)
+        goto tree_build_exit;
@@ -1074,19 +1117,30 @@
-            return err;
-    }
-
+            goto tree_build_exit;
+    }
+
+    err = clFinish(ci->queue);
+    if (err != CL_SUCCESS)
+        goto tree_build_exit;
+
+
+tree_build_exit:
-    ws->chunkTimings[1] = ws->timings[1] / (double) nChunk;
-    ws->timings[2] += waitReleaseEventWithTime(sumEv);
-    ws->timings[3] += waitReleaseEventWithTime(sortEv);
+    ws->chunkTimings[1] = ws->timings[1] / (double) iterations;
+
+    /* Pretend the sanitize kernel doesn't exist, include it's time as
+     * part of buildTree since it exists to support it anyway */
+    ws->timings[1] += mwReleaseEventWithTimingMS(sanitizeEv);
+
+    ws->timings[2] += mwReleaseEventWithTimingMS(sumEv);
+    ws->timings[3] += mwReleaseEventWithTimingMS(sortEv);
-        ws->timings[4] += waitReleaseEventWithTime(quadEv);
-    }
-
-    return CL_SUCCESS;
+        ws->timings[4] += mwReleaseEventWithTimingMS(quadEv);
+    }
+
+    return err;
@@ -1128,7 +1182,7 @@
-        err = clSetKernelArg(forceKern, 28, sizeof(cl_int), &upperBound);
+        err = clSetKernelArg(forceKern, 28, sizeof(cl_uint), &upperBound);
Milkyway@Home • 58 weeks ago