Merge pull request arrayfire#548 from pavanky/scan_fixes

umar456 · umar456 · commit 26975c319ca0 · 2015-03-30T01:29:13.000-04:00
Bug fixes for accum in CUDA and OpenCL backends
diff --git a/src/backend/cuda/kernel/scan_dim.hpp b/src/backend/cuda/kernel/scan_dim.hpp
@@ -33,18 +33,18 @@ namespace kernel
                                 uint blocks_dim,
                                 uint lim)
     {
-        const uint tidx = threadIdx.x;
-        const uint tidy = threadIdx.y;
-        const uint tid  = tidy * THREADS_X + tidx;
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+        const int tid  = tidy * THREADS_X + tidx;
 
-        const uint zid = blockIdx.x / blocks_x;
-        const uint wid = blockIdx.y / blocks_y;
-        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
-        const uint xid = blockIdx_x * blockDim.x + tidx;
-        const uint yid = blockIdx_y; // yid  of output. updated for input later.
+        const int zid = blockIdx.x / blocks_x;
+        const int wid = blockIdx.y / blocks_y;
+        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
+        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int xid = blockIdx_x * blockDim.x + tidx;
+        const int yid = blockIdx_y; // yid  of output. updated for input later.
 
-        uint ids[4] = {xid, yid, zid, wid};
+        int ids[4] = {xid, yid, zid, wid};
 
         const Ti *iptr = in.ptr;
         To *optr = out.ptr;
@@ -54,22 +54,22 @@ namespace kernel
         // There are blockDim.y elements per block for in
         // Hence increment ids[dim] just after offseting out and before offsetting in
         tptr += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] + ids[1] * tmp.strides[1] + ids[0];
-        const uint blockIdx_dim = ids[dim];
+        const int blockIdx_dim = ids[dim];
 
         ids[dim] = ids[dim] * blockDim.y * lim + tidy;
         optr  += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
         iptr  += ids[3] *  in.strides[3] + ids[2] *  in.strides[2] + ids[1] *  in.strides[1] + ids[0];
-        uint id_dim = ids[dim];
-        const uint out_dim = out.dims[dim];
+        int id_dim = ids[dim];
+        const int out_dim = out.dims[dim];
 
         bool is_valid =
             (ids[0] < out.dims[0]) &&
             (ids[1] < out.dims[1]) &&
             (ids[2] < out.dims[2]) &&
             (ids[3] < out.dims[3]);
 
-        const uint ostride_dim = out.strides[dim];
-        const uint istride_dim =  in.strides[dim];
+        const int ostride_dim = out.strides[dim];
+        const int istride_dim =  in.strides[dim];
 
         __shared__ To s_val[THREADS_X * DIMY * 2];
         __shared__ To s_tmp[THREADS_X];
@@ -92,7 +92,8 @@ namespace kernel
             *sptr = val;
             __syncthreads();
 
-            uint start = 0;
+            int start = 0;
+#pragma unroll
             for (int off = 1; off < DIMY; off *= 2) {
 
                 if (tidy >= off) val = binop(val, sptr[(start - off) * THREADS_X]);
@@ -103,6 +104,7 @@ namespace kernel
             }
 
             val = binop(val, s_tmp[tidx]);
+            __syncthreads();
             if (cond) *optr = val;
 
             id_dim += blockDim.y;
@@ -127,17 +129,17 @@ namespace kernel
                                  uint blocks_dim,
                                  uint lim)
     {
-        const uint tidx = threadIdx.x;
-        const uint tidy = threadIdx.y;
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
 
-        const uint zid = blockIdx.x / blocks_x;
-        const uint wid = blockIdx.y / blocks_y;
-        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
-        const uint xid = blockIdx_x * blockDim.x + tidx;
-        const uint yid = blockIdx_y; // yid  of output. updated for input later.
+        const int zid = blockIdx.x / blocks_x;
+        const int wid = blockIdx.y / blocks_y;
+        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
+        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int xid = blockIdx_x * blockDim.x + tidx;
+        const int yid = blockIdx_y; // yid  of output. updated for input later.
 
-        uint ids[4] = {xid, yid, zid, wid};
+        int ids[4] = {xid, yid, zid, wid};
 
         const To *tptr = tmp.ptr;
         To *optr = out.ptr;
@@ -146,12 +148,12 @@ namespace kernel
         // There are blockDim.y elements per block for in
         // Hence increment ids[dim] just after offseting out and before offsetting in
         tptr += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] + ids[1] * tmp.strides[1] + ids[0];
-        const uint blockIdx_dim = ids[dim];
+        const int blockIdx_dim = ids[dim];
 
         ids[dim] = ids[dim] * blockDim.y * lim + tidy;
         optr  += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
-        const uint id_dim = ids[dim];
-        const uint out_dim = out.dims[dim];
+        const int id_dim = ids[dim];
+        const int out_dim = out.dims[dim];
 
         bool is_valid =
             (ids[0] < out.dims[0]) &&
@@ -165,7 +167,7 @@ namespace kernel
         To accum = *(tptr - tmp.strides[dim]);
 
         Binary<To, op> binop;
-        const uint ostride_dim = out.strides[dim];
+        const int ostride_dim = out.strides[dim];
 
         for (int k = 0, id = id_dim;
              is_valid && k < lim && (id < out_dim);
diff --git a/src/backend/cuda/kernel/scan_first.hpp b/src/backend/cuda/kernel/scan_first.hpp
@@ -31,15 +31,19 @@ namespace kernel
                                   uint blocks_y,
                                   uint lim)
     {
-        const uint tidx = threadIdx.x;
-        const uint tidy = threadIdx.y;
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
 
-        const uint zid = blockIdx.x / blocks_x;
-        const uint wid = blockIdx.y / blocks_y;
-        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
-        const uint xid = blockIdx_x * blockDim.x * lim + tidx;
-        const uint yid = blockIdx_y * blockDim.y + tidy;
+        const int zid = blockIdx.x / blocks_x;
+        const int wid = blockIdx.y / blocks_y;
+        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
+        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int xid = blockIdx_x * blockDim.x * lim + tidx;
+        const int yid = blockIdx_y * blockDim.y + tidy;
+
+        bool cond_yzw = (yid < out.dims[1]) && (zid < out.dims[2]) && (wid < out.dims[3]);
+
+        if (!cond_yzw) return; // retire warps early
 
         const Ti *iptr = in.ptr;
         To *optr = out.ptr;
@@ -49,10 +53,9 @@ namespace kernel
         optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
         tptr += wid * tmp.strides[3] + zid * tmp.strides[2] + yid * tmp.strides[1];
 
-        bool cond_yzw = (yid < out.dims[1]) && (zid < out.dims[2]) && (wid < out.dims[3]);
 
-        const uint DIMY = THREADS_PER_BLOCK / DIMX;
-        const uint SHARED_MEM_SIZE = (2 * DIMX + 1) * (DIMY);
+        const int DIMY = THREADS_PER_BLOCK / DIMX;
+        const int SHARED_MEM_SIZE = (2 * DIMX + 1) * (DIMY);
 
         __shared__ To s_val[SHARED_MEM_SIZE];
         __shared__ To s_tmp[DIMY];
@@ -63,7 +66,7 @@ namespace kernel
         Binary<To, op> binop;
 
         const To init = binop.init();
-        uint id = xid;
+        int id = xid;
         To val = init;
 
         const bool isLast = (tidx == (DIMX - 1));
@@ -72,13 +75,14 @@ namespace kernel
 
             if (isLast) s_tmp[tidy] = val;
 
-            bool cond = (cond_yzw && (id < out.dims[0]));
+            bool cond = ((id < out.dims[0]));
             val = cond ? transform(iptr[id]) : init;
             sptr[tidx] = val;
             __syncthreads();
 
 
-            uint start = 0;
+            int start = 0;
+#pragma unroll
             for (int off = 1; off < DIMX; off *= 2) {
 
                 if (tidx >= off) val = binop(val, sptr[(start - off) + tidx]);
@@ -91,9 +95,10 @@ namespace kernel
             val = binop(val, s_tmp[tidy]);
             if (cond) optr[id] = val;
             id += blockDim.x;
+            __syncthreads();
         }
 
-        if (!isFinalPass && cond_yzw && isLast) {
+        if (!isFinalPass && isLast) {
             tptr[blockIdx_x] = val;
         }
     }
@@ -106,27 +111,27 @@ namespace kernel
                                    uint blocks_y,
                                    uint lim)
     {
-        const uint tidx = threadIdx.x;
-        const uint tidy = threadIdx.y;
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+
+        const int zid = blockIdx.x / blocks_x;
+        const int wid = blockIdx.y / blocks_y;
+        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
+        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int xid = blockIdx_x * blockDim.x * lim + tidx;
+        const int yid = blockIdx_y * blockDim.y + tidy;
 
-        const uint zid = blockIdx.x / blocks_x;
-        const uint wid = blockIdx.y / blocks_y;
-        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
-        const uint xid = blockIdx_x * blockDim.x * lim + tidx;
-        const uint yid = blockIdx_y * blockDim.y + tidy;
+        if (blockIdx_x == 0) return;
+
+        bool cond = (yid < out.dims[1]) && (zid < out.dims[2]) && (wid < out.dims[3]);
+        if (!cond) return;
 
         To *optr = out.ptr;
         const To *tptr = tmp.ptr;
 
         optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
         tptr += wid * tmp.strides[3] + zid * tmp.strides[2] + yid * tmp.strides[1];
 
-        bool cond = (yid < out.dims[1]) && (zid < out.dims[2]) && (wid < out.dims[3]);
-
-        if (!cond) return;
-        if (blockIdx_x == 0) return;
-
         Binary<To, op> binop;
         To accum = tptr[blockIdx_x - 1];
 
diff --git a/src/backend/opencl/kernel/scan_dim.cl b/src/backend/opencl/kernel/scan_dim.cl
@@ -27,7 +27,7 @@ void scan_dim_kernel(__global To *oData, KParam oInfo,
     const int xid = groupId_x * get_local_size(0) + lidx;
     const int yid = groupId_y;
 
-    uint ids[4] = {xid, yid, zid, wid};
+    int ids[4] = {xid, yid, zid, wid};
 
     // There is only one element per group for out
     // There are DIMY elements per group for in
@@ -69,7 +69,7 @@ void scan_dim_kernel(__global To *oData, KParam oInfo,
         l_val[lid] = val;
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        uint start = 0;
+        int start = 0;
         for (int off = 1; off < DIMY; off *= 2) {
 
             if (lidy >= off) val = binOp(val, l_val[lid - off * THREADS_X]);
@@ -83,6 +83,7 @@ void scan_dim_kernel(__global To *oData, KParam oInfo,
 
         val = binOp(val, l_tmp[lidx]);
         if (cond) *oData = val;
+        barrier(CLK_LOCAL_MEM_FENCE);
 
         id_dim += DIMY;
         iData += DIMY * istride_dim;
@@ -116,13 +117,15 @@ void bcast_dim_kernel(__global To *oData, KParam oInfo,
     const int xid = groupId_x * get_local_size(0) + lidx;
     const int yid = groupId_y;
 
-    uint ids[4] = {xid, yid, zid, wid};
+    int ids[4] = {xid, yid, zid, wid};
+    const int groupId_dim = ids[dim];
+
+    if (groupId_dim == 0) return;
 
     // There is only one element per group for out
     // There are DIMY elements per group for in
     // Hence increment ids[dim] just after offseting out and before offsetting in
     tData += ids[3] * tInfo.strides[3] + ids[2] * tInfo.strides[2] + ids[1] * tInfo.strides[1] + ids[0];
-    const int groupId_dim = ids[dim];
 
     ids[dim] = ids[dim] * DIMY * lim + lidy;
     oData  += ids[3] * oInfo.strides[3] + ids[2] * oInfo.strides[2] + ids[1] * oInfo.strides[1] + ids[0];
@@ -137,7 +140,6 @@ void bcast_dim_kernel(__global To *oData, KParam oInfo,
         (ids[3] < oInfo.dims[3]);
 
     if (!is_valid) return;
-    if (groupId_dim == 0) return;
 
     To accum = *(tData - tInfo.strides[dim]);
 
diff --git a/src/backend/opencl/kernel/scan_first.cl b/src/backend/opencl/kernel/scan_first.cl