#pragma once

#include <algorithm>

#include "fake_atomic.h"
#include "warp_reduce.h"
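
// For reference, warpReduceSum() from "warp_reduce.h" is typically a
// shuffle-based tree reduction within a warp; a minimal sketch, assuming
// CUDA 9+ and a full 32-lane warp, looks like:
//
//   __inline__ __device__ int warpReduceSum(int val) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2)
//       val += __shfl_down_sync(0xffffffff, val, offset);
//     return val;
//   }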

// Reduces in[0..N) into *out. Each thread accumulates a private sum over a
// grid-stride loop, the warp combines those sums with warpReduceSum(), and
// one lane per warp publishes its partial result with a single atomicAdd.
__global__ void device_reduce_warp_atomic_kernel(int *in, int *out, int N) {
  int sum = 0;
  // Grid-stride loop: covers all N elements for any grid size.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = warpReduceSum(sum);
  // Lane 0 holds the warp total; one atomic per warp keeps contention low.
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, sum);
}

void device_reduce_warp_atomic(int *in, int *out, int N) {
  int threads = 256;
  // Enough blocks to cover N at one element per thread, capped at 2048.
  int blocks = std::min((N + threads - 1) / threads, 2048);

  // The kernel accumulates into *out, so zero it first; the memset and the
  // launch run in the same (default) stream, so ordering is preserved.
  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_warp_atomic_kernel<<<blocks, threads>>>(in, out, N);
}

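// Note: the vectorized variants below reinterpret `in` as int2*/int4*, which
// requires the pointer to be 8-/16-byte aligned. Allocations returned by
// cudaMalloc satisfy this (they are aligned to at least 256 bytes), but
// arbitrary offsets into a buffer may not.
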
// Same reduction, but each thread loads two ints at a time as an int2, so
// loads are issued as 64-bit transactions.
__global__ void device_reduce_warp_atomic_kernel_vector2(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Grid-stride loop over the N/2 int2 vectors.
  for (int i = idx; i < N / 2; i += blockDim.x * gridDim.x) {
    int2 val = reinterpret_cast<int2*>(in)[i];
    sum += val.x + val.y;
  }
  // If N is odd, the first thread picks up the trailing scalar element.
  int i = idx + N / 2 * 2;
  if (i < N)
    sum += in[i];
  sum = warpReduceSum(sum);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, sum);
}

void device_reduce_warp_atomic_vector2(int *in, int *out, int N) {
  int threads = 256;
  // Size the grid over the N/2 vectorized loads, capped at 2048 blocks.
  int blocks = std::min((N / 2 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_warp_atomic_kernel_vector2<<<blocks, threads>>>(in, out, N);
}

// The same idea with int4: four ints per 128-bit load, the widest
// transaction available.
__global__ void device_reduce_warp_atomic_kernel_vector4(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Grid-stride loop over the N/4 int4 vectors.
  for (int i = idx; i < N / 4; i += blockDim.x * gridDim.x) {
    int4 val = reinterpret_cast<int4*>(in)[i];
    sum += (val.x + val.y) + (val.z + val.w);
  }
  // The first N % 4 threads pick up the trailing scalar elements.
  int i = idx + N / 4 * 4;
  if (i < N)
    sum += in[i];

  sum = warpReduceSum(sum);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, sum);
}

void device_reduce_warp_atomic_vector4(int *in, int *out, int N) {
  int threads = 256;
  // Size the grid over the N/4 vectorized loads, capped at 2048 blocks.
  int blocks = std::min((N / 4 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_warp_atomic_kernel_vector4<<<blocks, threads>>>(in, out, N);
}
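
// Example usage (a minimal sketch; error checking omitted, and h_in, d_in,
// and d_out are hypothetical names, not part of this header):
//
//   int *d_in, *d_out;
//   cudaMalloc(&d_in, N * sizeof(int));
//   cudaMalloc(&d_out, sizeof(int));
//   cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
//   device_reduce_warp_atomic_vector4(d_in, d_out, N);
//   int result;
//   cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);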