
Commit 3429769: Added Parallel Reductions Code
1 parent 692f464

11 files changed, +521 -0 lines changed
Lines changed: 6 additions & 0 deletions

reduce: main.cu *.h
	nvcc -O3 main.cu -o reduce -arch=sm_35

clean:
	rm -f reduce
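
main.cu is one of the two files whose diff is not captured in this excerpt. Judging from the benchmark script at the end of the commit, the resulting binary takes an element count and an iteration count, as in ./reduce 1048576 100. The -arch=sm_35 flag targets Kepler-class GPUs, the first generation with the __shfl_down warp-shuffle intrinsic these reductions are built around.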
Lines changed: 21 additions & 0 deletions

#pragma once

#include "warp_reduce.h"

__inline__ __device__
int blockReduceSum(int val) {
  static __shared__ int shared[32]; // one partial sum per warp (at most 32 warps per block)
  int lane = threadIdx.x % warpSize;
  int wid  = threadIdx.x / warpSize;

  val = warpReduceSum(val); // each warp reduces its own 32 values first

  // write reduced value to shared memory
  if (lane == 0) shared[wid] = val;
  __syncthreads();

  // ensure we only grab a value from shared memory if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0) val = warpReduceSum(val); // final reduction within the first warp

  return val;
}
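
blockReduceSum delegates the per-warp work to a warpReduceSum from warp_reduce.h, one of the two files in this commit whose diff is not captured in this excerpt (main.cu is presumably the other). A minimal sketch of the usual Kepler shuffle-based formulation it presumably contains; fake_shfl.h later in the commit emulates the same __shfl_down primitive in shared memory:

#pragma once

__inline__ __device__
int warpReduceSum(int val) {
  // each step halves the stride; after log2(32) = 5 steps,
  // lane 0 holds the sum of all 32 lanes of the warp
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  return val;
}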
Lines changed: 64 additions & 0 deletions

#pragma once

#include "fake_atomic.h"

__global__ void device_reduce_atomic_kernel(int *in, int *out, int N) {
  int sum = 0;
  // grid-stride loop: each thread accumulates a private partial sum
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  atomicAdd(out, sum); // one atomic per thread
}

void device_reduce_atomic(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_atomic_kernel<<<blocks, threads>>>(in, out, N);
}

__global__ void device_reduce_atomic_kernel_vector2(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // vectorized grid-stride loop: each iteration loads two ints at once
  for (int i = idx; i < N / 2; i += blockDim.x * gridDim.x) {
    int2 val = reinterpret_cast<int2*>(in)[i];
    sum += val.x + val.y;
  }
  int i = idx + N / 2 * 2; // pick up the final element when N is odd
  if (i < N)
    sum += in[i];

  atomicAdd(out, sum);
}

void device_reduce_atomic_vector2(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N / 2 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_atomic_kernel_vector2<<<blocks, threads>>>(in, out, N);
}

__global__ void device_reduce_atomic_kernel_vector4(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // vectorized grid-stride loop: each iteration loads four ints at once
  for (int i = idx; i < N / 4; i += blockDim.x * gridDim.x) {
    int4 val = reinterpret_cast<int4*>(in)[i];
    sum += (val.x + val.y) + (val.z + val.w);
  }
  int i = idx + N / 4 * 4; // pick up the trailing elements (at most 3)
  if (i < N)
    sum += in[i];

  atomicAdd(out, sum);
}

void device_reduce_atomic_vector4(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N / 4 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_atomic_kernel_vector4<<<blocks, threads>>>(in, out, N);
}
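
All three launchers share the same contract: in and out are device pointers, the result is left in out[0], and the launch is asynchronous. A minimal host-side sketch of how one of them might be driven; the commit's actual main.cu is not shown here, so the header name and everything else below are illustrative:

#include <cstdio>
#include <vector>
#include "device_reduce_atomic.h" // hypothetical file name for the header above

int main() {
  const int N = 1 << 20;
  std::vector<int> h_in(N, 1); // all ones, so the expected sum is simply N

  int *d_in, *d_out;
  cudaMalloc(&d_in, N * sizeof(int));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_in, h_in.data(), N * sizeof(int), cudaMemcpyHostToDevice);

  device_reduce_atomic(d_in, d_out, N);

  int result = 0;
  // blocking copy on the default stream also synchronizes with the kernel
  cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("sum = %d (expected %d)\n", result, N);

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}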
Lines changed: 69 additions & 0 deletions

#pragma once

#include "fake_atomic.h"
#include "block_reduce.h"

__global__ void device_reduce_block_atomic_kernel(int *in, int *out, int N) {
  int sum = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = blockReduceSum(sum); // reduce within the block first
  if (threadIdx.x == 0)
    atomicAdd(out, sum);     // one atomic per block
}

void device_reduce_block_atomic(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_block_atomic_kernel<<<blocks, threads>>>(in, out, N);
}

__global__ void device_reduce_block_atomic_kernel_vector2(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N / 2; i += blockDim.x * gridDim.x) {
    int2 val = reinterpret_cast<int2*>(in)[i];
    sum += val.x + val.y;
  }
  int i = idx + N / 2 * 2; // pick up the final element when N is odd
  if (i < N)
    sum += in[i];
  sum = blockReduceSum(sum);
  if (threadIdx.x == 0)
    atomicAdd(out, sum);
}

void device_reduce_block_atomic_vector2(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N / 2 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_block_atomic_kernel_vector2<<<blocks, threads>>>(in, out, N);
}

__global__ void device_reduce_block_atomic_kernel_vector4(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N / 4; i += blockDim.x * gridDim.x) {
    int4 val = reinterpret_cast<int4*>(in)[i];
    sum += (val.x + val.y) + (val.z + val.w);
  }
  int i = idx + N / 4 * 4; // pick up the trailing elements (at most 3)
  if (i < N)
    sum += in[i];

  sum = blockReduceSum(sum);
  if (threadIdx.x == 0)
    atomicAdd(out, sum);
}

void device_reduce_block_atomic_vector4(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N / 4 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_block_atomic_kernel_vector4<<<blocks, threads>>>(in, out, N);
}
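
Relative to the plain atomic version above, each block now combines its threads' partial sums with blockReduceSum before touching global memory, so the kernel issues one atomicAdd per block of 256 threads instead of one per thread; the warp-atomic variant further below lands in between, at one atomic per 32-thread warp.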
Lines changed: 69 additions & 0 deletions

#pragma once

#include "block_reduce.h"

__global__ void device_reduce_stable_kernel(int *in, int *out, int N) {
  int sum = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x == 0)
    out[blockIdx.x] = sum; // one partial sum per block, no atomics
}

void device_reduce_stable(int *in, int *out, int N) {
  int threads = 512;
  int blocks = min((N + threads - 1) / threads, 1024);

  device_reduce_stable_kernel<<<blocks, threads>>>(in, out, N);
  // second pass: one 1024-thread block folds the (at most 1024) partials into out[0]
  device_reduce_stable_kernel<<<1, 1024>>>(out, out, blocks);
}

__global__ void device_reduce_stable_kernel_vector2(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N / 2; i += blockDim.x * gridDim.x) {
    int2 val = reinterpret_cast<int2*>(in)[i];
    sum += val.x + val.y;
  }
  int i = idx + N / 2 * 2; // pick up the final element when N is odd
  if (i < N)
    sum += in[i];
  sum = blockReduceSum(sum);
  if (threadIdx.x == 0)
    out[blockIdx.x] = sum;
}

void device_reduce_stable_vector2(int *in, int *out, int N) {
  int threads = 512;
  int blocks = min((N / 2 + threads - 1) / threads, 1024);

  device_reduce_stable_kernel_vector2<<<blocks, threads>>>(in, out, N);
  device_reduce_stable_kernel<<<1, 1024>>>(out, out, blocks);
}

__global__ void device_reduce_stable_kernel_vector4(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N / 4; i += blockDim.x * gridDim.x) {
    int4 val = reinterpret_cast<int4*>(in)[i];
    sum += (val.x + val.y) + (val.z + val.w);
  }
  int i = idx + N / 4 * 4; // pick up the trailing elements (at most 3)
  if (i < N)
    sum += in[i];

  sum = blockReduceSum(sum);
  if (threadIdx.x == 0)
    out[blockIdx.x] = sum;
}

void device_reduce_stable_vector4(int *in, int *out, int N) {
  int threads = 512;
  int blocks = min((N / 4 + threads - 1) / threads, 1024);

  device_reduce_stable_kernel_vector4<<<blocks, threads>>>(in, out, N);
  device_reduce_stable_kernel<<<1, 1024>>>(out, out, blocks);
}
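
Two properties set this variant apart. First, out doubles as scratch space: the first launch writes one partial sum per block into out[0..blocks), so the buffer must hold up to 1024 ints rather than a single result. Second, with no atomics the partials are combined in a fixed order every run; for the int data here atomicAdd would be exact anyway, but with floating-point types a fixed combination order is what makes results bit-reproducible, which is presumably the sense of "stable" in the name.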
Lines changed: 69 additions & 0 deletions

#pragma once

#include "fake_atomic.h"
#include "warp_reduce.h"

__global__ void device_reduce_warp_atomic_kernel(int *in, int *out, int N) {
  int sum = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = warpReduceSum(sum); // reduce within each warp in registers
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, sum);    // one atomic per warp
}

void device_reduce_warp_atomic(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_warp_atomic_kernel<<<blocks, threads>>>(in, out, N);
}

__global__ void device_reduce_warp_atomic_kernel_vector2(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N / 2; i += blockDim.x * gridDim.x) {
    int2 val = reinterpret_cast<int2*>(in)[i];
    sum += val.x + val.y;
  }
  int i = idx + N / 2 * 2; // pick up the final element when N is odd
  if (i < N)
    sum += in[i];
  sum = warpReduceSum(sum);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, sum);
}

void device_reduce_warp_atomic_vector2(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N / 2 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_warp_atomic_kernel_vector2<<<blocks, threads>>>(in, out, N);
}

__global__ void device_reduce_warp_atomic_kernel_vector4(int *in, int *out, int N) {
  int sum = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N / 4; i += blockDim.x * gridDim.x) {
    int4 val = reinterpret_cast<int4*>(in)[i];
    sum += (val.x + val.y) + (val.z + val.w);
  }
  int i = idx + N / 4 * 4; // pick up the trailing elements (at most 3)
  if (i < N)
    sum += in[i];

  sum = warpReduceSum(sum);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, sum);
}

void device_reduce_warp_atomic_vector4(int *in, int *out, int N) {
  int threads = 256;
  int blocks = min((N / 4 + threads - 1) / threads, 2048);

  cudaMemsetAsync(out, 0, sizeof(int));
  device_reduce_warp_atomic_kernel_vector4<<<blocks, threads>>>(in, out, N);
}
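
Compared with blockReduceSum, the warp-level version needs no shared memory and no __syncthreads(): each warp folds its 32 partial sums entirely in registers and its lane 0 issues the atomicAdd, so the atomic traffic is one operation per 32 threads, more than the block version's one per block of 256 but with no intra-block synchronization.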
Lines changed: 10 additions & 0 deletions

#pragma once

#if 0
template <class T>
__device__ __inline__
void atomicAdd(T* ptr, T val) {
  *ptr += val; // NOT atomic: placeholder only
}
#endif
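
As committed, the template is compiled out by the #if 0 guard, so the real hardware atomicAdd is used everywhere. Presumably the stub exists so the code can be ported or debugged on targets lacking an atomicAdd overload for a given type by flipping the guard, at the cost of a non-atomic and therefore racy stand-in.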
Lines changed: 17 additions & 0 deletions

#pragma once

//#define __shfl_down fake_shfl_down
#define MAX_BLOCK 512

// shared-memory emulation of the Kepler __shfl_down intrinsic,
// for hardware without warp shuffles
__inline__ __device__
int fake_shfl_down(int val, int offset, int width = 32) {
  static __shared__ int shared[MAX_BLOCK];
  int lane = threadIdx.x % 32;

  shared[threadIdx.x] = val;
  __syncthreads();

  val = (lane + offset < width) ? shared[threadIdx.x + offset] : 0;
  __syncthreads();

  return val;
}
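
Uncommenting the #define at the top would reroute every __shfl_down call through this emulation on hardware without warp shuffles. Unlike the real intrinsic, it stages data through shared memory and synchronizes the whole block (hence the MAX_BLOCK-sized buffer), so it is a functional fallback rather than a performance-equivalent one, and every thread of the block must reach it.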
Lines changed: 17 additions & 0 deletions

#!/bin/bash

BASE=2
LOW=10
HIGH=27

HEADER=`./reduce 100 1 | grep -v NUM_ELEMS | cut -d ":" -f 1`

HEADER="SIZE $HEADER"
echo $HEADER
for (( i=$LOW; i<=$HIGH; i++ ))
do
  size=`echo "$BASE^$i" | bc`
  TIMES=`./reduce $size 100 | grep -v NUM_ELEMS | cut -d ":" -f 4 | cut -f 2 -d " "`
  bytes=`echo "$size*4" | bc`
  echo $bytes $TIMES
done
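
For each size from 2^10 through 2^27 elements, the script runs every reduction variant 100 times and emits one row: the input size in bytes (size*4, since the elements are 4-byte ints) followed by the timing fields scraped from ./reduce's output. Dividing the byte count by a row's time gives each variant's achieved bandwidth, the natural figure of merit for a memory-bound reduction.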
