1
2
3
4
5
6 from __future__ import division
7 import pycuda.driver as cuda
8 import pycuda.autoinit
9 import pycuda.gpuarray as gpuarray
10 from pycuda.compiler import SourceModule
11 import numpy
12
13
14 block_size = 512
15 el_per_thread = 1
16 multiple_block_size = el_per_thread * block_size
17
18
19 amount = 256*2560
20 limit = 0.9
21 assert amount % multiple_block_size == 0
22 a = numpy.random.rand(amount)
23 a = a.astype(numpy.float32)
24 a_gpu = gpuarray.to_gpu(a)
25
26
27 selec = numpy.zeros_like(a)
28 selec = selec.astype(numpy.int32)
29 selec.fill(-1)
30 selec_gpu = gpuarray.to_gpu(selec)
31
32
33 counter_gpu = gpuarray.zeros(1, dtype=numpy.int32)
34
35
36 mod = SourceModule("""
37 #define BLOCK_SIZE %(block_size)d
38 #define EL_PER_THREAD %(el_per_thread)d
39 // #define USE_LOOPS 1
40
41 __global__ void select_them(float *a, int *selec, float limit, int *counter)
42 {
43 __shared__ int selec_smem[EL_PER_THREAD * BLOCK_SIZE];
44 __shared__ int counter_smem;
45 int *counter_smem_ptr;
46
47 int jump = 16;
48 int idx = EL_PER_THREAD * blockIdx.x * BLOCK_SIZE + threadIdx.x +
49 (EL_PER_THREAD - 1) * (threadIdx.x / 16) * jump;
50
51 if (threadIdx.x == 1)
52 {
53 counter_smem_ptr = &counter_smem;
54 counter_smem = 0;
55 }
56
57 #if (EL_PER_THREAD == 1) && !defined(USE_LOOPS)
58 selec_smem[threadIdx.x] = -1;
59 #else
60 for (int i = 0; i <= EL_PER_THREAD - 1; i++)
61 selec_smem[threadIdx.x + i * BLOCK_SIZE] = -1;
62 #endif
63
64 __syncthreads();
65
66 // each counting thread writes its index to shared memory
67
68 #if (EL_PER_THREAD == 1) && !defined(USE_LOOPS)
69 if (a[idx] >= limit)
70 selec_smem[atomicAdd(counter_smem_ptr, 1)] = idx;
71 #else
72 for (int i = 0; i <= EL_PER_THREAD - 1; i++)
73 {
74 if (a[idx + i * jump] >= limit)
75 selec_smem[atomicAdd(counter_smem_ptr, 1)] = idx + i * jump;
76 }
77 #endif
78
79 __syncthreads();
80
81 if (threadIdx.x == 1)
82 counter_smem = atomicAdd(counter, counter_smem);
83
84 __syncthreads();
85
86 #if (EL_PER_THREAD == 1) && !defined(USE_LOOPS)
87 if (selec_smem[threadIdx.x] >= 0)
88 selec[counter_smem + threadIdx.x] = selec_smem[threadIdx.x];
89 #else
90 for (int i = 0; i <= EL_PER_THREAD - 1; i++)
91 {
92 if (selec_smem[threadIdx.x + i * jump] >= 0)
93 selec[counter_smem + threadIdx.x + i * jump] =
94 selec_smem[threadIdx.x + i * jump];
95 }
96 #endif
97 }
98 """ % {"block_size": block_size, "el_per_thread": el_per_thread})
99
100
101 func = mod.get_function("select_them")
102 func.prepare("PPfP", block=(block_size, 1, 1))
103
104
105 warmup = 2
106 for i in range(warmup):
107 func.prepared_call((amount // multiple_block_size, 1),
108 a_gpu.gpudata, selec_gpu.gpudata,
109 limit, counter_gpu.gpudata)
110 counter_gpu = gpuarray.zeros(1, dtype=numpy.int32)
111
112
113
114 start = cuda.Event()
115 stop = cuda.Event()
116
117
118 cuda.Context.synchronize()
119 start.record()
120 count = 10
121 for i in range(count):
122 func.prepared_call((amount // multiple_block_size, 1),
123 a_gpu.gpudata, selec_gpu.gpudata,
124 limit, counter_gpu.gpudata)
125 counter_gpu = gpuarray.zeros(1, dtype=numpy.int32)
126 stop.record()
127
128
129 selec_gpu.get(selec)
130
131 stop.synchronize()
132
133
134 elems_in_selec = len(numpy.nonzero(selec >= 0))
135
136 elapsed_seconds = stop.time_since(start) * 1e-3
137 print "mem bw:", (a.nbytes + elems_in_selec * 4) / elapsed_seconds / 1e9 * count
138
139 filtered_set = sorted(list(item for item in selec if item != -1))
140 reference_set = sorted(list(i for i, x in enumerate(a) if x >= limit))
141 assert filtered_set == reference_set
142