1 '''
2 /*
3 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
4 *
5 * NOTICE TO USER:
6 *
7 * This source code is subject to NVIDIA ownership rights under U.S. and
8 * international Copyright laws. Users and possessors of this source code
9 * are hereby granted a nonexclusive, royalty-free license to use this code
10 * in individual and commercial software.
11 *
12 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
13 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
14 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
15 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
16 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
17 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
18 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
19 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
20 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
21 * OR PERFORMANCE OF THIS SOURCE CODE.
22 *
23 * U.S. Government End Users. This source code is a "commercial item" as
24 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
25 * "commercial computer software" and "commercial computer software
26 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
27 * and is provided to the U.S. Government only as a commercial end item.
28 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
29 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
30 * source code with only those rights set forth herein.
31 *
32 * Any use of this source code in individual and commercial software must
33 * include, in the user documentation and internal comments to the code,
34 * the above Disclaimer and U.S. Government End Users Notice.
35 */
36
37 /*
38 * This sample implements a separable convolution filter
39 * of a 2D signal with a gaussian kernel.
40 */
41
42 Ported to pycuda by Andrew Wagner <awagner@illinois.edu>, June 2009.
43 '''
44
45 import numpy
46 import pycuda.autoinit
47 import pycuda.driver as cuda
48 from pycuda.compiler import SourceModule
49 import string
50
51
52
53 KERNEL_RADIUS = 8
54 UNROLL_INNER_LOOP = True
55 KERNEL_W = 2 * KERNEL_RADIUS + 1
56 ROW_TILE_W = 128
57 KERNEL_RADIUS_ALIGNED = 16
58 COLUMN_TILE_W = 16
59 COLUMN_TILE_H = 48
60 template = '''
61 //24-bit multiplication is faster on G80,
62 //but we must be sure to multiply integers
63 //only within [-8M, 8M - 1] range
64 #define IMUL(a, b) __mul24(a, b)
65
66 ////////////////////////////////////////////////////////////////////////////////
67 // Kernel configuration
68 ////////////////////////////////////////////////////////////////////////////////
69 #define KERNEL_RADIUS $KERNEL_RADIUS
70 #define KERNEL_W $KERNEL_W
71 __device__ __constant__ float d_Kernel_rows[KERNEL_W];
72 __device__ __constant__ float d_Kernel_columns[KERNEL_W];
73
74 // Assuming ROW_TILE_W, KERNEL_RADIUS_ALIGNED and dataW
75 // are multiples of coalescing granularity size,
76 // all global memory operations are coalesced in convolutionRowGPU()
77 #define ROW_TILE_W $ROW_TILE_W
78 #define KERNEL_RADIUS_ALIGNED $KERNEL_RADIUS_ALIGNED
79
80 // Assuming COLUMN_TILE_W and dataW are multiples
81 // of coalescing granularity size, all global memory operations
82 // are coalesced in convolutionColumnGPU()
83 #define COLUMN_TILE_W $COLUMN_TILE_W
84 #define COLUMN_TILE_H $COLUMN_TILE_H
85
86 ////////////////////////////////////////////////////////////////////////////////
87 // Row convolution filter
88 ////////////////////////////////////////////////////////////////////////////////
89 __global__ void convolutionRowGPU(
90 float *d_Result,
91 float *d_Data,
92 int dataW,
93 int dataH
94 ){
95 //Data cache
96 __shared__ float data[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];
97
98 //Current tile and apron limits, relative to row start
99 const int tileStart = IMUL(blockIdx.x, ROW_TILE_W);
100 const int tileEnd = tileStart + ROW_TILE_W - 1;
101 const int apronStart = tileStart - KERNEL_RADIUS;
102 const int apronEnd = tileEnd + KERNEL_RADIUS;
103
104 //Clamp tile and apron limits by image borders
105 const int tileEndClamped = min(tileEnd, dataW - 1);
106 const int apronStartClamped = max(apronStart, 0);
107 const int apronEndClamped = min(apronEnd, dataW - 1);
108
109 //Row start index in d_Data[]
110 const int rowStart = IMUL(blockIdx.y, dataW);
111
112 //Aligned apron start. Assuming dataW and ROW_TILE_W are multiples
113 //of half-warp size, rowStart + apronStartAligned is also a
114 //multiple of half-warp size, thus having proper alignment
115 //for coalesced d_Data[] read.
116 const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;
117
118 const int loadPos = apronStartAligned + threadIdx.x;
119 //Set the entire data cache contents
120 //Load global memory values, if indices are within the image borders,
121 //or initialize with zeroes otherwise
122 if(loadPos >= apronStart){
123 const int smemPos = loadPos - apronStart;
124
125 data[smemPos] =
126 ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ?
127 d_Data[rowStart + loadPos] : 0;
128 }
129
130 //Ensure the completness of the loading stage
131 //because results, emitted by each thread depend on the data,
132 //loaded by another threads
133 __syncthreads();
134 const int writePos = tileStart + threadIdx.x;
135 //Assuming dataW and ROW_TILE_W are multiples of half-warp size,
136 //rowStart + tileStart is also a multiple of half-warp size,
137 //thus having proper alignment for coalesced d_Result[] write.
138 if(writePos <= tileEndClamped){
139 const int smemPos = writePos - apronStart;
140 float sum = 0;
141 '''
142 originalLoop = '''
143 for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
144 sum += data[smemPos + k] * d_Kernel_rows[KERNEL_RADIUS - k];
145 '''
146 unrolledLoop = ''
147 for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1):
148 loopTemplate = string.Template(
149 'sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n')
150 unrolledLoop += loopTemplate.substitute(k=k)
151
152
153 template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
154 template += '''
155 d_Result[rowStart + writePos] = sum;
156 //d_Result[rowStart + writePos] = 128;
157 }
158 }
159
160 ////////////////////////////////////////////////////////////////////////////////
161 // Column convolution filter
162 ////////////////////////////////////////////////////////////////////////////////
163 __global__ void convolutionColumnGPU(
164 float *d_Result,
165 float *d_Data,
166 int dataW,
167 int dataH,
168 int smemStride,
169 int gmemStride
170 ){
171 //Data cache
172 __shared__ float data[COLUMN_TILE_W *
173 (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
174
175 //Current tile and apron limits, in rows
176 const int tileStart = IMUL(blockIdx.y, COLUMN_TILE_H);
177 const int tileEnd = tileStart + COLUMN_TILE_H - 1;
178 const int apronStart = tileStart - KERNEL_RADIUS;
179 const int apronEnd = tileEnd + KERNEL_RADIUS;
180
181 //Clamp tile and apron limits by image borders
182 const int tileEndClamped = min(tileEnd, dataH - 1);
183 const int apronStartClamped = max(apronStart, 0);
184 const int apronEndClamped = min(apronEnd, dataH - 1);
185
186 //Current column index
187 const int columnStart = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
188
189 //Shared and global memory indices for current column
190 int smemPos = IMUL(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
191 int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;
192 //Cycle through the entire data cache
193 //Load global memory values, if indices are within the image borders,
194 //or initialize with zero otherwise
195 for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
196 data[smemPos] =
197 ((y >= apronStartClamped) && (y <= apronEndClamped)) ?
198 d_Data[gmemPos] : 0;
199 smemPos += smemStride;
200 gmemPos += gmemStride;
201 }
202
203 //Ensure the completness of the loading stage
204 //because results, emitted by each thread depend on the data,
205 //loaded by another threads
206 __syncthreads();
207 //Shared and global memory indices for current column
208 smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
209 gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;
210 //Cycle through the tile body, clamped by image borders
211 //Calculate and output the results
212 for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){
213 float sum = 0;
214 '''
215 originalLoop = '''
216 for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
217 sum += data[smemPos + IMUL(k, COLUMN_TILE_W)] *
218 d_Kernel_columns[KERNEL_RADIUS - k];
219 '''
220 unrolledLoop = ''
221 for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1):
222 loopTemplate = string.Template('sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n')
223 unrolledLoop += loopTemplate.substitute(k=k)
224
225
226 template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
227 template += '''
228 d_Result[gmemPos] = sum;
229 //d_Result[gmemPos] = 128;
230 smemPos += smemStride;
231 gmemPos += gmemStride;
232 }
233 }
234 '''
235 template = string.Template(template)
236 code = template.substitute(KERNEL_RADIUS = KERNEL_RADIUS,
237 KERNEL_W = KERNEL_W,
238 COLUMN_TILE_H=COLUMN_TILE_H,
239 COLUMN_TILE_W=COLUMN_TILE_W,
240 ROW_TILE_W=ROW_TILE_W,
241 KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)
242
243 module = SourceModule(code)
244 convolutionRowGPU = module.get_function('convolutionRowGPU')
245 convolutionColumnGPU = module.get_function('convolutionColumnGPU')
246 d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
247 d_Kernel_columns = module.get_global('d_Kernel_columns')[0]
248
249
250 def iDivUp(a, b):
251
252 a = numpy.int32(a)
253 b = numpy.int32(b)
254 return (a / b + 1) if (a % b != 0) else (a / b)
255
256 def iDivDown(a, b):
257
258 a = numpy.int32(a)
259 b = numpy.int32(b)
260 return a / b;
261
262 def iAlignUp(a, b):
263
264 a = numpy.int32(a)
265 b = numpy.int32(b)
266 return (a - a % b + b) if (a % b != 0) else a
267
268 def iAlignDown(a, b):
269
270 a = numpy.int32(a)
271 b = numpy.int32(b)
272 return a - a % b
273
274 def gaussian_kernel(width = KERNEL_W, sigma = 4.0):
275 assert width == numpy.floor(width), 'argument width should be an integer!'
276 radius = (width - 1)/2.0
277 x = numpy.linspace(-radius, radius, width)
278 x = numpy.float32(x)
279 sigma = numpy.float32(sigma)
280 filterx = x*x / (2 * sigma * sigma)
281 filterx = numpy.exp(-1 * filterx)
282 assert filterx.sum()>0, 'something very wrong if gaussian kernel sums to zero!'
283 filterx /= filterx.sum()
284 return filterx
285
286 def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4):
287 assert width == numpy.floor(width), 'argument width should be an integer!'
288 radius = (width - 1)/2.0
289 x = numpy.linspace(-radius, radius, width)
290 x = numpy.float32(x)
291
292 filterx = gaussian_kernel(width, sigma)
293 filterx *= x
294
295 scale = (x * filterx).sum()
296 filterx /= scale
297
298 filterx *= -1.0
299 return filterx
300
301 def test_derivative_of_gaussian_kernel():
302 width = 20
303 sigma = 10.0
304 filterx = derivative_of_gaussian_kernel(width, sigma)
305 x = 2 * numpy.arange(0, width)
306 x = numpy.float32(x)
307 response = (filter * x).sum()
308 assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'
309 width = 19
310 sigma = 10.0
311 filterx = derivative_of_gaussian_kernel(width, sigma)
312 x = 2 * numpy.arange(0, width)
313 x = numpy.float32(x)
314 response = (filterx * x).sum()
315 assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'
316
317 def convolution_cuda(sourceImage, filterx, filtery):
318
319
320 destImage = sourceImage.copy()
321 assert sourceImage.dtype == 'float32', 'source image must be float32'
322 (imageHeight, imageWidth) = sourceImage.shape
323 assert filterx.shape == filtery.shape == (KERNEL_W, ) , 'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
324 filterx = numpy.float32(filterx)
325 filtery = numpy.float32(filtery)
326 DATA_W = iAlignUp(imageWidth, 16);
327 DATA_H = imageHeight;
328 BYTES_PER_WORD = 4;
329 DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD;
330 KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD;
331
332 destImage_gpu = cuda.mem_alloc_like(destImage)
333 sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
334 intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
335 cuda.memcpy_htod(sourceImage_gpu, sourceImage)
336 cuda.memcpy_htod(d_Kernel_rows, filterx)
337 cuda.memcpy_htod(d_Kernel_columns, filtery)
338
339 blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
340 blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
341 threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
342 threadBlockColumns = (COLUMN_TILE_W, 8, 1)
343 DATA_H = numpy.int32(DATA_H)
344 DATA_W = numpy.int32(DATA_W)
345 convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H, grid=[int(e) for e in blockGridRows], block=[int(e) for e in threadBlockRows])
346 convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H, numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]), numpy.int32(DATA_W * threadBlockColumns[1]), grid=[int(e) for e in blockGridColumns], block=[int(e) for e in threadBlockColumns])
347
348
349 cuda.memcpy_dtoh(destImage, destImage_gpu)
350 return destImage
351
352 def test_convolution_cuda():
353
354
355 original = numpy.random.rand(768, 1024) * 255
356 original = numpy.float32(original)
357
358 filterx = gaussian_kernel()
359 destImage = original.copy()
360 destImage[:] = numpy.nan
361 destImage = convolution_cuda(original, filterx, filterx)
362
363 print 'Done running the convolution kernel!'
364
365 if __name__ == '__main__':
366 test_convolution_cuda()
367
368 boo = raw_input('Pausing so you can look at results... <Enter> to finish...')