'''
/*
 * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.  This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

/*
 * This sample implements a separable convolution filter
 * of a 2D signal with a Gaussian kernel.
 */

Ported to pycuda by Andrew Wagner <awagner@illinois.edu>, June 2009.
'''

import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import string

# Pull out the constants that the original sample hard-coded as pre-processor
# directives, since they are used by both the kernel and the calling code.
KERNEL_RADIUS = 8
UNROLL_INNER_LOOP = True
KERNEL_W = 2 * KERNEL_RADIUS + 1
ROW_TILE_W = 128
KERNEL_RADIUS_ALIGNED = 16
COLUMN_TILE_W = 16
COLUMN_TILE_H = 48
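
# Sanity checks (not in the original sample): constraints that the kernels
# below assume.  KERNEL_RADIUS_ALIGNED must cover the apron and stay a
# multiple of the half-warp size, and the row thread block
# (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS threads) must not
# exceed the G80-era limit of 512 threads per block.
assert KERNEL_RADIUS <= KERNEL_RADIUS_ALIGNED
assert KERNEL_RADIUS_ALIGNED % 16 == 0
assert KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS <= 512
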
template = '''
//24-bit multiplication is faster on G80,
//but we must be sure to multiply integers
//only within [-8M, 8M - 1] range
#define IMUL(a, b) __mul24(a, b)

////////////////////////////////////////////////////////////////////////////////
// Kernel configuration
////////////////////////////////////////////////////////////////////////////////
#define KERNEL_RADIUS $KERNEL_RADIUS
#define KERNEL_W $KERNEL_W
__device__ __constant__ float d_Kernel_rows[KERNEL_W];
__device__ __constant__ float d_Kernel_columns[KERNEL_W];

// Assuming ROW_TILE_W, KERNEL_RADIUS_ALIGNED and dataW
// are multiples of the coalescing granularity size,
// all global memory operations are coalesced in convolutionRowGPU()
#define            ROW_TILE_W  $ROW_TILE_W
#define KERNEL_RADIUS_ALIGNED  $KERNEL_RADIUS_ALIGNED

// Assuming COLUMN_TILE_W and dataW are multiples
// of the coalescing granularity size, all global memory operations
// are coalesced in convolutionColumnGPU()
#define COLUMN_TILE_W $COLUMN_TILE_W
#define COLUMN_TILE_H $COLUMN_TILE_H

////////////////////////////////////////////////////////////////////////////////
// Row convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolutionRowGPU(
    float *d_Result,
    float *d_Data,
    int dataW,
    int dataH
){
    //Data cache
    __shared__ float data[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];

    //Current tile and apron limits, relative to row start
    const int         tileStart = IMUL(blockIdx.x, ROW_TILE_W);
    const int           tileEnd = tileStart + ROW_TILE_W - 1;
    const int        apronStart = tileStart - KERNEL_RADIUS;
    const int          apronEnd = tileEnd   + KERNEL_RADIUS;

    //Clamp tile and apron limits by image borders
    const int    tileEndClamped = min(tileEnd, dataW - 1);
    const int apronStartClamped = max(apronStart, 0);
    const int   apronEndClamped = min(apronEnd, dataW - 1);

    //Row start index in d_Data[]
    const int          rowStart = IMUL(blockIdx.y, dataW);

    //Aligned apron start. Assuming dataW and ROW_TILE_W are multiples
    //of the half-warp size, rowStart + apronStartAligned is also a
    //multiple of the half-warp size, thus having proper alignment
    //for coalesced d_Data[] reads.
    const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;

    const int loadPos = apronStartAligned + threadIdx.x;
    //Set the entire data cache contents.
    //Load global memory values if the indices are within the image borders,
    //or initialize with zeroes otherwise.
    if(loadPos >= apronStart){
        const int smemPos = loadPos - apronStart;

        data[smemPos] =
            ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ?
            d_Data[rowStart + loadPos] : 0;
    }

    //Ensure the completeness of the loading stage, because results
    //emitted by each thread depend on data loaded by other threads.
    __syncthreads();
    const int writePos = tileStart + threadIdx.x;
    //Assuming dataW and ROW_TILE_W are multiples of the half-warp size,
    //rowStart + tileStart is also a multiple of the half-warp size,
    //thus having proper alignment for coalesced d_Result[] writes.
    if(writePos <= tileEndClamped){
        const int smemPos = writePos - apronStart;
        float sum = 0;
'''
originalLoop = '''
        for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
            sum += data[smemPos + k] * d_Kernel_rows[KERNEL_RADIUS - k];
'''
loopTemplate = string.Template(
    '        sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n')
unrolledLoop = ''
for k in range(-KERNEL_RADIUS, KERNEL_RADIUS + 1):
    unrolledLoop += loopTemplate.substitute(k=k)

# print(unrolledLoop)
template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
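
# With UNROLL_INNER_LOOP enabled, the generated source simply repeats the
# loop body once per tap, e.g. (for KERNEL_RADIUS = 8):
#     sum += data[smemPos + -8] * d_Kernel_rows[KERNEL_RADIUS - -8];
#     ...
#     sum += data[smemPos + 8] * d_Kernel_rows[KERNEL_RADIUS - 8];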
template += '''
        d_Result[rowStart + writePos] = sum;
        //d_Result[rowStart + writePos] = 128;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Column convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolutionColumnGPU(
    float *d_Result,
    float *d_Data,
    int dataW,
    int dataH,
    int smemStride,
    int gmemStride
){
    //Data cache
    __shared__ float data[COLUMN_TILE_W *
        (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];

    //Current tile and apron limits, in rows
    const int         tileStart = IMUL(blockIdx.y, COLUMN_TILE_H);
    const int           tileEnd = tileStart + COLUMN_TILE_H - 1;
    const int        apronStart = tileStart - KERNEL_RADIUS;
    const int          apronEnd = tileEnd   + KERNEL_RADIUS;

    //Clamp tile and apron limits by image borders
    const int    tileEndClamped = min(tileEnd, dataH - 1);
    const int apronStartClamped = max(apronStart, 0);
    const int   apronEndClamped = min(apronEnd, dataH - 1);

    //Current column index
    const int       columnStart = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;

    //Shared and global memory indices for the current column
    int smemPos = IMUL(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
    int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;
    //Cycle through the entire data cache.
    //Load global memory values if the indices are within the image borders,
    //or initialize with zero otherwise.
    for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
        data[smemPos] =
            ((y >= apronStartClamped) && (y <= apronEndClamped)) ?
            d_Data[gmemPos] : 0;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }

    //Ensure the completeness of the loading stage, because results
    //emitted by each thread depend on data loaded by other threads.
    __syncthreads();
    //Shared and global memory indices for the current column
    smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
    gmemPos = IMUL(tileStart + threadIdx.y, dataW) + columnStart;
    //Cycle through the tile body, clamped by image borders.
    //Calculate and output the results.
    for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){
        float sum = 0;
'''
originalLoop = '''
        for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
            sum += data[smemPos + IMUL(k, COLUMN_TILE_W)] *
                d_Kernel_columns[KERNEL_RADIUS - k];
'''
loopTemplate = string.Template(
    '        sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] *'
    ' d_Kernel_columns[KERNEL_RADIUS - $k];\n')
unrolledLoop = ''
for k in range(-KERNEL_RADIUS, KERNEL_RADIUS + 1):
    unrolledLoop += loopTemplate.substitute(k=k)

# print(unrolledLoop)
template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
template += '''
        d_Result[gmemPos] = sum;
        //d_Result[gmemPos] = 128;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''
template = string.Template(template)
code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS,
                           KERNEL_W=KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
# get_global() returns a (device_pointer, size_in_bytes) pair; we only need
# the pointer to the constant-memory filter coefficients.
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment.  Note: these use floor division
# (//) so the results stay integral under Python 3.
def iDivUp(a, b):
    # Round a / b to the nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a // b + 1) if (a % b != 0) else (a // b)

def iDivDown(a, b):
    # Round a / b to the nearest lower integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a // b

def iAlignUp(a, b):
    # Align a to the nearest higher multiple of b
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a - a % b + b) if (a % b != 0) else a

def iAlignDown(a, b):
    # Align a to the nearest lower multiple of b
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a - a % b

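# Quick sanity checks for the alignment helpers (illustrative values, not part
# of the original sample):
assert iDivUp(100, 16) == 7 and iDivDown(100, 16) == 6
assert iAlignUp(100, 16) == 112 and iAlignDown(100, 16) == 96
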
def gaussian_kernel(width=KERNEL_W, sigma=4.0):
    assert width == numpy.floor(width), 'argument width should be an integer!'
    radius = (width - 1) / 2.0
    x = numpy.linspace(-radius, radius, width)
    x = numpy.float32(x)
    sigma = numpy.float32(sigma)
    filterx = x * x / (2 * sigma * sigma)
    filterx = numpy.exp(-1 * filterx)
    assert filterx.sum() > 0, 'something very wrong if gaussian kernel sums to zero!'
    filterx /= filterx.sum()
    return filterx

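# A minimal usage sketch (not in the original sample): the default kernel is
# normalized and symmetric.
_k = gaussian_kernel()
assert abs(_k.sum() - 1.0) < 1e-5
assert numpy.allclose(_k, _k[::-1])
del _k
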
def derivative_of_gaussian_kernel(width=KERNEL_W, sigma=4):
    assert width == numpy.floor(width), 'argument width should be an integer!'
    radius = (width - 1) / 2.0
    x = numpy.linspace(-radius, radius, width)
    x = numpy.float32(x)
    # The derivative of a Gaussian is really just a Gaussian times x, up to scale.
    filterx = gaussian_kernel(width, sigma)
    filterx *= x
    # Rescale so that the filter returns a derivative of 1 when applied to x:
    scale = (x * filterx).sum()
    filterx /= scale
    # Careful with the sign; this will be used as a convolution kernel, so it
    # should start positive, then go negative.
    filterx *= -1.0
    return filterx

def test_derivative_of_gaussian_kernel():
    width = 20
    sigma = 10.0
    filterx = derivative_of_gaussian_kernel(width, sigma)
    x = 2 * numpy.arange(0, width)
    x = numpy.float32(x)
    response = (filterx * x).sum()
    assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'
    width = 19
    sigma = 10.0
    filterx = derivative_of_gaussian_kernel(width, sigma)
    x = 2 * numpy.arange(0, width)
    x = numpy.float32(x)
    response = (filterx * x).sum()
    assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'

def convolution_cuda(sourceImage, filterx, filtery):
    # Perform separable convolution on sourceImage using CUDA.
    # Operates on floating point images with row-major storage.
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    (imageHeight, imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ), \
        'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    # The device buffers below are sized for the unpadded image, so the image
    # width is effectively assumed to already be a multiple of 16; fail loudly
    # rather than index out of bounds.
    assert DATA_W == imageWidth, 'image width must be a multiple of 16!'
    BYTES_PER_WORD = 4  # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # The filter coefficients go into constant memory via the symbols defined
    # in the kernel source.
    cuda.memcpy_htod(d_Kernel_rows, filterx)
    cuda.memcpy_htod(d_Kernel_columns, filtery)
    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H,
                      grid=tuple(int(e) for e in blockGridRows),
                      block=tuple(int(e) for e in threadBlockRows))
    convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H,
                         numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),
                         numpy.int32(DATA_W * threadBlockColumns[1]),
                         grid=tuple(int(e) for e in blockGridColumns),
                         block=tuple(int(e) for e in threadBlockColumns))

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage

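# A hedged CPU reference for checking the GPU result (not part of the original
# sample).  The kernels zero-pad at the image borders, which matches
# numpy.convolve's mode='same' behavior: convolve each row with filterx, then
# each column with filtery.
def convolution_cpu_reference(sourceImage, filterx, filtery):
    temp = numpy.array([numpy.convolve(row, filterx, mode='same')
                        for row in sourceImage])
    out = numpy.array([numpy.convolve(col, filtery, mode='same')
                       for col in temp.T]).T
    return numpy.float32(out)
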
def test_convolution_cuda():
    # Test the convolution kernel.
    # Generate or load a test image
    original = numpy.random.rand(768, 1024) * 255
    original = numpy.float32(original)
    # You probably want to display the image using the tool of your choice here.
    filterx = gaussian_kernel()
    destImage = original.copy()
    destImage[:] = numpy.nan
    destImage = convolution_cuda(original, filterx, filterx)
    # You probably want to display the result image using the tool of your choice here.
    print('Done running the convolution kernel!')

if __name__ == '__main__':
    test_convolution_cuda()
    # test_derivative_of_gaussian_kernel()
    boo = input('Pausing so you can look at results... <Enter> to finish...')