Port of SobelFilter example from CUDA C SDK

Python port of SobelFilter example in NVIDIA CUDA C SDK. Show how opengl interoperability works.

stefanobrilli@gmail.com

License of this example:

GPL

Date:

October 2011

PyCUDA version:

2011.1.2

   1 #!/usr/bin/env python
   2 #-*- coding: utf-8 -*-
   3 #
   4 # Requires PyCuda, PyOpenGL, and Pil
   5 # MAKE SURE YOU HAVE AN UPDATED VERSION OF THESE PACKAGES!!
   6 #
   7 # Ported to PyCUDA by
   8 # Stefano Brilli: stefanobrilli@gmail.com
   9 #
  10 # Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  11 #
  12 # This software contains source code provided by NVIDIA Corporation
  13 #
  14 # http://developer.download.nvidia.com/compute/cuda/2_3/sdk/docs/cudasdk_eula.pdf
  15 #
  16 # Please refer to the NVIDIA end user license agreement (EULA) associated
  17 # with this source code for terms and conditions that govern your use of
  18 # this software. Any use, reproduction, disclosure, or distribution of
  19 # this software and related documentation outside the terms of the EULA
  20 # is strictly prohibited.
  21 #
  22 
  23 from OpenGL.GL import *
  24 from OpenGL.GLUT import *
  25 from OpenGL.GLU import *
  26 from OpenGL.GL.ARB.vertex_buffer_object import *
  27 import numpy as np, Image
  28 import sys, time, os
  29 import pycuda.driver as cuda_driver
  30 import pycuda.gl as cuda_gl
  31 import pycuda
  32 #import pycuda.gl.autoinit
  33 from pycuda.compiler import SourceModule
  34 
  35 imWidth = 0
  36 imHeight = 0
  37 wWidth = 0
  38 wHeight = 0
  39 wName = "Cuda Edge Detection:"
  40 pixels = None
  41 array = None
  42 texid = 0
  43 pbo_buffer = None
  44 cuda_pbo_resource = None
  45 mode = 0
  46 scale = 1.0
  47 
  48 frameCount = 0
  49 fpsCount = 0
  50 fpsLimit = 8
  51 timer = 0.0
  52 ver2011 = False
  53 
  54 def copy2D_array_to_device(dst, src, type_sz, width, height):
  55     copy = cuda_driver.Memcpy2D()
  56     copy.set_src_array(src)
  57     copy.set_dst_device(dst)
  58     copy.height = height
  59     copy.dst_pitch = copy.src_pitch = copy.width_in_bytes = width*type_sz
  60     copy(aligned=True)
  61 
  62 def computeFPS():
  63     global frameCount, fpsCount, fpsLimit, timer
  64     frameCount += 1
  65     fpsCount += 1
  66     if fpsCount == fpsLimit:
  67         ifps = 1.0 /timer
  68         glutSetWindowTitle("Cuda Edge Detection: %f fps" % ifps)
  69         fpsCount = 0
  70 
  71 def sobelFilter(odata, iw, ih):
  72     global array, pixels, mode, scale
  73     if mode == 3:
  74         # Texture and shared memory with fixed BlockSize
  75         sm = SourceModule("""
  76             texture<unsigned char, 2> tex;
  77             extern __shared__ unsigned char LocalBlock[];
  78             #define RADIUS 1
  79             #define BlockWidth 80
  80             #define SharedPitch 384
  81             __device__ unsigned char
  82             ComputeSobel(unsigned char ul, // upper left
  83                          unsigned char um, // upper middle
  84                          unsigned char ur, // upper right
  85                          unsigned char ml, // middle left
  86                          unsigned char mm, // middle (unused)
  87                          unsigned char mr, // middle right
  88                          unsigned char ll, // lower left
  89                          unsigned char lm, // lower middle
  90                          unsigned char lr, // lower right
  91                          float fScale )
  92             {
  93                 short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
  94                 short Vert = ul + 2*um + ur - ll - 2*lm - lr;
  95                 short Sum = (short) (fScale*(::abs(int(Horz))+::abs(int(Vert))));
  96                 if ( Sum < 0 ) return 0; else if ( Sum > 0xff ) return 0xff;
  97                 return (unsigned char) Sum;
  98             }
  99 
 100             __global__ void
 101             SobelShared( int* pSobelOriginal, unsigned short SobelPitch,
 102                          short w, short h, float fScale )
 103             {
 104                 short u = 4*blockIdx.x*BlockWidth;
 105                 short v = blockIdx.y*blockDim.y + threadIdx.y;
 106                 short ib;
 107 
 108                 int SharedIdx = threadIdx.y * SharedPitch;
 109 
 110                 for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
 111                     LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
 112                         (float) (u+4*ib-RADIUS+0), (float) (v-RADIUS) );
 113                     LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
 114                         (float) (u+4*ib-RADIUS+1), (float) (v-RADIUS) );
 115                     LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
 116                         (float) (u+4*ib-RADIUS+2), (float) (v-RADIUS) );
 117                     LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
 118                         (float) (u+4*ib-RADIUS+3), (float) (v-RADIUS) );
 119                 }
 120                 if ( threadIdx.y < RADIUS*2 ) {
 121                     //
 122                     // copy trailing RADIUS*2 rows of pixels into shared
 123                     //
 124                     SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch;
 125                     for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
 126                         LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
 127                             (float) (u+4*ib-RADIUS+0), (float) (v+blockDim.y-RADIUS) );
 128                         LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
 129                             (float) (u+4*ib-RADIUS+1), (float) (v+blockDim.y-RADIUS) );
 130                         LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
 131                             (float) (u+4*ib-RADIUS+2), (float) (v+blockDim.y-RADIUS) );
 132                         LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
 133                             (float) (u+4*ib-RADIUS+3), (float) (v+blockDim.y-RADIUS) );
 134                     }
 135                 }
 136 
 137                 __syncthreads();
 138 
 139                 u >>= 2;    // index as uchar4 from here
 140                 uchar4 *pSobel = (uchar4 *) (((char *) pSobelOriginal)+v*SobelPitch);
 141                 SharedIdx = threadIdx.y * SharedPitch;
 142 
 143                 for ( ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x ) {
 144 
 145                     unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0];
 146                     unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1];
 147                     unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2];
 148                     unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0];
 149                     unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1];
 150                     unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2];
 151                     unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0];
 152                     unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1];
 153                     unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2];
 154 
 155                     uchar4 out;
 156 
 157                     out.x = ComputeSobel(pix00, pix01, pix02,
 158                                          pix10, pix11, pix12,
 159                                          pix20, pix21, pix22, fScale );
 160 
 161                     pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3];
 162                     pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3];
 163                     pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3];
 164                     out.y = ComputeSobel(pix01, pix02, pix00,
 165                                          pix11, pix12, pix10,
 166                                          pix21, pix22, pix20, fScale );
 167 
 168                     pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4];
 169                     pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4];
 170                     pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4];
 171                     out.z = ComputeSobel( pix02, pix00, pix01,
 172                                           pix12, pix10, pix11,
 173                                           pix22, pix20, pix21, fScale );
 174 
 175                     pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5];
 176                     pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5];
 177                     pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5];
 178                     out.w = ComputeSobel( pix00, pix01, pix02,
 179                                           pix10, pix11, pix12,
 180                                           pix20, pix21, pix22, fScale );
 181                     if ( u+ib < w/4 && v < h ) {
 182                         pSobel[u+ib] = out;
 183                     }
 184                 }
 185 
 186                 __syncthreads();
 187             }
 188         """)
 189         cuda_function = sm.get_function("SobelShared")
 190     elif mode == 2:
 191         # Texture and shared memory with variable BlockSize
 192         sm = SourceModule("""
 193         #define RADIUS 1
 194         texture<unsigned char, 2> tex;
 195         extern __shared__ unsigned char LocalBlock[];
 196         __device__ unsigned char
 197         ComputeSobel(unsigned char ul, // upper left
 198                      unsigned char um, // upper middle
 199                      unsigned char ur, // upper right
 200                      unsigned char ml, // middle left
 201                      unsigned char mm, // middle (unused)
 202                      unsigned char mr, // middle right
 203                      unsigned char ll, // lower left
 204                      unsigned char lm, // lower middle
 205                      unsigned char lr, // lower right
 206                      float fScale )
 207         {
 208             short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
 209             short Vert = ul + 2*um + ur - ll - 2*lm - lr;
 210             short Sum = (short) (fScale*(::abs(int(Horz))+::abs(int(Vert))));
 211             if ( Sum < 0 ) return 0; else if ( Sum > 0xff ) return 0xff;
 212             return (unsigned char) Sum;
 213         }
 214 
 215         __global__ void
 216         SobelShared( int* pSobelOriginal, unsigned short SobelPitch,
 217                      short BlockWidth, short SharedPitch,
 218                      short w, short h, float fScale )
 219         {
 220             short u = 4*blockIdx.x*BlockWidth;
 221             short v = blockIdx.y*blockDim.y + threadIdx.y;
 222             short ib;
 223 
 224             int SharedIdx = threadIdx.y * SharedPitch;
 225 
 226             for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
 227                 LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
 228                     (float) (u+4*ib-RADIUS+0), (float) (v-RADIUS) );
 229                 LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
 230                     (float) (u+4*ib-RADIUS+1), (float) (v-RADIUS) );
 231                 LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
 232                     (float) (u+4*ib-RADIUS+2), (float) (v-RADIUS) );
 233                 LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
 234                     (float) (u+4*ib-RADIUS+3), (float) (v-RADIUS) );
 235             }
 236             if ( threadIdx.y < RADIUS*2 ) {
 237                 //
 238                 // copy trailing RADIUS*2 rows of pixels into shared
 239                 //
 240                 SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch;
 241                 for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
 242                     LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
 243                         (float) (u+4*ib-RADIUS+0), (float) (v+blockDim.y-RADIUS) );
 244                     LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
 245                         (float) (u+4*ib-RADIUS+1), (float) (v+blockDim.y-RADIUS) );
 246                     LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
 247                         (float) (u+4*ib-RADIUS+2), (float) (v+blockDim.y-RADIUS) );
 248                     LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
 249                         (float) (u+4*ib-RADIUS+3), (float) (v+blockDim.y-RADIUS) );
 250                 }
 251             }
 252 
 253             __syncthreads();
 254 
 255             u >>= 2;    // index as uchar4 from here
 256             uchar4 *pSobel = (uchar4 *) (((char *) pSobelOriginal)+v*SobelPitch);
 257             SharedIdx = threadIdx.y * SharedPitch;
 258 
 259             for ( ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x ) {
 260 
 261                 unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0];
 262                 unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1];
 263                 unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2];
 264                 unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0];
 265                 unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1];
 266                 unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2];
 267                 unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0];
 268                 unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1];
 269                 unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2];
 270 
 271                 uchar4 out;
 272 
 273                 out.x = ComputeSobel(pix00, pix01, pix02,
 274                                      pix10, pix11, pix12,
 275                                      pix20, pix21, pix22, fScale );
 276 
 277                 pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3];
 278                 pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3];
 279                 pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3];
 280                 out.y = ComputeSobel(pix01, pix02, pix00,
 281                                      pix11, pix12, pix10,
 282                                      pix21, pix22, pix20, fScale );
 283 
 284                 pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4];
 285                 pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4];
 286                 pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4];
 287                 out.z = ComputeSobel( pix02, pix00, pix01,
 288                                       pix12, pix10, pix11,
 289                                       pix22, pix20, pix21, fScale );
 290 
 291                 pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5];
 292                 pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5];
 293                 pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5];
 294                 out.w = ComputeSobel( pix00, pix01, pix02,
 295                                       pix10, pix11, pix12,
 296                                       pix20, pix21, pix22, fScale );
 297                 if ( u+ib < w/4 && v < h ) {
 298                     pSobel[u+ib] = out;
 299                 }
 300             }
 301 
 302             __syncthreads();
 303         }
 304 
 305         """)
 306         cuda_function = sm.get_function("SobelShared")
 307     if mode == 1:
 308         # Just Texture
 309         sm = SourceModule("""
 310         texture<unsigned char, 2> tex;
 311         __device__ unsigned char ComputeSobel(unsigned char ul, // upper left
 312                      unsigned char um, // upper middle
 313                      unsigned char ur, // upper right
 314                      unsigned char ml, // middle left
 315                      unsigned char mm, // middle (unused)
 316                      unsigned char mr, // middle right
 317                      unsigned char ll, // lower left
 318                      unsigned char lm, // lower middle
 319                      unsigned char lr, // lower right
 320                      float fScale )
 321         {
 322             short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
 323             short Vert = ul + 2*um + ur - ll - 2*lm - lr;
 324             short Sum = (short) (fScale*(::abs(int(Horz))+::abs(int(Vert))));
 325             if ( Sum < 0 ) return 0; else if ( Sum > 0xff ) return 0xff;
 326             return (unsigned char) Sum;
 327         }
 328         __global__ void SobelTex( int* pSobelOriginal, unsigned int Pitch,
 329                   int w, int h, float fScale )
 330         {
 331             unsigned char *pSobel =
 332               (unsigned char *) (((char *) pSobelOriginal)+blockIdx.x*Pitch);
 333             for ( int i = threadIdx.x; i < w; i += blockDim.x ) {
 334                 unsigned char pix00 = tex2D( tex, (float) i-1, (float) blockIdx.x-1 );
 335                 unsigned char pix01 = tex2D( tex, (float) i+0, (float) blockIdx.x-1 );
 336                 unsigned char pix02 = tex2D( tex, (float) i+1, (float) blockIdx.x-1 );
 337                 unsigned char pix10 = tex2D( tex, (float) i-1, (float) blockIdx.x+0 );
 338                 unsigned char pix11 = tex2D( tex, (float) i+0, (float) blockIdx.x+0 );
 339                 unsigned char pix12 = tex2D( tex, (float) i+1, (float) blockIdx.x+0 );
 340                 unsigned char pix20 = tex2D( tex, (float) i-1, (float) blockIdx.x+1 );
 341                 unsigned char pix21 = tex2D( tex, (float) i+0, (float) blockIdx.x+1 );
 342                 unsigned char pix22 = tex2D( tex, (float) i+1, (float) blockIdx.x+1 );
 343                 pSobel[i] = ComputeSobel(pix00, pix01, pix02,
 344                                          pix10, pix11, pix12,
 345                                          pix20, pix21, pix22, fScale );
 346             }
 347         }
 348         """)
 349         cuda_function = sm.get_function("SobelTex")
 350     elif mode == 0:
 351         # Just Copy
 352         sm = SourceModule("""
 353         texture<unsigned char, 2> tex;
 354         __global__ void SobelCopyImage(int* pSobelOriginal, unsigned int Pitch, int w, int h, float fscale )
 355         {
 356             unsigned char *pSobel =
 357               (unsigned char *) (((unsigned char *) pSobelOriginal)+blockIdx.x*Pitch);
 358             for ( int i = threadIdx.x; i < w; i += blockDim.x ) {
 359                 pSobel[i] = min( max((tex2D( tex, (float)i, (float)blockIdx.x ) * fscale), 0.f), 255.f);
 360             }
 361         }
 362         """)
 363         cuda_function = sm.get_function("SobelCopyImage")
 364     texref = sm.get_texref("tex")
 365     texref.set_array(array)
 366     texref.set_flags(cuda_driver.TRSA_OVERRIDE_FORMAT)
 367     if mode == 3:
 368         # fixed BlockSize Launch
 369         RADIUS = 1
 370         threads = (16, 4, 1)
 371         BlockWidth = 80 # Do not change!
 372         blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)),
 373                                ih/threads[1]+(0!=ih%threads[1]) )
 374         SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f);
 375         sharedMem = SharedPitch*(threads[1]+2*RADIUS);
 376         iw = iw & ~3
 377         cuda_function(np.intp(odata), np.uint16(iw), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem)
 378     elif mode == 2:
 379         # variable BlockSize launch
 380         RADIUS = 1
 381         threads = (16, 4, 1)
 382         BlockWidth = 80 # Change only with divisible by 16 values!
 383         blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)),
 384                                ih/threads[1]+(0!=ih%threads[1]) )
 385         SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f);
 386         sharedMem = SharedPitch*(threads[1]+2*RADIUS);
 387         iw = iw & ~3
 388         cuda_function(np.intp(odata), np.uint16(iw), np.int16(BlockWidth), np.int16(SharedPitch), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem)
 389     else:
 390         BlockWidth = 384
 391         cuda_function(np.intp(odata), np.uint32(iw), np.int32(iw), np.int32(ih), np.float32(scale), texrefs=[texref],block=(BlockWidth,1,1),grid=(ih,1))
 392 
 393 def initGL():
 394     global wWidth, wHeight, wName
 395     glutInit(sys.argv)
 396     glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA)
 397     glutInitWindowSize(wWidth, wHeight)
 398     glutCreateWindow(wName)
 399     import pycuda.gl.autoinit
 400 
 401 def loadImage(fn=None):
 402     global pixels, imWidth, imHeight, wWidth, wHeight
 403     try:
 404         im = Image.open(fn) # Open the image
 405     except IOError:
 406         print "Usage:", os.path.basename(sys.argv[0]), "[IMAGE=defaultimage.jpg]"
 407         print "Can't open", fn
 408         sys.exit(1)
 409     imWidth, imHeight = im.size # Window size is set to image size
 410     wWidth, wHeight = im.size
 411     im.draft("L", im.size) # L-flag is for Luminance
 412     pixels = np.fromstring(im.tostring(), dtype=np.uint8) # Got the array
 413     pixels.resize((imHeight, imWidth)) # Resize to 2d array
 414     print "Reading image:", fn, "size:", imWidth, "x", imHeight
 415 
 416 def initData(fn=None):
 417     global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid
 418 
 419     # Cuda array initialization
 420     array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major
 421 
 422     pixels.fill(0) # Resetting the array to 0
 423 
 424     pbo_buffer = glGenBuffers(1) # generate 1 buffer reference
 425     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer
 426     glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer
 427     bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size
 428     assert(bsize == imWidth*imHeight)
 429     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind
 430 
 431     if ver2011:
 432         cuda_pbo_resource = pycuda.gl.RegisteredBuffer(int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD)
 433     else:
 434         cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource
 435 
 436 
 437     glGenTextures(1, texid); # generate 1 texture reference
 438     glBindTexture(GL_TEXTURE_2D, texid); # binding to this texture
 439     glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight,  0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); # Allocate the texture
 440     glBindTexture(GL_TEXTURE_2D, 0) # Unbind
 441 
 442     glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment
 443     glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment
 444 
 445 
 446 def display():
 447     global cuda_pbo_resource, pbo_buffer, texid, imWidth, imHeight, timer
 448 
 449     timer = time.time() # Starting timer
 450     mapping_obj = cuda_pbo_resource.map() # Map the GlBuffer
 451     if ver2011:
 452         data, sz = mapping_obj.device_ptr_and_size() # Got the CUDA pointer to GlBuffer
 453     else:
 454         data = mapping_obj.device_ptr()
 455     sobelFilter(data, imWidth, imHeight) # Writing to "data"
 456     mapping_obj.unmap() # Unmap the GlBuffer
 457 
 458     glClear(GL_COLOR_BUFFER_BIT) # Clear
 459     glBindTexture(GL_TEXTURE_2D, texid)
 460     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer)
 461     # Copyng from buffer to texture
 462     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, None)
 463     #glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight,  0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None);
 464     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind
 465 
 466     glDisable(GL_DEPTH_TEST)
 467     glEnable(GL_TEXTURE_2D)
 468     glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR)
 469     glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR)
 470     glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT)
 471     glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT)
 472 
 473     glBegin(GL_QUADS)
 474     glVertex2f(0, 0)
 475     glTexCoord2f(0, 0)
 476     glVertex2f(0, 1)
 477     glTexCoord2f(1, 0)
 478     glVertex2f(1, 1)
 479     glTexCoord2f(1, 1)
 480     glVertex2f(1, 0)
 481     glTexCoord2f(0, 1)
 482     glEnd()
 483     glBindTexture(GL_TEXTURE_2D, 0)
 484     glutSwapBuffers()
 485     timer = time.time()-timer
 486     computeFPS()
 487     glutPostRedisplay()
 488 
 489 def reshape(x, y):
 490     glViewport(0, 0, x, y)
 491     glMatrixMode(GL_PROJECTION)
 492     glLoadIdentity()
 493     glOrtho(0, 1, 0, 1, 0, 1)
 494     glMatrixMode(GL_MODELVIEW)
 495     glLoadIdentity()
 496     glutPostRedisplay()
 497 
 498 def keyboard(key, x=0, y=0):
 499     global mode, scale
 500     if key=="q":
 501         sys.exit(0)
 502     elif key=="I" or key=="i":
 503         mode = 0
 504     elif key=="T" or key=="t":
 505         mode = 1
 506     elif key=="S" or key=="s":
 507         mode = 2
 508     elif key=="D" or key=="d":
 509         mode = 3
 510     elif key == "-":
 511         scale -= 0.1
 512     elif key == "=":
 513         scale += 0.1
 514 
 515 def idle():
 516     glutPostRedisplay()
 517 
 518 def main(argv):
 519     fn = "defaultimage.jpg"
 520     if len(argv) > 1:
 521         fn = argv[1]
 522 
 523     loadImage(fn) # Loading the image
 524 
 525     initGL()
 526     initData(fn)
 527     print """
 528     Q: Exit
 529     I: display image
 530     T: display Sobel edge detection (computed with tex)
 531     S: display Sobel edge detection (computed with tex+shared memory)
 532     D: display Sobel edge detection (computed with tex+shared memory+fixed block size)
 533     Use the '-' and '=' keys to change the brightness.
 534 
 535     TESTED WITH IMAGE SIZE OF 512x512... just like the original demo.
 536     Other image sizes may not work
 537     """
 538     glutDisplayFunc(display)
 539     glutKeyboardFunc(keyboard)
 540     glutReshapeFunc(reshape)
 541     glutIdleFunc(idle)
 542     glutMainLoop();
 543 
 544 if __name__ == "__main__":
 545     if pycuda.VERSION[0] >= 2011:
 546         ver2011 = True
 547     main(sys.argv)