Port of SobelFilter example from CUDA C SDK
Python port of SobelFilter example in NVIDIA CUDA C SDK. Show how opengl interoperability works.
stefanobrilli@gmail.com [[!table header="no" class="mointable" data=""" License of this example: | GPL Date: | October 2011 PyCUDA version: | 2011.1.2 """]]
#!python
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#
# Requires PyCuda, PyOpenGL, and Pil
# MAKE SURE YOU HAVE AN UPDATED VERSION OF THESE PACKAGES!!
#
# Ported to PyCUDA by
# Stefano Brilli: stefanobrilli@gmail.com
#
# Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
#
# This software contains source code provided by NVIDIA Corporation
#
# http://developer.download.nvidia.com/compute/cuda/2_3/sdk/docs/cudasdk_eula.pdf
#
# Please refer to the NVIDIA end user license agreement (EULA) associated
# with this source code for terms and conditions that govern your use of
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
#
from OpenGL.GL import *
from OpenGL.GLUT import *
from OpenGL.GLU import *
from OpenGL.GL.ARB.vertex_buffer_object import *
import numpy as np, Image
import sys, time, os
import pycuda.driver as cuda_driver
import pycuda.gl as cuda_gl
import pycuda
#import pycuda.gl.autoinit
from pycuda.compiler import SourceModule
imWidth = 0
imHeight = 0
wWidth = 0
wHeight = 0
wName = "Cuda Edge Detection:"
pixels = None
array = None
texid = 0
pbo_buffer = None
cuda_pbo_resource = None
mode = 0
scale = 1.0
frameCount = 0
fpsCount = 0
fpsLimit = 8
timer = 0.0
ver2011 = False
def copy2D_array_to_device(dst, src, type_sz, width, height):
copy = cuda_driver.Memcpy2D()
copy.set_src_array(src)
copy.set_dst_device(dst)
copy.height = height
copy.dst_pitch = copy.src_pitch = copy.width_in_bytes = width*type_sz
copy(aligned=True)
def computeFPS():
global frameCount, fpsCount, fpsLimit, timer
frameCount += 1
fpsCount += 1
if fpsCount == fpsLimit:
ifps = 1.0 /timer
glutSetWindowTitle("Cuda Edge Detection: %f fps" % ifps)
fpsCount = 0
def sobelFilter(odata, iw, ih):
global array, pixels, mode, scale
if mode == 3:
# Texture and shared memory with fixed BlockSize
sm = SourceModule("""
texture<unsigned char, 2> tex;
extern __shared__ unsigned char LocalBlock[];
#define RADIUS 1
#define BlockWidth 80
#define SharedPitch 384
__device__ unsigned char
ComputeSobel(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle (unused)
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fScale )
{
short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
short Vert = ul + 2*um + ur - ll - 2*lm - lr;
short Sum = (short) (fScale*(::abs(int(Horz))+::abs(int(Vert))));
if ( Sum < 0 ) return 0; else if ( Sum > 0xff ) return 0xff;
return (unsigned char) Sum;
}
__global__ void
SobelShared( int* pSobelOriginal, unsigned short SobelPitch,
short w, short h, float fScale )
{
short u = 4*blockIdx.x*BlockWidth;
short v = blockIdx.y*blockDim.y + threadIdx.y;
short ib;
int SharedIdx = threadIdx.y * SharedPitch;
for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
(float) (u+4*ib-RADIUS+0), (float) (v-RADIUS) );
LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
(float) (u+4*ib-RADIUS+1), (float) (v-RADIUS) );
LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
(float) (u+4*ib-RADIUS+2), (float) (v-RADIUS) );
LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
(float) (u+4*ib-RADIUS+3), (float) (v-RADIUS) );
}
if ( threadIdx.y < RADIUS*2 ) {
//
// copy trailing RADIUS*2 rows of pixels into shared
//
SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch;
for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
(float) (u+4*ib-RADIUS+0), (float) (v+blockDim.y-RADIUS) );
LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
(float) (u+4*ib-RADIUS+1), (float) (v+blockDim.y-RADIUS) );
LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
(float) (u+4*ib-RADIUS+2), (float) (v+blockDim.y-RADIUS) );
LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
(float) (u+4*ib-RADIUS+3), (float) (v+blockDim.y-RADIUS) );
}
}
__syncthreads();
u >>= 2; // index as uchar4 from here
uchar4 *pSobel = (uchar4 *) (((char *) pSobelOriginal)+v*SobelPitch);
SharedIdx = threadIdx.y * SharedPitch;
for ( ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x ) {
unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0];
unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1];
unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2];
unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0];
unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1];
unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2];
unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0];
unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1];
unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2];
uchar4 out;
out.x = ComputeSobel(pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale );
pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3];
pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3];
pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3];
out.y = ComputeSobel(pix01, pix02, pix00,
pix11, pix12, pix10,
pix21, pix22, pix20, fScale );
pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4];
pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4];
pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4];
out.z = ComputeSobel( pix02, pix00, pix01,
pix12, pix10, pix11,
pix22, pix20, pix21, fScale );
pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5];
pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5];
pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5];
out.w = ComputeSobel( pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale );
if ( u+ib < w/4 && v < h ) {
pSobel[u+ib] = out;
}
}
__syncthreads();
}
""")
cuda_function = sm.get_function("SobelShared")
elif mode == 2:
# Texture and shared memory with variable BlockSize
sm = SourceModule("""
#define RADIUS 1
texture<unsigned char, 2> tex;
extern __shared__ unsigned char LocalBlock[];
__device__ unsigned char
ComputeSobel(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle (unused)
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fScale )
{
short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
short Vert = ul + 2*um + ur - ll - 2*lm - lr;
short Sum = (short) (fScale*(::abs(int(Horz))+::abs(int(Vert))));
if ( Sum < 0 ) return 0; else if ( Sum > 0xff ) return 0xff;
return (unsigned char) Sum;
}
__global__ void
SobelShared( int* pSobelOriginal, unsigned short SobelPitch,
short BlockWidth, short SharedPitch,
short w, short h, float fScale )
{
short u = 4*blockIdx.x*BlockWidth;
short v = blockIdx.y*blockDim.y + threadIdx.y;
short ib;
int SharedIdx = threadIdx.y * SharedPitch;
for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
(float) (u+4*ib-RADIUS+0), (float) (v-RADIUS) );
LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
(float) (u+4*ib-RADIUS+1), (float) (v-RADIUS) );
LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
(float) (u+4*ib-RADIUS+2), (float) (v-RADIUS) );
LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
(float) (u+4*ib-RADIUS+3), (float) (v-RADIUS) );
}
if ( threadIdx.y < RADIUS*2 ) {
//
// copy trailing RADIUS*2 rows of pixels into shared
//
SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch;
for ( ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x ) {
LocalBlock[SharedIdx+4*ib+0] = tex2D( tex,
(float) (u+4*ib-RADIUS+0), (float) (v+blockDim.y-RADIUS) );
LocalBlock[SharedIdx+4*ib+1] = tex2D( tex,
(float) (u+4*ib-RADIUS+1), (float) (v+blockDim.y-RADIUS) );
LocalBlock[SharedIdx+4*ib+2] = tex2D( tex,
(float) (u+4*ib-RADIUS+2), (float) (v+blockDim.y-RADIUS) );
LocalBlock[SharedIdx+4*ib+3] = tex2D( tex,
(float) (u+4*ib-RADIUS+3), (float) (v+blockDim.y-RADIUS) );
}
}
__syncthreads();
u >>= 2; // index as uchar4 from here
uchar4 *pSobel = (uchar4 *) (((char *) pSobelOriginal)+v*SobelPitch);
SharedIdx = threadIdx.y * SharedPitch;
for ( ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x ) {
unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0];
unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1];
unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2];
unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0];
unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1];
unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2];
unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0];
unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1];
unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2];
uchar4 out;
out.x = ComputeSobel(pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale );
pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3];
pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3];
pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3];
out.y = ComputeSobel(pix01, pix02, pix00,
pix11, pix12, pix10,
pix21, pix22, pix20, fScale );
pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4];
pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4];
pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4];
out.z = ComputeSobel( pix02, pix00, pix01,
pix12, pix10, pix11,
pix22, pix20, pix21, fScale );
pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5];
pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5];
pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5];
out.w = ComputeSobel( pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale );
if ( u+ib < w/4 && v < h ) {
pSobel[u+ib] = out;
}
}
__syncthreads();
}
""")
cuda_function = sm.get_function("SobelShared")
if mode == 1:
# Just Texture
sm = SourceModule("""
texture<unsigned char, 2> tex;
__device__ unsigned char ComputeSobel(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle (unused)
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fScale )
{
short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
short Vert = ul + 2*um + ur - ll - 2*lm - lr;
short Sum = (short) (fScale*(::abs(int(Horz))+::abs(int(Vert))));
if ( Sum < 0 ) return 0; else if ( Sum > 0xff ) return 0xff;
return (unsigned char) Sum;
}
__global__ void SobelTex( int* pSobelOriginal, unsigned int Pitch,
int w, int h, float fScale )
{
unsigned char *pSobel =
(unsigned char *) (((char *) pSobelOriginal)+blockIdx.x*Pitch);
for ( int i = threadIdx.x; i < w; i += blockDim.x ) {
unsigned char pix00 = tex2D( tex, (float) i-1, (float) blockIdx.x-1 );
unsigned char pix01 = tex2D( tex, (float) i+0, (float) blockIdx.x-1 );
unsigned char pix02 = tex2D( tex, (float) i+1, (float) blockIdx.x-1 );
unsigned char pix10 = tex2D( tex, (float) i-1, (float) blockIdx.x+0 );
unsigned char pix11 = tex2D( tex, (float) i+0, (float) blockIdx.x+0 );
unsigned char pix12 = tex2D( tex, (float) i+1, (float) blockIdx.x+0 );
unsigned char pix20 = tex2D( tex, (float) i-1, (float) blockIdx.x+1 );
unsigned char pix21 = tex2D( tex, (float) i+0, (float) blockIdx.x+1 );
unsigned char pix22 = tex2D( tex, (float) i+1, (float) blockIdx.x+1 );
pSobel[i] = ComputeSobel(pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale );
}
}
""")
cuda_function = sm.get_function("SobelTex")
elif mode == 0:
# Just Copy
sm = SourceModule("""
texture<unsigned char, 2> tex;
__global__ void SobelCopyImage(int* pSobelOriginal, unsigned int Pitch, int w, int h, float fscale )
{
unsigned char *pSobel =
(unsigned char *) (((unsigned char *) pSobelOriginal)+blockIdx.x*Pitch);
for ( int i = threadIdx.x; i < w; i += blockDim.x ) {
pSobel[i] = min( max((tex2D( tex, (float)i, (float)blockIdx.x ) * fscale), 0.f), 255.f);
}
}
""")
cuda_function = sm.get_function("SobelCopyImage")
texref = sm.get_texref("tex")
texref.set_array(array)
texref.set_flags(cuda_driver.TRSA_OVERRIDE_FORMAT)
if mode == 3:
# fixed BlockSize Launch
RADIUS = 1
threads = (16, 4, 1)
BlockWidth = 80 # Do not change!
blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)),
ih/threads[1]+(0!=ih%threads[1]) )
SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f);
sharedMem = SharedPitch*(threads[1]+2*RADIUS);
iw = iw & ~3
cuda_function(np.intp(odata), np.uint16(iw), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem)
elif mode == 2:
# variable BlockSize launch
RADIUS = 1
threads = (16, 4, 1)
BlockWidth = 80 # Change only with divisible by 16 values!
blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)),
ih/threads[1]+(0!=ih%threads[1]) )
SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f);
sharedMem = SharedPitch*(threads[1]+2*RADIUS);
iw = iw & ~3
cuda_function(np.intp(odata), np.uint16(iw), np.int16(BlockWidth), np.int16(SharedPitch), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem)
else:
BlockWidth = 384
cuda_function(np.intp(odata), np.uint32(iw), np.int32(iw), np.int32(ih), np.float32(scale), texrefs=[texref],block=(BlockWidth,1,1),grid=(ih,1))
def initGL():
global wWidth, wHeight, wName
glutInit(sys.argv)
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA)
glutInitWindowSize(wWidth, wHeight)
glutCreateWindow(wName)
import pycuda.gl.autoinit
def loadImage(fn=None):
global pixels, imWidth, imHeight, wWidth, wHeight
try:
im = Image.open(fn) # Open the image
except IOError:
print "Usage:", os.path.basename(sys.argv[0]), "[IMAGE=defaultimage.jpg]"
print "Can't open", fn
sys.exit(1)
imWidth, imHeight = im.size # Window size is set to image size
wWidth, wHeight = im.size
im.draft("L", im.size) # L-flag is for Luminance
pixels = np.fromstring(im.tostring(), dtype=np.uint8) # Got the array
pixels.resize((imHeight, imWidth)) # Resize to 2d array
print "Reading image:", fn, "size:", imWidth, "x", imHeight
def initData(fn=None):
global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid
# Cuda array initialization
array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major
pixels.fill(0) # Resetting the array to 0
pbo_buffer = glGenBuffers(1) # generate 1 buffer reference
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer
glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer
bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size
assert(bsize == imWidth*imHeight)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind
if ver2011:
cuda_pbo_resource = pycuda.gl.RegisteredBuffer(int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD)
else:
cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource
glGenTextures(1, texid); # generate 1 texture reference
glBindTexture(GL_TEXTURE_2D, texid); # binding to this texture
glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); # Allocate the texture
glBindTexture(GL_TEXTURE_2D, 0) # Unbind
glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment
glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment
def display():
global cuda_pbo_resource, pbo_buffer, texid, imWidth, imHeight, timer
timer = time.time() # Starting timer
mapping_obj = cuda_pbo_resource.map() # Map the GlBuffer
if ver2011:
data, sz = mapping_obj.device_ptr_and_size() # Got the CUDA pointer to GlBuffer
else:
data = mapping_obj.device_ptr()
sobelFilter(data, imWidth, imHeight) # Writing to "data"
mapping_obj.unmap() # Unmap the GlBuffer
glClear(GL_COLOR_BUFFER_BIT) # Clear
glBindTexture(GL_TEXTURE_2D, texid)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer)
# Copyng from buffer to texture
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, None)
#glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind
glDisable(GL_DEPTH_TEST)
glEnable(GL_TEXTURE_2D)
glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR)
glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR)
glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT)
glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT)
glBegin(GL_QUADS)
glVertex2f(0, 0)
glTexCoord2f(0, 0)
glVertex2f(0, 1)
glTexCoord2f(1, 0)
glVertex2f(1, 1)
glTexCoord2f(1, 1)
glVertex2f(1, 0)
glTexCoord2f(0, 1)
glEnd()
glBindTexture(GL_TEXTURE_2D, 0)
glutSwapBuffers()
timer = time.time()-timer
computeFPS()
glutPostRedisplay()
def reshape(x, y):
glViewport(0, 0, x, y)
glMatrixMode(GL_PROJECTION)
glLoadIdentity()
glOrtho(0, 1, 0, 1, 0, 1)
glMatrixMode(GL_MODELVIEW)
glLoadIdentity()
glutPostRedisplay()
def keyboard(key, x=0, y=0):
global mode, scale
if key=="q":
sys.exit(0)
elif key=="I" or key=="i":
mode = 0
elif key=="T" or key=="t":
mode = 1
elif key=="S" or key=="s":
mode = 2
elif key=="D" or key=="d":
mode = 3
elif key == "-":
scale -= 0.1
elif key == "=":
scale += 0.1
def idle():
glutPostRedisplay()
def main(argv):
fn = "defaultimage.jpg"
if len(argv) > 1:
fn = argv[1]
loadImage(fn) # Loading the image
initGL()
initData(fn)
print """
Q: Exit
I: display image
T: display Sobel edge detection (computed with tex)
S: display Sobel edge detection (computed with tex+shared memory)
D: display Sobel edge detection (computed with tex+shared memory+fixed block size)
Use the '-' and '=' keys to change the brightness.
TESTED WITH IMAGE SIZE OF 512x512... just like the original demo.
Other image sizes may not work
"""
glutDisplayFunc(display)
glutKeyboardFunc(keyboard)
glutReshapeFunc(reshape)
glutIdleFunc(idle)
glutMainLoop();
if __name__ == "__main__":
if pycuda.VERSION[0] >= 2011:
ver2011 = True
main(sys.argv)