Requires the cgen package, which generates C/CUDA source from a Python-level syntax tree.
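cgen is distributed separately from PyCUDA; assuming a standard pip-based setup, it can be installed with

    pip install cgen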

import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

# Each thread computes `thread_strides` result entries, spaced `block_size`
# apart, so one block covers block_size*thread_strides entries.
thread_strides = 16
block_size = 256
macroblock_count = 33

total_size = thread_strides*block_size*macroblock_count
dtype = numpy.float32

a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

# Copy the operands to the device and allocate room for the result.
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

from cgen import FunctionBody, \
        FunctionDeclaration, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

# Build the kernel as a cgen syntax tree rather than a source string:
# __global__ void add(float *tgt, float *op1, float *op2), whose body
# computes each thread's starting index and then performs thread_strides
# unrolled additions, each a further block_size elements along.
mod = Module([
    FunctionBody(
        CudaGlobal(FunctionDeclaration(
            Value("void", "add"),
            arg_decls=[Pointer(POD(dtype, name))
                for name in ["tgt", "op1", "op2"]])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"),
                "threadIdx.x + %d*blockIdx.x"
                % (block_size*thread_strides)),
            ]+[
            Assign(
                "tgt[idx+%d]" % (o*block_size),
                "op1[idx+%d] + op2[idx+%d]" % (
                    o*block_size,
                    o*block_size))
            for o in range(thread_strides)]))])

# Compile the generated CUDA source.
mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu,
        block=(block_size,1,1),
        grid=(macroblock_count,1))

# Copy the result back and verify it against the CPU reference.
c = cuda.from_device_like(c_gpu, a)

assert la.norm(c-(a+b)) == 0
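
cgen syntax-tree objects render to C source via str(), which is handy for inspecting what will be handed to SourceModule; calling print(mod) just before the SourceModule line above would show the full generated kernel. Below is a minimal self-contained sketch (illustrative only, reusing the kernel signature from the example):

    import numpy
    from cgen import FunctionDeclaration, Value, Pointer, POD
    from cgen.cuda import CudaGlobal

    # Render just the kernel signature to CUDA C and print it.
    decl = CudaGlobal(FunctionDeclaration(
        Value("void", "add"),
        arg_decls=[Pointer(POD(numpy.float32, name))
            for name in ["tgt", "op1", "op2"]]))
    print(decl)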