import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

# Each thread handles thread_strides elements, each block runs block_size
# threads, and the grid consists of macroblock_count blocks.
thread_strides = 16
block_size = 32
macroblock_count = 33

total_size = thread_strides * block_size * macroblock_count
dtype = numpy.float32

# Random input vectors on the host.
a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

# Copy the inputs to the device and allocate space for the result.
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

from jinja2 import Template

# The kernel source is a Jinja2 template: the scalar type, the block size,
# and the number of elements per thread are substituted at render time, and
# the per-thread loop is unrolled by the template engine rather than by the
# CUDA compiler.
tpl = Template("""
    __global__ void add(
            {{ type_name }} *tgt,
            {{ type_name }} *op1,
            {{ type_name }} *op2)
    {
      int idx = threadIdx.x +
        {{ block_size }} * {{ thread_strides }}
        * blockIdx.x;

      {% for i in range(thread_strides) %}
          {% set offset = i*block_size %}
          tgt[idx + {{ offset }}] =
            op1[idx + {{ offset }}]
            + op2[idx + {{ offset }}];
      {% endfor %}
    }""")

rendered_tpl = tpl.render(
    type_name="float", thread_strides=thread_strides,
    block_size=block_size)

# Compile the rendered CUDA source on the fly.
mod = SourceModule(rendered_tpl)

# Launch one block per macroblock; each thread adds thread_strides
# elements of the input vectors.
func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu,
        block=(block_size, 1, 1),
        grid=(macroblock_count, 1))

# Copy the result back to the host and check it against numpy.
c = cuda.from_device_like(c_gpu, a)

assert la.norm(c - (a + b)) == 0
