"""Demo: metaprogram a CUDA kernel with a Jinja2 template and run it via PyCUDA.

The script renders an element-wise vector-add kernel whose inner loop is
unrolled at template-rendering time, compiles the rendered source, launches
it on two random input vectors, and checks the result against NumPy.
"""
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- imported for its side effects only
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule
from jinja2 import Template

# Launch geometry: each thread handles `thread_strides` elements, each block
# has `block_size` threads, and `macroblock_count` blocks are launched.
thread_strides = 16
block_size = 32
macroblock_count = 33

# The vectors are sized so that the launch covers every element exactly once.
total_size = thread_strides*block_size*macroblock_count
dtype = numpy.float32

a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

# Copy both operands to the device and reserve space for the result.
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

# Kernel template. The Jinja2 `for` loop is expanded when the template is
# rendered, so the generated kernel contains `thread_strides` straight-line
# add statements, each `block_size` elements apart, instead of a runtime loop.
tpl = Template("""
    __global__ void add(
            {{ type_name }} *tgt,
            {{ type_name }} *op1,
            {{ type_name }} *op2)
    {
      int idx = threadIdx.x +
        {{ block_size }} * {{thread_strides}}
        * blockIdx.x;

      {% for i in range(thread_strides) %}
          {% set offset = i*block_size %}
          tgt[idx + {{ offset }}] =
            op1[idx + {{ offset }}]
            + op2[idx + {{ offset }}];
      {% endfor %}
    }""")

# NOTE: `type_name` is the C element type of the kernel arguments and must be
# kept in sync with `dtype` above (numpy.float32 <-> "float").
rendered_tpl = tpl.render(
    type_name="float", thread_strides=thread_strides,
    block_size=block_size)

mod = SourceModule(rendered_tpl)


func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu,
     block=(block_size, 1, 1),
     grid=(macroblock_count, 1))

# Fetch the result into a host array shaped/typed like `a`.
c = cuda.from_device_like(c_gpu, a)

# The device result must match NumPy's element-wise sum.
assert la.norm(c-(a+b)) == 0
# PyCuda/Examples/DemoMetaTemplate (last edited 2010-01-31 16:30:33 by ip72-221-120-106)