This example requires the cgen package.
# Element-wise addition of two float32 vectors on the GPU.
#
# The CUDA kernel is not written by hand: its source is assembled from cgen
# syntax-tree nodes, handed to PyCUDA's SourceModule, launched once, and the
# result is compared against the same sum computed by NumPy on the host.
import pycuda.driver as cuda
# Never referenced by name below; imported only for its import-time side
# effects (presumably CUDA context setup -- confirm against the PyCUDA docs).
import pycuda.autoinit  # noqa: F401
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

# Imported up front so that a missing cgen installation fails before any
# device memory is allocated.
from cgen import FunctionBody, \
        FunctionDeclaration, Typedef, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

# Work decomposition used by both the generated kernel and the launch below.
thread_strides = 16     # elements handled by each thread
block_size = 256        # threads per block (x dimension)
macroblock_count = 33   # blocks in the grid (x dimension)

# The generated kernel contains no bounds check, so the array length must be
# exactly the number of elements the launch touches.
total_size = thread_strides * block_size * macroblock_count
dtype = numpy.float32

# Host-side operands.
a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

# Device-side operands, plus an output buffer with the same byte size as `a`.
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

# Syntax tree for a kernel `add(tgt, op1, op2)` taking three pointers to
# `dtype`.  Its body is fully unrolled at generation time:
#   * `idx` is the thread's first element; consecutive blocks start
#     block_size*thread_strides elements apart;
#   * one assignment is emitted per stride `o`, each addressing the element
#     o*block_size past `idx`, so the generated code contains no loop.
mod = Module([
    FunctionBody(
        CudaGlobal(FunctionDeclaration(
            Value("void", "add"),
            arg_decls=[Pointer(POD(dtype, name))
                       for name in ["tgt", "op1", "op2"]])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"),
                "threadIdx.x + %d*blockIdx.x"
                % (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size),
                "op1[idx+%d] + op2[idx+%d]" % (
                    o * block_size,
                    o * block_size))
            for o in range(thread_strides)
        ])),
])

# From here on `mod` names the module returned by SourceModule, not the cgen
# tree.  NOTE(review): the cgen Module object itself (not a string) is passed
# in; this relies on SourceModule accepting it -- confirm against the PyCUDA
# version in use.
mod = SourceModule(mod)

func = mod.get_function("add")
# Launch shape matches the constants the kernel text was generated with.
func(c_gpu, a_gpu, b_gpu,
     block=(block_size, 1, 1),
     grid=(macroblock_count, 1))

# Fetch the result into a new host array shaped and typed like `a`.
c = cuda.from_device_like(c_gpu, a)

# Exact comparison against the host-side sum.
assert la.norm(c - (a + b)) == 0, "GPU result differs from host a + b"
