1 import numpy
2 import pycuda.autoinit
3 import pycuda.driver as drv
4 import pycuda.gpuarray as gpuarray
5 from pycuda.tools import context_dependent_memoize
6
7
8
9
def main(dtype):
    """Benchmark the texture-based linear-combination kernel.

    For every power-of-two vector length from ``2**10`` through ``2**25``
    the kernel obtained from ``get_linear_combination_kernel`` is launched
    twenty times on ``x`` and ``y`` with output ``z``.  The two scalars
    (5 and 7) live in one-element GPU arrays that are bound to the kernel's
    texture references before each launch.  One line is printed per length:
    the length, its exponent and ``stop.time_since(start)`` for the twenty
    launches.

    :arg dtype: numpy dtype used for the scalars, both input vectors and
        the result.
    """
    from pycuda.elementwise import get_linear_combination_kernel
    # Imported once up front; it does not change between sizes.
    from pycuda.curandom import rand

    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype),
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        # Texture binding and block-shape setup are deliberately left inside
        # the timed loop so the measurement still covers the whole per-call
        # cost, exactly as before.
        for _ in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.set_block_shape(*x._block)
            lc_kernel.prepared_call(x._grid,
                    x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        # Single-argument form: same output as the old print statement on
        # Python 2 and also valid syntax on Python 3.
        print("%s %s %s" % (size, size_exp, stop.time_since(start)))
43
44
45
@context_dependent_memoize
def get_lin_comb_kernel_no_tex(summand_descriptors,
        dtype_z):
    """Build a prepared linear-combination kernel that uses no textures.

    The generated kernel evaluates ``z[i] = a0*x0[i] + a1*x1[i] + ...``.
    A scalar that lives on the GPU is passed as a plain pointer argument
    ``global_a<k>`` and dereferenced once into a local ``a<k>`` in the
    ``loop_prep`` code handed to ``get_elwise_module``.

    :arg summand_descriptors: iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` triples, one per
        summand.  When *is_gpu_scalar* is true the scalar is a pointer
        argument, otherwise it is passed by value.
        NOTE(review): the function is memoized, so this presumably has to
        be hashable (e.g. a tuple of tuples) -- confirm against
        ``context_dependent_memoize``.
    :arg dtype_z: dtype of the output vector ``z``.
    :returns: the ``linear_combination`` function, already prepared.  Its
        arguments are, per summand, the scalar (pointer or value) followed
        by the vector, then ``z`` and finally the element count ``n``.
    """
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    loop_prep = []
    summands = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            # The pointer has to be typed after the scalar it points to:
            # it is dereferenced into a variable of the scalar's C type
            # below, so declaring it with the vector dtype would read a
            # wrong value whenever the two dtypes differ.
            args.append(VectorArg(scalar_dtype, "global_a%d" % i))
            loop_prep.append("%s a%d = *global_a%d"
                    % (dtype_to_ctype(scalar_dtype), i, i))
        else:
            args.append(ScalarArg(scalar_dtype, "a%d" % i))

        # The vector operand follows its scalar in both cases.
        args.append(VectorArg(vector_dtype, "x%d" % i))
        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(numpy.uintp, "n"))

    mod = get_elwise_module(args,
            "z[i] = " + " + ".join(summands),
            "linear_combination",
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    # Placeholder block shape; callers set the real one before launching.
    func.prepare("".join(arg.struct_char for arg in args),
            (1, 1, 1))

    return func
83
84
85
def main_no_tex(dtype):
    """Benchmark the texture-free linear-combination kernel.

    For every power-of-two vector length from ``2**10`` through ``2**25``
    the kernel from ``get_lin_comb_kernel_no_tex`` is launched twenty times.
    The two scalars (5 and 7) live in one-element GPU arrays and are passed
    to the kernel as pointers, alongside the vectors ``x`` and ``y`` and
    the output ``z``.  One line is printed per length: the length, its
    exponent and ``stop.time_since(start)`` for the twenty launches.

    :arg dtype: numpy dtype used for the scalars, both input vectors and
        the result.
    """
    # Imported once up front; it does not change between sizes.
    from pycuda.curandom import rand

    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype),
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        # Block-shape setup is deliberately left inside the timed loop so
        # the measurement still covers the whole per-call cost.
        for _ in range(20):
            lc_kernel.set_block_shape(*x._block)
            lc_kernel.prepared_call(x._grid,
                    a.gpudata, x.gpudata,
                    b.gpudata, y.gpudata,
                    z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        # Single-argument form: same output as the old print statement on
        # Python 2 and also valid syntax on Python 3.
        print("%s %s %s" % (size, size_exp, stop.time_since(start)))
118
119
120
121
# Script entry point: run both benchmarks in single precision, separated by
# a blank line so the two result tables can be told apart.
if __name__ == "__main__":
    dtype = numpy.float32

    main(dtype)
    # print("") emits an empty line on Python 2 and 3 alike; a bare `print`
    # only does so on Python 2.
    print("")
    main_no_tex(dtype)