1 import numpy
2 import pycuda.autoinit
3 import pycuda.driver as drv
4 import pycuda.gpuarray as gpuarray
5 from pycuda.tools import context_dependent_memoize
6
7
8
9
10 def main(dtype):
11 from pycuda.elementwise import get_linear_combination_kernel
12 lc_kernel, lc_texrefs = get_linear_combination_kernel((
13 (True, dtype, dtype),
14 (True, dtype, dtype)
15 ), dtype)
16
17 for size_exp in range(10, 26):
18 size = 1 << size_exp
19
20 from pycuda.curandom import rand
21 a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
22 x = rand(size, dtype=dtype)
23 b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
24 y = rand(size, dtype=dtype)
25
26 z = gpuarray.empty_like(x)
27
28 start = drv.Event()
29 stop = drv.Event()
30 start.record()
31
32 for i in range(20):
33 a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
34 b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
35 lc_kernel.prepared_call(x._grid, x._block,
36 x.gpudata, y.gpudata, z.gpudata, x.mem_size)
37
38 stop.record()
39 stop.synchronize()
40
41 print size, size_exp, stop.time_since(start)
42
43
44
@context_dependent_memoize
def get_lin_comb_kernel_no_tex(summand_descriptors,
        dtype_z):
    """Build and prepare an elementwise linear-combination kernel that
    takes all its scalar coefficients as plain kernel arguments (no
    textures).

    *summand_descriptors* is a sequence of tuples
    ``(is_gpu_scalar, scalar_dtype, vector_dtype)``, one per summand;
    GPU-resident scalars are passed as single-element device pointers
    and dereferenced in the loop preamble.  Returns the prepared
    :class:`pycuda.driver.Function`.  Memoized per CUDA context.
    """
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    arg_list = []
    preamble_lines = []
    terms = []

    for idx, descr in enumerate(summand_descriptors):
        on_gpu, scal_dtype, vec_dtype = descr
        if on_gpu:
            # Pointer to a one-element device array; fetch it into a
            # local "a<idx>" before the elementwise loop runs.
            arg_list.append(VectorArg(vec_dtype, "global_a%d" % idx))
            arg_list.append(VectorArg(vec_dtype, "x%d" % idx))
            preamble_lines.append(
                    "%s a%d = *global_a%d"
                    % (dtype_to_ctype(scal_dtype), idx, idx))
        else:
            arg_list.append(ScalarArg(scal_dtype, "a%d" % idx))
            arg_list.append(VectorArg(vec_dtype, "x%d" % idx))

        terms.append("a%d*x%d[i]" % (idx, idx))

    arg_list.append(VectorArg(dtype_z, "z"))
    arg_list.append(ScalarArg(numpy.uintp, "n"))

    module = get_elwise_module(
            arg_list,
            "z[i] = " + " + ".join(terms),
            "linear_combination",
            loop_prep=";\n".join(preamble_lines))

    kernel = module.get_function("linear_combination")
    kernel.prepare("".join(a.struct_char for a in arg_list))

    return kernel
81
82
83
84 def main_no_tex(dtype):
85 lc_kernel = get_lin_comb_kernel_no_tex((
86 (True, dtype, dtype),
87 (True, dtype, dtype)
88 ), dtype)
89
90 for size_exp in range(10,26):
91 size = 1 << size_exp
92
93 from pycuda.curandom import rand
94 a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
95 x = rand(size, dtype=dtype)
96 b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
97 y = rand(size, dtype=dtype)
98
99 z = gpuarray.empty_like(x)
100
101 start = drv.Event()
102 stop = drv.Event()
103 start.record()
104
105 for i in range(20):
106 lc_kernel.prepared_call(x._grid, x._block,
107 a.gpudata, x.gpudata,
108 b.gpudata, y.gpudata,
109 z.gpudata, x.mem_size)
110
111 stop.record()
112 stop.synchronize()
113
114 print size, size_exp, stop.time_since(start)
115
116
117
118
if __name__ == "__main__":
    # Run both benchmark variants with the same dtype so their printed
    # timings are directly comparable.
    dtype = numpy.float32

    main(dtype)          # texture-bound coefficients
    print                # blank separator line between the two tables
    main_no_tex(dtype)   # coefficients as plain kernel arguments