1 import numpy
   2 import pycuda.autoinit
   3 import pycuda.driver as drv
   4 import pycuda.gpuarray as gpuarray
   5 from pycuda.tools import context_dependent_memoize
   6 
   7 
   8 
   9 
  10 def main(dtype):
  11     from pycuda.elementwise import get_linear_combination_kernel
  12     lc_kernel, lc_texrefs = get_linear_combination_kernel((
  13         (True, dtype, dtype),
  14         (True, dtype, dtype)
  15         ), dtype)
  16 
  17     for size_exp in range(10, 26):
  18         size = 1 << size_exp
  19 
  20         from pycuda.curandom import rand
  21         a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
  22         x = rand(size, dtype=dtype)
  23         b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
  24         y = rand(size, dtype=dtype)
  25 
  26         z = gpuarray.empty_like(x)
  27 
  28         start = drv.Event()
  29         stop = drv.Event()
  30         start.record()
  31 
  32         for i in range(20):
  33             a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
  34             b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
  35             lc_kernel.set_block_shape(*x._block)
  36             lc_kernel.prepared_call(x._grid, 
  37                 x.gpudata, y.gpudata, z.gpudata, x.mem_size)
  38 
  39         stop.record()
  40         stop.synchronize()
  41 
  42         print size, size_exp, stop.time_since(start)
  43 
  44 
  45 
  46 @context_dependent_memoize
  47 def get_lin_comb_kernel_no_tex(summand_descriptors,
  48         dtype_z):
  49     from pycuda.tools import dtype_to_ctype
  50     from pycuda.elementwise import \
  51             VectorArg, ScalarArg, get_elwise_module
  52 
  53     args = []
  54     loop_prep = []
  55     summands = []
  56 
  57     for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
  58             enumerate(summand_descriptors):
  59         if is_gpu_scalar:
  60             args.append(VectorArg(vector_dtype, "global_a%d" % i))
  61             args.append(VectorArg(vector_dtype, "x%d" % i))
  62             loop_prep.append("%s a%d = *global_a%d" 
  63                     % (dtype_to_ctype(scalar_dtype), i, i))
  64         else:
  65             args.append(ScalarArg(scalar_dtype, "a%d" % i))
  66             args.append(VectorArg(vector_dtype, "x%d" % i))
  67 
  68         summands.append("a%d*x%d[i]" % (i, i))
  69 
  70     args.append(VectorArg(dtype_z, "z"))
  71     args.append(ScalarArg(numpy.uintp, "n"))
  72 
  73     mod = get_elwise_module(args, 
  74             "z[i] = " + " + ".join(summands), 
  75             "linear_combination", 
  76             loop_prep=";\n".join(loop_prep))
  77 
  78     func = mod.get_function("linear_combination")
  79     func.prepare("".join(arg.struct_char for arg in args), 
  80             (1,1,1))
  81 
  82     return func
  83 
  84 
  85 
  86 def main_no_tex(dtype):
  87     lc_kernel = get_lin_comb_kernel_no_tex((
  88         (True, dtype, dtype),
  89         (True, dtype, dtype)
  90         ), dtype)
  91 
  92     for size_exp in range(10,26):
  93         size = 1 << size_exp
  94 
  95         from pycuda.curandom import rand
  96         a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
  97         x = rand(size, dtype=dtype)
  98         b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
  99         y = rand(size, dtype=dtype)
 100 
 101         z = gpuarray.empty_like(x)
 102 
 103         start = drv.Event()
 104         stop = drv.Event()
 105         start.record()
 106 
 107         for i in range(20):
 108             lc_kernel.set_block_shape(*x._block)
 109             lc_kernel.prepared_call(x._grid, 
 110                 a.gpudata, x.gpudata, 
 111                 b.gpudata, y.gpudata, 
 112                 z.gpudata, x.mem_size)
 113 
 114         stop.record()
 115         stop.synchronize()
 116 
 117         print size, size_exp, stop.time_since(start)
 118 
 119 
 120 
 121 
if __name__ == "__main__":
    # Run both benchmarks back to back on float32 data, separated by a
    # blank line, so the two timing tables can be compared directly.
    dtype = numpy.float32

    main(dtype)
    print
    main_no_tex(dtype)