import numpy
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.tools import context_dependent_memoize


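# Benchmark the texture-reference path: get_linear_combination_kernel builds
# a kernel computing z = a*x + b*y where the scalars a and b live on the GPU
# and are read through texture references. Each (True, dtype, dtype) entry
# describes one summand as (is_gpu_scalar, scalar_dtype, vector_dtype).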
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    from pycuda.curandom import rand

    for size_exp in range(10, 26):
        size = 1 << size_exp

        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for _ in range(20):
            # Bind the GPU-resident scalars to the kernel's texture
            # references; allow_double_hack permits binding double-precision
            # data, which textures do not support natively.
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))  # total ms for 20 launches


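# The same linear combination without textures: assemble an elementwise
# kernel by hand, passing each GPU-resident scalar as a plain pointer
# argument that every thread dereferences once in the loop preamble.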
@context_dependent_memoize
def get_lin_comb_kernel_no_tex(summand_descriptors,
        dtype_z):
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    loop_prep = []
    summands = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            # Pass the scalar as a pointer (typed by scalar_dtype, since it
            # points at a single scalar) and load it in the loop preamble.
            args.append(VectorArg(scalar_dtype, "global_a%d" % i))
            args.append(VectorArg(vector_dtype, "x%d" % i))
            loop_prep.append("%s a%d = *global_a%d"
                    % (dtype_to_ctype(scalar_dtype), i, i))
        else:
            # Pass the scalar by value as an ordinary kernel argument.
            args.append(ScalarArg(scalar_dtype, "a%d" % i))
            args.append(VectorArg(vector_dtype, "x%d" % i))

        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(numpy.uintp, "n"))

    mod = get_elwise_module(args,
            "z[i] = " + " + ".join(summands),
            "linear_combination",
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    func.prepare("".join(arg.struct_char for arg in args))

    return func


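# Benchmark the pointer-argument path over the same size range. The device
# pointers of a and b go straight into the argument list, so no texture
# binding is needed between launches.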
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    from pycuda.curandom import rand

    for size_exp in range(10, 26):
        size = 1 << size_exp

        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for _ in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                a.gpudata, x.gpudata,
                b.gpudata, y.gpudata,
                z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))  # total ms for 20 launches


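# A minimal correctness check -- an addition to the original benchmark, with
# check_no_tex a name introduced here for illustration: run the
# pointer-argument kernel once on a small problem and compare against the
# same linear combination computed with numpy on the host, e.g. via
# check_no_tex(numpy.float32).
def check_no_tex(dtype, size=1024):
    from pycuda.curandom import rand
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
    x = rand(size, dtype=dtype)
    b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
    y = rand(size, dtype=dtype)
    z = gpuarray.empty_like(x)

    lc_kernel.prepared_call(x._grid, x._block,
        a.gpudata, x.gpudata,
        b.gpudata, y.gpudata,
        z.gpudata, x.mem_size)

    assert numpy.allclose(z.get(), 5*x.get() + 7*y.get())

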
if __name__ == "__main__":
    dtype = numpy.float32

    main(dtype)
    print()
    main_no_tex(dtype)
