1 import pycuda.driver as drv
   2 import pycuda.tools
   3 import pycuda.autoinit
   4 import numpy
   5 import numpy.linalg as la
   6 from pycuda.compiler import SourceModule
   7 
   8 mod = SourceModule("""
   9 __global__ void multiply_them(float *dest, float *a, float *b)
  10 {
  11   const int i = threadIdx.x;
  12   dest[i] = a[i] * b[i];
  13 }
  14 """)
  15 
  16 multiply_them = mod.get_function("multiply_them")
  17 
  18 a = numpy.random.randn(400).astype(numpy.float32)
  19 b = numpy.random.randn(400).astype(numpy.float32)
  20 
  21 dest = numpy.zeros_like(a)
  22 multiply_them(
  23         drv.Out(dest), drv.In(a), drv.In(b),
  24         block=(400,1,1))
  25 
  26 print dest-a*b