SimpleSpeedTest.py

Very simple speed testing code. This shows you how to run a loop over sin() using different methods with a note of the time each method takes.

For the GPU this uses SourceModule, ElementwiseKernel, GPUArray. For the CPU this uses numpy

Ian@IanOzsvald.com

License of this example:

GPL

Date:

03 March 2010

PyCUDA version:

0.93

   1 # SimpleSpeedTest.py
   2 
   3 # Very simple speed testing code
   4 # Shows you how to run a loop over sin() using different methods
   5 # with a note of the time each method takes
   6 # For the GPU this uses SourceModule, ElementwiseKernel, GPUArray
   7 # For the CPU this uses numpy
   8 # Ian@IanOzsvald.com
   9 
  10 # Using a WinXP Intel Core2 Duo 2.66GHz CPU (1 CPU used)
  11 # with a 9800GT GPU I get the following timings (smaller is better):
  12 #
  13 # Using nbr_values == 8192
  14 # Calculating 100000 iterations
  15 # SourceModule time and first three results:
  16 # 0.166590s, [ 0.005477  0.005477  0.005477]
  17 # Elementwise time and first three results:
  18 # 0.171657s, [ 0.005477  0.005477  0.005477]
  19 # Elementwise Python looping time and first three results:
  20 # 1.487470s, [ 0.005477  0.005477  0.005477]
  21 # GPUArray time and first three results:
  22 # 4.740007s, [ 0.005477  0.005477  0.005477]
  23 # CPU time and first three results:
  24 # 32.933660s, [ 0.005477  0.005477  0.005477]
  25 
  26 import pycuda.driver as drv
  27 import pycuda.tools
  28 import pycuda.autoinit
  29 import numpy
  30 from pycuda.compiler import SourceModule
  31 import pycuda.gpuarray as gpuarray
  32 import pycuda.cumath 
  33 from pycuda.elementwise import ElementwiseKernel
  34 
  35 blocks = 64
  36 block_size = 128
  37 nbr_values = blocks * block_size
  38 
  39 print "Using nbr_values ==", nbr_values
  40 
  41 # Number of iterations for the calculations, 
  42 # 100 is very quick, 2000000 will take a while
  43 n_iter = 100000
  44 print "Calculating %d iterations" % (n_iter)
  45 
  46 # create two timers so we can speed-test each approach
  47 start = drv.Event()
  48 end = drv.Event()
  49 
  50 ######################
  51 # SourceModele SECTION
  52 # We write the C code and the indexing and we have lots of control
  53 
  54 mod = SourceModule("""
  55 __global__ void gpusin(float *dest, float *a, int n_iter)
  56 {
  57   const int i = blockDim.x*blockIdx.x + threadIdx.x;
  58   for(int n = 0; n < n_iter; n++) {
  59     a[i] = sin(a[i]);
  60   }
  61   dest[i] = a[i];
  62 }
  63 """)
  64 
  65 gpusin = mod.get_function("gpusin")
  66 
  67 # create an array of 1s
  68 a = numpy.ones(nbr_values).astype(numpy.float32)
  69 # create a destination array that will receive the result
  70 dest = numpy.zeros_like(a)
  71 
  72 start.record() # start timing
  73 gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) )
  74 end.record() # end timing
  75 # calculate the run length
  76 end.synchronize()
  77 secs = start.time_till(end)*1e-3
  78 print "SourceModule time and first three results:"
  79 print "%fs, %s" % (secs, str(dest[:3]))
  80 
  81 
  82 #####################
  83 # Elementwise SECTION
  84 # use an ElementwiseKernel with sin in a for loop all in C call from Python
  85 kernel = ElementwiseKernel(
  86    "float *a, int n_iter",
  87    "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
  88    "gpusin")
  89 
  90 a = numpy.ones(nbr_values).astype(numpy.float32)
  91 a_gpu = gpuarray.to_gpu(a)
  92 start.record() # start timing
  93 kernel(a_gpu, numpy.int(n_iter))
  94 end.record() # end timing
  95 # calculate the run length
  96 end.synchronize()
  97 secs = start.time_till(end)*1e-3
  98 print "Elementwise time and first three results:"
  99 print "%fs, %s" % (secs, str(a_gpu.get()[:3]))
 100 
 101 
 102 ####################################
 103 # Elementwise Python looping SECTION
 104 # as Elementwise but the for loop is in Python, not in C
 105 kernel = ElementwiseKernel(
 106    "float *a",
 107    "a[i] = sin(a[i]);",
 108    "gpusin")
 109 
 110 a = numpy.ones(nbr_values).astype(numpy.float32)
 111 a_gpu = gpuarray.to_gpu(a)
 112 start.record() # start timing
 113 for i in range(n_iter):
 114     kernel(a_gpu)
 115 end.record() # end timing
 116 # calculate the run length
 117 end.synchronize()
 118 secs = start.time_till(end)*1e-3
 119 print "Elementwise Python looping time and first three results:"
 120 print "%fs, %s" % (secs, str(a_gpu.get()[:3]))
 121 
 122 
 123 ##################
 124 # GPUArray SECTION
 125 # The result is copied back to main memory on each iteration, this is a bottleneck
 126 
 127 a = numpy.ones(nbr_values).astype(numpy.float32)
 128 a_gpu = gpuarray.to_gpu(a)
 129 start.record() # start timing
 130 for i in range(n_iter):
 131     a_gpu = pycuda.cumath.sin(a_gpu)
 132 end.record() # end timing
 133 # calculate the run length
 134 end.synchronize()
 135 secs = start.time_till(end)*1e-3
 136 print "GPUArray time and first three results:"
 137 print "%fs, %s" % (secs, str(a_gpu.get()[:3]))
 138 
 139 
 140 #############
 141 # CPU SECTION
 142 # use numpy the calculate the result on the CPU for reference
 143 
 144 a = numpy.ones(nbr_values).astype(numpy.float32)
 145 start.record() # start timing
 146 
 147 for i in range(n_iter):
 148     a = numpy.sin(a)
 149 
 150 end.record() # end timing
 151 # calculate the run length
 152 end.synchronize()
 153 secs = start.time_till(end)*1e-3
 154 print "CPU time and first three results:"
 155 print "%fs, %s" % (secs, str(a[:3]))
 156 

PyCuda/Examples/SimpleSpeedTest (last edited 2010-03-03 14:24:19 by 82)