= SimpleSpeedTest.py =

Very simple speed testing code. This shows you how to run a loop over sin() using different methods, with a note of the time each method takes.

For the GPU this uses SourceModule, ElementwiseKernel and GPUArray. For the CPU this uses numpy.

Ian@IanOzsvald.com

|| License of this example: || GPL ||
|| Date: || 03 March 2010 ||
|| PyCUDA version: || 0.93 ||

# SimpleSpeedTest.py

# Very simple speed testing code
# Shows you how to run a loop over sin() using different methods
# with a note of the time each method takes
# For the GPU this uses SourceModule, ElementwiseKernel, GPUArray
# For the CPU this uses numpy
# Ian@IanOzsvald.com

# Using a WinXP Intel Core2 Duo 2.66GHz CPU (1 CPU used)
# with a 9800GT GPU I get the following timings (smaller is better):
#
# Using nbr_values == 8192
# Calculating 100000 iterations
# SourceModule time and first three results:
# 0.166590s, [ 0.005477  0.005477  0.005477]
# Elementwise time and first three results:
# 0.171657s, [ 0.005477  0.005477  0.005477]
# Elementwise Python looping time and first three results:
# 1.487470s, [ 0.005477  0.005477  0.005477]
# GPUArray time and first three results:
# 4.740007s, [ 0.005477  0.005477  0.005477]
# CPU time and first three results:
# 32.933660s, [ 0.005477  0.005477  0.005477]
#
#
# Using Win 7 x64, GTX 470 GPU, X5650 Xeon,
# Driver v301.42, CUDA 4.2, Python 2.7 x64,
# PyCuda 2012.1 gave the following results:
#
# Using nbr_values == 8192
# Calculating 100000 iterations
# SourceModule time and first three results:
# 0.058321s, [ 0.005477  0.005477  0.005477]
# Elementwise time and first three results:
# 0.102110s, [ 0.005477  0.005477  0.005477]
# Elementwise Python looping time and first three results:
# 2.428810s, [ 0.005477  0.005477  0.005477]
# GPUArray time and first three results:
# 8.421861s, [ 0.005477  0.005477  0.005477]
# CPU time and first three results:
# 5.905661s, [ 0.005477  0.005477  0.005477]


import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import pycuda.cumath
from pycuda.elementwise import ElementwiseKernel

blocks = 64
block_size = 128
nbr_values = blocks * block_size

print "Using nbr_values ==", nbr_values

# Number of iterations for the calculations,
# 100 is very quick, 2000000 will take a while
n_iter = 100000
print "Calculating %d iterations" % (n_iter)

# create two timers so we can speed-test each approach
start = drv.Event()
end = drv.Event()

######################
# SourceModule SECTION
# We write the C code and do the indexing ourselves, so we have lots of control

mod = SourceModule("""
__global__ void gpusin(float *dest, float *a, int n_iter)
{
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
  for(int n = 0; n < n_iter; n++) {
    a[i] = sin(a[i]);
  }
  dest[i] = a[i];
}
""")

gpusin = mod.get_function("gpusin")

# create an array of 1s
a = numpy.ones(nbr_values).astype(numpy.float32)
# create a destination array that will receive the result
dest = numpy.zeros_like(a)

start.record() # start timing
gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) )
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print "SourceModule time and first three results:"
print "%fs, %s" % (secs, str(dest[:3]))


#####################
# Elementwise SECTION
# use an ElementwiseKernel with sin in a for loop, all in C, called from Python
kernel = ElementwiseKernel(
   "float *a, int n_iter",
   "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
   "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
kernel(a_gpu, numpy.int32(n_iter))
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print "Elementwise time and first three results:"
print "%fs, %s" % (secs, str(a_gpu.get()[:3]))


####################################
# Elementwise Python looping SECTION
# as Elementwise but the for loop is in Python, not in C
kernel = ElementwiseKernel(
   "float *a",
   "a[i] = sin(a[i]);",
   "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
for i in range(n_iter):
    kernel(a_gpu)
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print "Elementwise Python looping time and first three results:"
print "%fs, %s" % (secs, str(a_gpu.get()[:3]))


##################
# GPUArray SECTION
# Each iteration allocates a new GPU array for the result and launches a separate kernel; this is a bottleneck

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
for i in range(n_iter):
    a_gpu = pycuda.cumath.sin(a_gpu)
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print "GPUArray time and first three results:"
print "%fs, %s" % (secs, str(a_gpu.get()[:3]))


#############
# CPU SECTION
# use numpy to calculate the result on the CPU for reference

a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
# synchronizing on the start event is needed here to get a correct CPU timing
# (see http://stackoverflow.com/questions/12259010/measuring-time-using-pycuda-driver-event-gives-wrong-results)
start.synchronize()

for i in range(n_iter):
    a = numpy.sin(a)

end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print "CPU time and first three results:"
print "%fs, %s" % (secs, str(a[:3]))
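
The same event-timing pattern also works on newer Python 3 installs of PyCUDA. The following is a minimal sketch of just the Elementwise measurement under that assumption; the array size and iteration count here are arbitrary and the snippet is an illustration of the approach above rather than part of the original example.

# Minimal sketch: time one ElementwiseKernel run with CUDA events (Python 3 syntax).
# Assumes a working PyCUDA install and a CUDA-capable device.
import numpy
import pycuda.autoinit                      # creates a context on the default device
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

nbr_values = 64 * 128                       # arbitrary size for this sketch
n_iter = 1000                               # arbitrary iteration count

kernel = ElementwiseKernel(
    "float *a, int n_iter",
    "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]); }",
    "gpusin")

a_gpu = gpuarray.to_gpu(numpy.ones(nbr_values, dtype=numpy.float32))

start = drv.Event()
end = drv.Event()

start.record()
start.synchronize()                         # make sure the start event is in place before timing
kernel(a_gpu, numpy.int32(n_iter))
end.record()
end.synchronize()                           # wait for the kernel to finish

secs = start.time_till(end) * 1e-3          # time_till() reports milliseconds
print("Elementwise time and first three results:")
print("%fs, %s" % (secs, a_gpu.get()[:3]))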
