SimpleSpeedTest.py
Very simple speed testing code. This shows you how to run a loop over sin() using different methods with a note of the time each method takes.
For the GPU this uses SourceModule, ElementwiseKernel, GPUArray. For the CPU this uses numpy
License of this example: |
GPL |
Date: |
03 March 2010 |
PyCUDA version: |
0.93 |
1 # SimpleSpeedTest.py
2
3 # Very simple speed testing code
4 # Shows you how to run a loop over sin() using different methods
5 # with a note of the time each method takes
6 # For the GPU this uses SourceModule, ElementwiseKernel, GPUArray
7 # For the CPU this uses numpy
8 # Ian@IanOzsvald.com
9
10 # Using a WinXP Intel Core2 Duo 2.66GHz CPU (1 CPU used)
11 # with a 9800GT GPU I get the following timings (smaller is better):
12 #
13 # Using nbr_values == 8192
14 # Calculating 100000 iterations
15 # SourceModule time and first three results:
16 # 0.166590s, [ 0.005477 0.005477 0.005477]
17 # Elementwise time and first three results:
18 # 0.171657s, [ 0.005477 0.005477 0.005477]
19 # Elementwise Python looping time and first three results:
20 # 1.487470s, [ 0.005477 0.005477 0.005477]
21 # GPUArray time and first three results:
22 # 4.740007s, [ 0.005477 0.005477 0.005477]
23 # CPU time and first three results:
24 # 32.933660s, [ 0.005477 0.005477 0.005477]
25
26 import pycuda.driver as drv
27 import pycuda.tools
28 import pycuda.autoinit
29 import numpy
30 from pycuda.compiler import SourceModule
31 import pycuda.gpuarray as gpuarray
32 import pycuda.cumath
33 from pycuda.elementwise import ElementwiseKernel
34
35 blocks = 64
36 block_size = 128
37 nbr_values = blocks * block_size
38
39 print "Using nbr_values ==", nbr_values
40
41 # Number of iterations for the calculations,
42 # 100 is very quick, 2000000 will take a while
43 n_iter = 100000
44 print "Calculating %d iterations" % (n_iter)
45
46 # create two timers so we can speed-test each approach
47 start = drv.Event()
48 end = drv.Event()
49
50 ######################
51 # SourceModele SECTION
52 # We write the C code and the indexing and we have lots of control
53
54 mod = SourceModule("""
55 __global__ void gpusin(float *dest, float *a, int n_iter)
56 {
57 const int i = blockDim.x*blockIdx.x + threadIdx.x;
58 for(int n = 0; n < n_iter; n++) {
59 a[i] = sin(a[i]);
60 }
61 dest[i] = a[i];
62 }
63 """)
64
65 gpusin = mod.get_function("gpusin")
66
67 # create an array of 1s
68 a = numpy.ones(nbr_values).astype(numpy.float32)
69 # create a destination array that will receive the result
70 dest = numpy.zeros_like(a)
71
72 start.record() # start timing
73 gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) )
74 end.record() # end timing
75 # calculate the run length
76 end.synchronize()
77 secs = start.time_till(end)*1e-3
78 print "SourceModule time and first three results:"
79 print "%fs, %s" % (secs, str(dest[:3]))
80
81
82 #####################
83 # Elementwise SECTION
84 # use an ElementwiseKernel with sin in a for loop all in C call from Python
85 kernel = ElementwiseKernel(
86 "float *a, int n_iter",
87 "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
88 "gpusin")
89
90 a = numpy.ones(nbr_values).astype(numpy.float32)
91 a_gpu = gpuarray.to_gpu(a)
92 start.record() # start timing
93 kernel(a_gpu, numpy.int(n_iter))
94 end.record() # end timing
95 # calculate the run length
96 end.synchronize()
97 secs = start.time_till(end)*1e-3
98 print "Elementwise time and first three results:"
99 print "%fs, %s" % (secs, str(a_gpu.get()[:3]))
100
101
102 ####################################
103 # Elementwise Python looping SECTION
104 # as Elementwise but the for loop is in Python, not in C
105 kernel = ElementwiseKernel(
106 "float *a",
107 "a[i] = sin(a[i]);",
108 "gpusin")
109
110 a = numpy.ones(nbr_values).astype(numpy.float32)
111 a_gpu = gpuarray.to_gpu(a)
112 start.record() # start timing
113 for i in range(n_iter):
114 kernel(a_gpu)
115 end.record() # end timing
116 # calculate the run length
117 end.synchronize()
118 secs = start.time_till(end)*1e-3
119 print "Elementwise Python looping time and first three results:"
120 print "%fs, %s" % (secs, str(a_gpu.get()[:3]))
121
122
123 ##################
124 # GPUArray SECTION
125 # The result is copied back to main memory on each iteration, this is a bottleneck
126
127 a = numpy.ones(nbr_values).astype(numpy.float32)
128 a_gpu = gpuarray.to_gpu(a)
129 start.record() # start timing
130 for i in range(n_iter):
131 a_gpu = pycuda.cumath.sin(a_gpu)
132 end.record() # end timing
133 # calculate the run length
134 end.synchronize()
135 secs = start.time_till(end)*1e-3
136 print "GPUArray time and first three results:"
137 print "%fs, %s" % (secs, str(a_gpu.get()[:3]))
138
139
140 #############
141 # CPU SECTION
142 # use numpy the calculate the result on the CPU for reference
143
144 a = numpy.ones(nbr_values).astype(numpy.float32)
145 start.record() # start timing
146
147 for i in range(n_iter):
148 a = numpy.sin(a)
149
150 end.record() # end timing
151 # calculate the run length
152 end.synchronize()
153 secs = start.time_till(end)*1e-3
154 print "CPU time and first three results:"
155 print "%fs, %s" % (secs, str(a[:3]))
156
