|
Size: 4986
Comment:
|
← Revision 8 as of 2012-09-18 11:45:19 ⇥
Size: 5659
Comment: Synchronizing on start event seems to be necessary as of today. See also http://stackoverflow.com/questions/12259010/measuring-time-using-pycuda-driver-event-gives-wrong-results
|
| Deletions are marked like this. | Additions are marked like this. |
| Line 1: | Line 1: |
| = SimpleSpeedTest.py = | SimpleSpeedTest.py |
| Line 8: | Line 8: |
| ||License of this example: ||GPL || ||Date: ||03 March 2010 || ||PyCUDA version: ||0.93 || |
|
| Line 9: | Line 12: |
| || License of this example: || GPL || || Date: || 03 March 2010 || || PyCUDA version: || 0.93 || |
|
| Line 38: | Line 40: |
| # # # Using Win 7 x64, GTX 470 GPU, X5650 Xeon, # Driver v301.42, CUDA 4.2, Python 2.7 x64, # PyCuda 2012.1 gave the following results: # # Using nbr_values == 8192 # Calculating 100000 iterations # SourceModule time and first three results: # 0.058321s, [ 0.005477 0.005477 0.005477] # Elementwise time and first three results: # 0.102110s, [ 0.005477 0.005477 0.005477] # Elementwise Python looping time and first three results: # 2.428810s, [ 0.005477 0.005477 0.005477] # GPUArray time and first three results: # 8.421861s, [ 0.005477 0.005477 0.005477] # CPU time measured using : # 5.905661s, [ 0.005477 0.005477 0.005477] |
|
| Line 45: | Line 66: |
| import pycuda.cumath | import pycuda.cumath |
| Line 54: | Line 75: |
| # Number of iterations for the calculations, | # Number of iterations for the calculations, |
| Line 159: | Line 180: |
| start.synchronize() |
Very simple speed testing code. This shows you how to run a loop over sin() using different methods with a note of the time each method takes.
For the GPU this uses SourceModule, ElementwiseKernel, and GPUArray. For the CPU this uses numpy.
License of this example: |
GPL |
Date: |
03 March 2010 |
PyCUDA version: |
0.93 |
# SimpleSpeedTest.py

# Very simple speed testing code
# Shows you how to run a loop over sin() using different methods
# with a note of the time each method takes
# For the GPU this uses SourceModule, ElementwiseKernel, GPUArray
# For the CPU this uses numpy
# Ian@IanOzsvald.com

# Using a WinXP Intel Core2 Duo 2.66GHz CPU (1 CPU used)
# with a 9800GT GPU I get the following timings (smaller is better):
#
# Using nbr_values == 8192
# Calculating 100000 iterations
# SourceModule time and first three results:
# 0.166590s, [ 0.005477 0.005477 0.005477]
# Elementwise time and first three results:
# 0.171657s, [ 0.005477 0.005477 0.005477]
# Elementwise Python looping time and first three results:
# 1.487470s, [ 0.005477 0.005477 0.005477]
# GPUArray time and first three results:
# 4.740007s, [ 0.005477 0.005477 0.005477]
# CPU time and first three results:
# 32.933660s, [ 0.005477 0.005477 0.005477]
#
#
# Using Win 7 x64, GTX 470 GPU, X5650 Xeon,
# Driver v301.42, CUDA 4.2, Python 2.7 x64,
# PyCuda 2012.1 gave the following results:
#
# Using nbr_values == 8192
# Calculating 100000 iterations
# SourceModule time and first three results:
# 0.058321s, [ 0.005477 0.005477 0.005477]
# Elementwise time and first three results:
# 0.102110s, [ 0.005477 0.005477 0.005477]
# Elementwise Python looping time and first three results:
# 2.428810s, [ 0.005477 0.005477 0.005477]
# GPUArray time and first three results:
# 8.421861s, [ 0.005477 0.005477 0.005477]
# CPU time and first three results:
# 5.905661s, [ 0.005477 0.005477 0.005477]


import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import pycuda.cumath
from pycuda.elementwise import ElementwiseKernel


def _elapsed_seconds(start_event, end_event):
    """Return the time in seconds between two recorded CUDA events.

    Waits for end_event to complete, then scales the value reported by
    start_event.time_till(end_event) (milliseconds) down to seconds.
    """
    end_event.synchronize()
    return start_event.time_till(end_event) * 1e-3


blocks = 64
block_size = 128
nbr_values = blocks * block_size

# Single-argument print(...) calls produce the same output on Python 2
# (where print is a statement) and on Python 3.
print("Using nbr_values == %d" % nbr_values)

# Number of iterations for the calculations,
# 100 is very quick, 2000000 will take a while
n_iter = 100000
print("Calculating %d iterations" % n_iter)

# create two timers so we can speed-test each approach
start = drv.Event()
end = drv.Event()

######################
# SourceModule SECTION
# We write the C code and the indexing and we have lots of control

mod = SourceModule("""
__global__ void gpusin(float *dest, float *a, int n_iter)
{
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    for(int n = 0; n < n_iter; n++) {
        a[i] = sin(a[i]);
    }
    dest[i] = a[i];
}
""")

gpusin = mod.get_function("gpusin")

# create an array of 1s
a = numpy.ones(nbr_values).astype(numpy.float32)
# create a destination array that will receive the result
dest = numpy.zeros_like(a)

start.record()  # start timing
gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter),
       grid=(blocks, 1), block=(block_size, 1, 1))
end.record()  # end timing
# calculate the run length
secs = _elapsed_seconds(start, end)
print("SourceModule time and first three results:")
print("%fs, %s" % (secs, str(dest[:3])))


#####################
# Elementwise SECTION
# use an ElementwiseKernel with sin in a for loop all in C call from Python
kernel = ElementwiseKernel(
    "float *a, int n_iter",
    "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
    "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record()  # start timing
# n_iter is declared as a C int in the kernel signature, so pass an explicit
# 32-bit integer. numpy.int was only an alias for the Python int and has been
# removed from recent NumPy releases.
kernel(a_gpu, numpy.int32(n_iter))
end.record()  # end timing
# calculate the run length
secs = _elapsed_seconds(start, end)
print("Elementwise time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))


####################################
# Elementwise Python looping SECTION
# as Elementwise but the for loop is in Python, not in C
kernel = ElementwiseKernel(
    "float *a",
    "a[i] = sin(a[i]);",
    "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record()  # start timing
for _ in range(n_iter):
    kernel(a_gpu)
end.record()  # end timing
# calculate the run length
secs = _elapsed_seconds(start, end)
print("Elementwise Python looping time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))


##################
# GPUArray SECTION
# One pycuda.cumath.sin() call per iteration, driven from a Python loop.
# NOTE(review): the original note here said the result is copied back to main
# memory on each iteration and that this is the bottleneck; the only explicit
# host copy in this section is the final a_gpu.get() -- confirm where the
# overhead really comes from.

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record()  # start timing
for _ in range(n_iter):
    a_gpu = pycuda.cumath.sin(a_gpu)
end.record()  # end timing
# calculate the run length
secs = _elapsed_seconds(start, end)
print("GPUArray time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))


#############
# CPU SECTION
# use numpy to calculate the result on the CPU for reference

a = numpy.ones(nbr_values).astype(numpy.float32)
start.record()  # start timing
# Wait for the start event before running the host-side loop; without this
# the event-based timing of CPU work was reported to give wrong results, see
# http://stackoverflow.com/questions/12259010/measuring-time-using-pycuda-driver-event-gives-wrong-results
start.synchronize()

for _ in range(n_iter):
    a = numpy.sin(a)

end.record()  # end timing
# calculate the run length
secs = _elapsed_seconds(start, end)
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(a[:3])))
