Mandelbrot using GPU and two CPU variants
Calculate the Mandelbrot set, display it in a Tkinter window. You can choose to use the GPU (ElementwiseKernel) for the calculation or two numpy CPU solutions. Timing results are shown in the code.
License of this example: |
GPL |
Date: |
July 2010 |
PyCUDA version: |
0.94 |
1 # Mandelbrot calculate using GPU, Serial numpy and faster numpy
2 # Use to show the speed difference between CPU and GPU calculations
3 # ian@ianozsvald.com July 2010
4
5 # Based on vegaseat's TKinter/numpy example code from 2006
6 # http://www.daniweb.com/code/snippet216851.html#
7 # with minor changes to move to numpy from the obsolete Numeric
8
9 import sys
10 import numpy as nm
11
12 import Tkinter as tk
13 import Image # PIL
14 import ImageTk # PIL
15
16 import pycuda.driver as drv
17 import pycuda.tools
18 import pycuda.autoinit
19 import numpy
20 from pycuda.compiler import SourceModule
21 import pycuda.gpuarray as gpuarray
22
23 # You can choose a calculation routine below (calculate_z), uncomment
24 # one of the three lines to test the three variations
25 # Speed notes are listed in the same place
26
27 # set width and height of window, more pixels take longer to calculate
28 w = 1000
29 h = 1000
30
31 from pycuda.elementwise import ElementwiseKernel
32 complex_gpu = ElementwiseKernel(
33 "pycuda::complex<float> *z, pycuda::complex<float> *q, int *iteration, int maxiter",
34 "for (int n=0; n < maxiter; n++) {z[i] = (z[i]*z[i])+q[i]; if (abs(z[i]) > 2.0f) {iteration[i]=n; z[i] = pycuda::complex<float>(); q[i] = pycuda::complex<float>();};}",
35 "complex5",
36 preamble="#include <pycuda-complex.hpp>",)
37
38 def calculate_z_gpu(q, maxiter, z):
39 output = nm.resize(nm.array(0,), q.shape)
40 q_gpu = gpuarray.to_gpu(q.astype(nm.complex64))
41 z_gpu = gpuarray.to_gpu(z.astype(nm.complex64))
42 iterations_gpu = gpuarray.to_gpu(output)
43 # the for loop and complex calculations are all done on the GPU
44 # we bring the iterations_gpu array back to determine pixel colours later
45 complex_gpu(z_gpu, q_gpu, iterations_gpu, maxiter)
46
47 iterations = iterations_gpu.get()
48 return iterations
49
50
51 def calculate_z_numpy_gpu(q, maxiter, z):
52 """Calculate z using numpy on the GPU via gpuarray"""
53 outputg = gpuarray.to_gpu(nm.resize(nm.array(0,), q.shape))
54 zg = gpuarray.to_gpu(z.astype(nm.complex64))
55 qg = gpuarray.to_gpu(q.astype(nm.complex64))
56 # 2.0 as an array
57 twosg = gpuarray.to_gpu(nm.array([2.0]*zg.size).astype(numpy.float32))
58 # 0+0j as an array
59 cmplx0sg = gpuarray.to_gpu(nm.array([0+0j]*zg.size).astype(nm.complex64))
60 # for abs_zg > twosg result
61 comparison_result = gpuarray.to_gpu(nm.array([False]*zg.size).astype(nm.bool))
62 # we'll add 1 to iterg after each iteration
63 iterg = gpuarray.to_gpu(nm.array([0]*zg.size).astype(nm.int32))
64
65 for iter in range(maxiter):
66 zg = zg*zg + qg
67
68 # abs returns a complex (rather than a float) from the complex
69 # input where the real component is the absolute value (which
70 # looks like a bug) so I take the .real after abs()
71 abs_zg = abs(zg).real
72
73 comparison_result = abs_zg > twosg
74 qg = gpuarray.if_positive(comparison_result, cmplx0sg, qg)
75 zg = gpuarray.if_positive(comparison_result, cmplx0sg, zg)
76 outputg = gpuarray.if_positive(comparison_result, iterg, outputg)
77 iterg = iterg + 1
78 output = outputg.get()
79 return output
80
81
82 def calculate_z_numpy(q, maxiter, z):
83 # calculate z using numpy, this is the original
84 # routine from vegaseat's URL
85 # NOTE this routine was faster using a default of double-precision complex nbrs
86 # rather than the current single precision
87 output = nm.resize(nm.array(0,), q.shape)
88 for iter in range(maxiter):
89 z = z*z + q
90 done = nm.greater(abs(z), 2.0)
91 q = nm.where(done,0+0j, q)
92 z = nm.where(done,0+0j, z)
93 output = nm.where(done, iter, output)
94 return output
95
96 def calculate_z_serial(q, maxiter, z):
97 # calculate z using pure python with numpy arrays
98 # this routine unrolls calculate_z_numpy as an intermediate
99 # step to the creation of calculate_z_gpu
100 # it runs slower than calculate_z_numpy
101 output = nm.resize(nm.array(0,), q.shape)
102 for i in range(len(q)):
103 if i % 100 == 0:
104 # print out some progress info since it is so slow...
105 print "%0.2f%% complete" % (1.0/len(q) * i * 100)
106 for iter in range(maxiter):
107 z[i] = z[i]*z[i] + q[i]
108 if abs(z[i]) > 2.0:
109 q[i] = 0+0j
110 z[i] = 0+0j
111 output[i] = iter
112 return output
113
114
115 show_instructions = False
116 if len(sys.argv) == 1:
117 show_instructions = True
118
119 if len(sys.argv) > 1:
120 if sys.argv[1] not in ['gpu', 'gpuarray', 'numpy', 'python']:
121 show_instructions = True
122
123 if show_instructions:
124 print "Usage: python mandelbrot.py [gpu|gpuarray|numpy|python]"
125 print "Where:"
126 print " gpu is a pure CUDA solution on the GPU"
127 print " gpuarray uses a numpy-like CUDA wrapper in Python on the GPU"
128 print " numpy is a pure Numpy (C-based) solution on the CPU"
129 print " python is a pure Python solution on the CPU with numpy arrays"
130 sys.exit(0)
131
132 routine = {'gpuarray':calculate_z_numpy_gpu,
133 'gpu':calculate_z_gpu,
134 'numpy':calculate_z_numpy,
135 'python':calculate_z_serial}
136
137 calculate_z = routine[sys.argv[1]]
138 ##if sys.argv[1] == 'python':
139 # import psyco
140 # psyco.full()
141
142 # Using a WinXP Intel Core2 Duo 2.66GHz CPU (1 CPU used)
143 # with a 9800GT GPU I get the following timings (smaller is better).
144 # With 200x200 problem with max iterations set at 300:
145 # calculate_z_gpu: 0.03s
146 # calculate_z_serial: 8.7s
147 # calculate_z_numpy: 0.3s
148 #
149 # Using WinXP Intel 2.9GHz CPU (1 CPU used)
150 # with a GTX 480 GPU I get the following using 1000x1000 plot with 1000 max iterations:
151 # gpu: 0.07s
152 # gpuarray: 3.4s
153 # numpy: 43.4s
154 # python (serial): 1605.6s
155
156 class Mandelbrot(object):
157 def __init__(self):
158 # create window
159 self.root = tk.Tk()
160 self.root.title("Mandelbrot Set")
161 self.create_image()
162 self.create_label()
163 # start event loop
164 self.root.mainloop()
165
166
167 def draw(self, x1, x2, y1, y2, maxiter=300):
168 # draw the Mandelbrot set, from numpy example
169 xx = nm.arange(x1, x2, (x2-x1)/w*2)
170 yy = nm.arange(y2, y1, (y1-y2)/h*2) * 1j
171 # force yy, q and z to use 32 bit floats rather than
172 # the default 64 doubles for nm.complex for consistency with CUDA
173 yy = yy.astype(nm.complex64)
174 q = nm.ravel(xx+yy[:, nm.newaxis]).astype(nm.complex64)
175 z = nm.zeros(q.shape, nm.complex64)
176
177 start_main = drv.Event()
178 end_main = drv.Event()
179 start_main.record()
180 output = calculate_z(q, maxiter, z)
181
182 end_main.record()
183 end_main.synchronize()
184 secs = start_main.time_till(end_main)*1e-3
185 print "Main took", secs
186
187 output = (output + (256*output) + (256**2)*output) * 8
188 # convert output to a string
189 self.mandel = output.tostring()
190
191 def create_image(self):
192 """"
193 create the image from the draw() string
194 """
195 self.im = Image.new("RGB", (w/2, h/2))
196 # you can experiment with these x and y ranges
197 self.draw(-2.13, 0.77, -1.3, 1.3, 1000)
198 self.im.fromstring(self.mandel, "raw", "RGBX", 0, -1)
199
200 def create_label(self):
201 # put the image on a label widget
202 self.image = ImageTk.PhotoImage(self.im)
203 self.label = tk.Label(self.root, image=self.image)
204 self.label.pack()
205
206 # test the class
207 if __name__ == '__main__':
208 test = Mandelbrot()
209
