2D FFT using PyFFT, PyCUDA and Multiprocessing
This code does the fast Fourier transform on 2d data of any size. It used the transpose split method to achieve larger sizes and to use multiprocessing. The no of parts the input image is to be split, is decided by the user based on the available GPU memory and CPU processing cores.
License of this example: |
Public Domain |
Date: |
16-08-2010 |
PyCUDA version: |
0.94rc |
1 import numpy
2 import scipy.misc
3 import numpy.fft as nfft
4 import multiprocessing
5
6 from pyfft.cuda import Plan
7 from pycuda.tools import make_default_context
8 import pycuda.tools as pytools
9 import pycuda.gpuarray as garray
10 import pycuda.driver as drv
11
12
13 class GPUMulti(multiprocessing.Process):
14 def __init__(self, number, input_cpu, output_cpu):
15 multiprocessing.Process.__init__(self)
16 self.number = number
17 self.input_cpu = input_cpu
18 self.output_cpu = output_cpu
19
20 def run(self):
21 drv.init()
22 a0=numpy.zeros((p,),dtype=numpy.complex64)
23 self.dev = drv.Device(self.number)
24 self.ctx = self.dev.make_context()
25 #TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING)
26 print drv.mem_get_info()
27 self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
28 self.gpu_b = garray.zeros_like(self.gpu_a)
29 self.gpu_a = garray.to_gpu(self.input_cpu)
30 plan = Plan(a0.shape,context=self.ctx)
31 plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
32 self.temp = self.gpu_b.get()
33 self.output_cpu.put(self.temp)
34 self.output_cpu.close()
35 self.ctx.pop()
36 del self.gpu_a
37 del self.gpu_b
38 del self.ctx
39
40 print "till the end %d" %self.number
41
42
43 p = 8192; # INPUT IMAGE SIZE (8192 * 8192)
44 m = 4 # TO DIVIDE THE INPUT IMAGE INTO 4* (2048 * 8192) SIZED IMAGES (Depends on the total memory of your GPU)
45 trans = 2 # FOR TRANSPOSE-SPLIT (TS) ALGORITHM WHICH loops 2 times
46
47
48 #INPUT IMAGE (GENERATE A 2d SINE WAVE PATTERN)
49 p_n = 8000 # No. OF PERIODS OF SINE WAVES
50 x=numpy.arange(0,p_n,float(p_n)/float(p))
51 a_i = 128 + 128 * numpy.sin(2*numpy.pi*x)
52 a2 = numpy.zeros([p,p],dtype=numpy.complex64)
53 a2[::]=a_i
54 scipy.misc.imsave("sine.bmp",numpy.absolute(a2)) #TEST THE GENERATION OF INPUT IMAGE
55
56 #INITIALISE THE VARIABLES
57 a2_1 = numpy.zeros([m,p*p/m],dtype = numpy.complex64) #INPUT TO THE GPU (1d ARRAY)
58 #VERY IMPORTANT
59 output_cpu = multiprocessing.Queue() #STORE RESULT IN GPU (MULTIPROCESSING DOES NOT ALLOW SHARING AND HENCE THIS IS NEEDED FOR COMMUNICATION OF DATA)
60
61 b2pa = numpy.zeros([p/m,p,m],dtype = numpy.complex64) #OUTPUT FROM GPU
62 b2_a = numpy.zeros([p,p],dtype = numpy.complex64) #RESHAPED (8192*8192) OUTPUT
63
64 #NOW WE ARE READY TO KICK START THE GPU
65
66 # THE NO OF GPU'S PRESENT (CHANGE ACCORDING TO THE No.OF GPUS YOU HAVE)
67 num = 2 # I KNOW THIS IS A BAD PRACTISE, BUT I COUNDN'T FIND ANY OTHER WAY(INIT CANNOT BE USED HERE)
68
69 #THE TRANSPOSE-SPLIT ALGORITHM FOR FFT
70 for t in range (0,trans):
71 for i in range (m):
72 a2_1[i,:] = a2[i*p/m:(i+1)*p/m,:].flatten()#DIVIDE AND RESHAPE THE INPUT IMAGE INTO 1D ARRAY
73
74 for j in range (m/num):
75 gpu_multi_list = []
76
77 #CREATE AND START THE MULTIPROCESS
78 for i in range (num):
79 gpu_multi = GPUMulti(i,a2_1[i+j*num,:],output_cpu) #FEED THE DATA INTO THE GPU
80 gpu_multi_list.append(gpu_multi)
81 gpu_multi.start()#THERE YOU GO
82
83 #COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESS AND RESHAPE
84 for gpu_pro in gpu_multi_list:
85 temp_b2_1 = output_cpu.get(gpu_pro)
86 b2pa[:,:,gpu_pro.number+j*num] = numpy.reshape(temp_b2_1,(p/m,p))
87 gpu_multi.terminate()
88
89 #RESHAPE AGAIN TO (8192 * 8192) IMAGE
90 for i in range(m):
91 b2_a[i*p/m:(i+1)*p/m,:] = b2pa[:,:,i]
