2D FFT using PyFFT, PyCUDA and Multiprocessing
This code computes the fast Fourier transform of 2D data of any size. It uses the transpose-split (TS) method to handle images larger than a single GPU allocation and to take advantage of multiprocessing. The number of parts the input image is split into is chosen by the user, based on the available GPU memory and the number of CPU cores.
jackin@opt.utsunomiya-u.ac.jp

[[!table header="no" class="mointable" data="""
License of this example: | Public Domain
Date: | 16-08-2010
PyCUDA version: | 0.94rc
"""]]
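The idea behind the transpose-split method: a 2D FFT factors into a batch of row-wise 1D FFTs, a transpose, a second batch of row-wise FFTs, and a final transpose, so each pass only ever needs one row-strip of the image in GPU memory at a time. Here is a minimal pure-NumPy sketch of that idea (no GPU; the function name and the 16 * 16 size are just illustrative):

#!python
import numpy

def fft2_transpose_split(a, m):
    # 2D FFT VIA TRANSPOSE-SPLIT: ROW FFTs, TRANSPOSE, AND REPEAT
    p = a.shape[0]
    for t in range(2):                 # TWO PASSES, ONE PER AXIS
        b = numpy.empty_like(a)
        for i in range(m):             # FFT THE ROWS, ONE STRIP AT A TIME
            b[i*p//m:(i+1)*p//m, :] = numpy.fft.fft(a[i*p//m:(i+1)*p//m, :], axis=1)
        a = b.transpose()              # THE NEXT PASS THEN WORKS ON THE COLUMNS
    return a

a = numpy.random.rand(16, 16).astype(numpy.complex64)
print numpy.allclose(fft2_transpose_split(a, 4), numpy.fft.fft2(a), atol=1e-3)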
#!python
import numpy
import scipy.misc
import numpy.fft as nfft
import multiprocessing
from pyfft.cuda import Plan
from pycuda.tools import make_default_context
import pycuda.tools as pytools
import pycuda.gpuarray as garray
import pycuda.driver as drv
class GPUMulti(multiprocessing.Process):
    def __init__(self, number, input_cpu, output_cpu):
        multiprocessing.Process.__init__(self)
        self.number = number
        self.input_cpu = input_cpu
        self.output_cpu = output_cpu

    def run(self):
        # EACH WORKER MAKES ITS OWN CONTEXT ON ITS OWN DEVICE; p AND m ARE
        # GLOBALS INHERITED FROM THE PARENT PROCESS AT fork() TIME
        drv.init()
        self.dev = drv.Device(self.number)
        self.ctx = self.dev.make_context()
        # TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE THE NEXT ALLOCATION
        # (THIS DOES NOT HAPPEN IN MULTITHREADING)
        print drv.mem_get_info()
        self.gpu_a = garray.to_gpu(self.input_cpu)       # ONE STRIP, FLATTENED TO 1D
        self.gpu_b = garray.zeros_like(self.gpu_a)       # OUTPUT BUFFER
        plan = Plan((p,), context=self.ctx)              # 1D FFT PLAN OF LENGTH p
        plan.execute(self.gpu_a, self.gpu_b, batch=p/m)  # p/m ROW FFTs IN ONE BATCH
        self.temp = self.gpu_b.get()
        # TAG THE RESULT WITH THE DEVICE NUMBER: QUEUE ITEMS ARRIVE IN
        # COMPLETION ORDER, NOT IN START ORDER
        self.output_cpu.put((self.number, self.temp))
        self.output_cpu.close()
        self.ctx.pop()
        del self.gpu_a
        del self.gpu_b
        del self.ctx

        print "till the end %d" % self.number
p = 8192  # INPUT IMAGE SIZE (8192 * 8192)
m = 4     # DIVIDE THE INPUT IMAGE INTO 4 STRIPS OF (2048 * 8192) EACH (DEPENDS ON THE TOTAL MEMORY OF YOUR GPU)
trans = 2 # THE TRANSPOSE-SPLIT (TS) ALGORITHM LOOPS 2 TIMES, ONCE PER AXIS
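# MEMORY CHECK (WHY m MATTERS): ONE STRIP IS (p/m) * p COMPLEX64 VALUES,
# I.E. 2048 * 8192 * 8 BYTES = 128 MB. WITH INPUT AND OUTPUT BUFFERS
# RESIDENT AT ONCE, EACH WORKER NEEDS ROUGHLY 256 MB OF GPU MEMORY PLUS
# THE FFT PLAN'S OWN STORAGE; INCREASE m IF YOUR GPU HAS LESS THAN THAT.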
# INPUT IMAGE (GENERATE A 2D SINE WAVE PATTERN)
p_n = 8000  # No. OF PERIODS OF SINE WAVES
x = numpy.arange(0, p_n, float(p_n)/float(p))
a_i = 128 + 128 * numpy.sin(2*numpy.pi*x)
a2 = numpy.zeros([p, p], dtype=numpy.complex64)
a2[::] = a_i  # EVERY ROW GETS THE SAME SINE WAVE
scipy.misc.imsave("sine.bmp", numpy.absolute(a2))  # TEST THE GENERATION OF THE INPUT IMAGE
# INITIALISE THE VARIABLES
a2_1 = numpy.zeros([m, p*p/m], dtype=numpy.complex64)  # INPUT TO THE GPU (ONE 1D ARRAY PER STRIP)

# VERY IMPORTANT
output_cpu = multiprocessing.Queue()  # STORES THE GPU RESULTS (MULTIPROCESSING DOES NOT ALLOW SHARED MEMORY, SO A QUEUE IS NEEDED TO COMMUNICATE THE DATA)

b2pa = numpy.zeros([p/m, p, m], dtype=numpy.complex64)  # OUTPUT FROM THE GPU
b2_a = numpy.zeros([p, p], dtype=numpy.complex64)       # REASSEMBLED (8192 * 8192) OUTPUT

# NOW WE ARE READY TO KICK-START THE GPU

# THE NUMBER OF GPUs PRESENT (CHANGE ACCORDING TO THE NUMBER OF GPUs YOU HAVE)
num = 2  # I KNOW THIS IS BAD PRACTICE, BUT I COULDN'T FIND ANY OTHER WAY (__init__ CANNOT BE USED HERE)
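# HOW THE LOOP BELOW WORKS (TWO PASSES):
#   1. CUT THE p x p IMAGE INTO m ROW-STRIPS OF (p/m) x p EACH
#   2. RUN BATCHED 1D FFTs OF LENGTH p OVER THE STRIPS, num STRIPS AT A
#      TIME (ONE WORKER PROCESS PER GPU)
#   3. REASSEMBLE THE STRIPS AND TRANSPOSE, SO THE SECOND PASS TRANSFORMS
#      ALONG THE OTHER AXIS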
# THE TRANSPOSE-SPLIT ALGORITHM FOR THE FFT
for t in range(0, trans):
    # DIVIDE AND RESHAPE THE INPUT IMAGE INTO m 1D ARRAYS
    for i in range(m):
        a2_1[i, :] = a2[i*p/m:(i+1)*p/m, :].flatten()

    for j in range(m/num):
        gpu_multi_list = []

        # CREATE AND START THE MULTIPROCESSES
        for i in range(num):
            gpu_multi = GPUMulti(i, a2_1[i+j*num, :], output_cpu)  # FEED THE DATA INTO THE GPU
            gpu_multi_list.append(gpu_multi)
            gpu_multi.start()  # THERE YOU GO

        # COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESSES AND RESHAPE;
        # THE DEVICE-NUMBER TAG TELLS US WHICH STRIP EACH RESULT BELONGS TO
        for gpu_pro in gpu_multi_list:
            number, temp_b2_1 = output_cpu.get()
            b2pa[:, :, number+j*num] = numpy.reshape(temp_b2_1, (p/m, p))
        for gpu_pro in gpu_multi_list:
            gpu_pro.terminate()

    # REASSEMBLE THE STRIPS INTO A (8192 * 8192) IMAGE
    for i in range(m):
        b2_a[i*p/m:(i+1)*p/m, :] = b2pa[:, :, i]

    # TRANSPOSE, SO THAT THE SECOND PASS TRANSFORMS ALONG THE OTHER AXIS
    a2 = numpy.transpose(b2_a)

# AFTER trans PASSES, a2 HOLDS THE FULL 2D FFT OF THE INPUT IMAGE
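To sanity-check the transform against NumPy, shrink the problem first; the full 8192 * 8192 reference FFT needs several GB of host RAM. A minimal sketch, assuming p, m and num were reduced to fit your machine (e.g. p = 512, m = 2, num = 1) and that a copy of the input was taken before the loop:

#!python
a2_orig = a2.copy()  # TAKE THIS COPY *BEFORE* RUNNING THE TRANSPOSE-SPLIT LOOP

# ... the transpose-split loop above runs here ...

ref = nfft.fft2(a2_orig)  # NUMPY'S DOUBLE-PRECISION REFERENCE
print numpy.abs(a2 - ref).max() / numpy.abs(ref).max()  # SINGLE-PRECISION ROUNDOFF, ROUGHLY 1e-7 TO 1e-4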