2D FFT using PyFFT, PyCUDA and Multiprocessing

This code performs the fast Fourier transform on 2D data of any size. It uses the transpose-split method to handle larger sizes and to exploit multiprocessing. The number of parts into which the input image is split is decided by the user, based on the available GPU memory and CPU processing cores.

jackin@opt.utsunomiya-u.ac.jp

License of this example:

Public Domain

Date:

16-08-2010

PyCUDA version:

0.94rc

   1 import numpy
   2 import scipy.misc
   3 import numpy.fft as nfft
   4 import multiprocessing
   5 
   6 from pyfft.cuda import Plan
   7 from pycuda.tools import make_default_context
   8 import pycuda.tools as pytools
   9 import pycuda.gpuarray as garray
  10 import pycuda.driver as drv
  11 
  12 
  13 class GPUMulti(multiprocessing.Process):
  14     def __init__(self, number, input_cpu, output_cpu):
  15         multiprocessing.Process.__init__(self)
  16         self.number = number
  17         self.input_cpu = input_cpu
  18         self.output_cpu = output_cpu
  19 
  20     def run(self):
  21         drv.init()
  22         a0=numpy.zeros((p,),dtype=numpy.complex64)
  23         self.dev = drv.Device(self.number)
  24         self.ctx = self.dev.make_context()
  25 #TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING)
  26         print drv.mem_get_info() 
  27         self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
  28         self.gpu_b = garray.zeros_like(self.gpu_a)
  29         self.gpu_a = garray.to_gpu(self.input_cpu)
  30         plan = Plan(a0.shape,context=self.ctx)
  31         plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
  32         self.temp = self.gpu_b.get()
  33         self.output_cpu.put(self.temp)
  34         self.output_cpu.close()
  35         self.ctx.pop()
  36         del self.gpu_a
  37         del self.gpu_b
  38         del self.ctx
  39         
  40         print "till the end %d" %self.number
  41         
  42 
  43 p = 8192; # INPUT IMAGE SIZE (8192 * 8192)
  44 m = 4     # TO DIVIDE THE INPUT IMAGE INTO 4* (2048 * 8192) SIZED IMAGES (Depends on the total memory of your GPU)
  45 trans = 2 # FOR TRANSPOSE-SPLIT (TS) ALGORITHM WHICH loops 2 times
  46 
  47 
  48 #INPUT IMAGE (GENERATE A 2d SINE WAVE PATTERN)
  49 p_n = 8000 # No. OF PERIODS OF SINE WAVES
  50 x=numpy.arange(0,p_n,float(p_n)/float(p))
  51 a_i = 128 + 128 * numpy.sin(2*numpy.pi*x)
  52 a2 = numpy.zeros([p,p],dtype=numpy.complex64)
  53 a2[::]=a_i
  54 scipy.misc.imsave("sine.bmp",numpy.absolute(a2)) #TEST THE GENERATION OF INPUT IMAGE
  55 
  56 #INITIALISE THE VARIABLES                  
  57 a2_1 = numpy.zeros([m,p*p/m],dtype = numpy.complex64) #INPUT TO THE GPU (1d ARRAY)
  58 #VERY IMPORTANT
  59 output_cpu  = multiprocessing.Queue() #STORE RESULT IN GPU (MULTIPROCESSING DOES NOT ALLOW SHARING AND HENCE THIS IS NEEDED FOR COMMUNICATION OF DATA)
  60 
  61 b2pa = numpy.zeros([p/m,p,m],dtype = numpy.complex64) #OUTPUT FROM GPU 
  62 b2_a = numpy.zeros([p,p],dtype = numpy.complex64)     #RESHAPED (8192*8192) OUTPUT
  63 
  64 #NOW WE ARE READY TO KICK START THE GPU
  65 
  66 # THE NO OF GPU'S PRESENT (CHANGE ACCORDING TO THE No.OF GPUS YOU HAVE)
  67 num = 2 # I KNOW THIS IS A BAD PRACTISE, BUT I COUNDN'T FIND ANY OTHER WAY(INIT CANNOT BE USED HERE)
  68 
  69 #THE TRANSPOSE-SPLIT ALGORITHM FOR FFT
  70 for t in range (0,trans):
  71     for i in range (m):
  72         a2_1[i,:] = a2[i*p/m:(i+1)*p/m,:].flatten()#DIVIDE AND RESHAPE THE INPUT IMAGE INTO 1D ARRAY
  73         
  74     for j in range (m/num):
  75         gpu_multi_list = []
  76 
  77 #CREATE AND START THE MULTIPROCESS
  78         for i in range (num):
  79             gpu_multi = GPUMulti(i,a2_1[i+j*num,:],output_cpu) #FEED THE DATA INTO THE GPU
  80             gpu_multi_list.append(gpu_multi)
  81             gpu_multi.start()#THERE YOU GO
  82 
  83 #COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESS AND RESHAPE                
  84         for gpu_pro in gpu_multi_list:
  85             temp_b2_1 = output_cpu.get(gpu_pro)
  86             b2pa[:,:,gpu_pro.number+j*num] = numpy.reshape(temp_b2_1,(p/m,p))
  87         gpu_multi.terminate()
  88 
  89 #RESHAPE AGAIN TO (8192 * 8192) IMAGE    
  90     for i in range(m):
  91         b2_a[i*p/m:(i+1)*p/m,:] = b2pa[:,:,i]


CategoryPyCuda CategoryPyCuda CategoryPyCuda