2D FFT of any size using PyFFT, PyCUDA and multiprocessing

This code computes the 2D FFT of an input of any size on two GPUs using the transpose-split algorithm. B.J. Jackin - jackin@opt.utsunomiya-u.ac.jp

License of this example:

Public Domain?

Date:

13-08-2010

PyCUDA version:

0.94rc
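
The transpose-split method relies on the identity that a 2D FFT can be built entirely out of row-wise 1D FFTs and transposes: FFT the rows, transpose, FFT the rows again, transpose back. A minimal CPU-only NumPy sketch of that identity (the array name and the 8x8 size are arbitrary, chosen just for illustration):

import numpy as np

A = (np.random.rand(8, 8) + 1j * np.random.rand(8, 8)).astype(np.complex64)

B = np.fft.fft(A, axis=1).T   # pass 1: 1D FFTs along the rows, then transpose
C = np.fft.fft(B, axis=1).T   # pass 2: 1D FFTs along the rows again, then transpose back

print(np.allclose(C, np.fft.fft2(A), atol=1e-3))  # True: matches the full 2D FFT

The script below does exactly this, except that each row-FFT pass is split into m slices so the data fits in GPU memory, and the slices are farmed out to the available GPUs num at a time.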

#2D FFT USING PYCUDA, PYFFT AND MULTIPROCESSING
#THE INPUT IMAGE / MATRIX IS DIVIDED INTO PARTS IN ORDER TO USE THE TRANSPOSE-SPLIT METHOD FOR THE 2D FFT
#HOW MANY PARTS THE INPUT IMAGE IS DIVIDED INTO IS DECIDED BY THE USER, BASED ON THE AVAILABLE MEMORY AND THE INPUT SIZE

import numpy
import scipy.misc
import numpy.fft as nfft
import multiprocessing

from pyfft.cuda import Plan
import pycuda.gpuarray as garray
import pycuda.driver as drv


class GPUMulti(multiprocessing.Process):
    def __init__(self, number, input_cpu, output_cpu):
        multiprocessing.Process.__init__(self)
        self.number = number
        self.input_cpu = input_cpu
        self.output_cpu = output_cpu

    def run(self):
        drv.init()
        a0 = numpy.zeros((p,), dtype=numpy.complex64)
        self.dev = drv.Device(self.number)
        self.ctx = self.dev.make_context()
        #CHECK THAT ALL THE MEMORY WAS FREED BEFORE THE NEXT ALLOCATION (THIS DOES NOT HAPPEN WITH MULTITHREADING)
        print drv.mem_get_info()
        self.gpu_a = garray.to_gpu(self.input_cpu)
        self.gpu_b = garray.zeros_like(self.gpu_a)
        plan = Plan(a0.shape, context=self.ctx)
        plan.execute(self.gpu_a, self.gpu_b, batch=p/m)  #p/m ROW-WISE 1D FFTs, EACH OF LENGTH p
        self.temp = self.gpu_b.get()
        #TAG THE RESULT WITH THE GPU NUMBER SO THE PARENT PLACES IT CORRECTLY EVEN IF PROCESSES FINISH OUT OF ORDER
        self.output_cpu.put((self.number, self.temp))
        self.output_cpu.close()
        self.ctx.pop()
        del self.gpu_a
        del self.gpu_b
        del self.ctx

        print "till the end %d" % self.number


p = 8192  # INPUT IMAGE SIZE (8192 * 8192)
m = 4     # DIVIDE THE INPUT IMAGE INTO 4 SLICES OF SIZE (2048 * 8192) (DEPENDS ON THE TOTAL MEMORY OF YOUR GPU)
trans = 2 # THE WHOLE PROCESS HAS TO BE REPEATED TWICE FOR THE TRANSPOSE-SPLIT (TS) ALGORITHM (SO trans = 2)


#INPUT IMAGE (GENERATE A 2D SINE WAVE PATTERN)
p_n = 8000 # NUMBER OF PERIODS OF THE SINE WAVE
x = numpy.arange(0, p_n, float(p_n)/float(p))
a_i = 128 + 128 * numpy.sin(2*numpy.pi*x)
a2 = numpy.zeros([p, p], dtype=numpy.complex64)
a2[::] = a_i
scipy.misc.imsave("sine.bmp", numpy.absolute(a2)) #CHECK THE GENERATED INPUT IMAGE

#INITIALISE THE VARIABLES
a2_1 = numpy.zeros([m, p*p/m], dtype=numpy.complex64) #INPUT TO THE GPU
#VERY IMPORTANT
output_cpu = multiprocessing.Queue() #QUEUE THAT CARRIES THE GPU OUTPUT BACK (MULTIPROCESSING DOES NOT SHARE MEMORY, SO THIS IS NEEDED TO COMMUNICATE THE DATA)

b2pa = numpy.zeros([p/m, p, m], dtype=numpy.complex64) #OUTPUT FROM THE GPU
b2_a = numpy.zeros([p, p], dtype=numpy.complex64)      #RESHAPED OUTPUT

#NOW WE ARE READY TO KICK START THE GPU

#THE NUMBER OF GPUS PRESENT (CHANGE ACCORDING TO THE NUMBER OF GPUS YOU HAVE)
num = 2 # I KNOW THIS IS BAD PRACTICE, BUT I COULDN'T FIND ANY OTHER WAY (INIT CANNOT BE USED HERE)

#THE TRANSPOSE-SPLIT ALGORITHM FOR THE FFT
for t in range(0, trans):
    for i in range(m):
        a2_1[i, :] = a2[i*p/m:(i+1)*p/m, :].flatten() #DIVIDE THE INPUT IMAGE AND RESHAPE EACH SLICE INTO A 1D ARRAY

    for j in range(m/num):
        gpu_multi_list = []

        #CREATE AND START THE PROCESSES, ONE PER GPU
        for i in range(num):
            gpu_multi = GPUMulti(i, a2_1[i+j*num, :], output_cpu) #FEED THE DATA INTO THE GPU
            gpu_multi_list.append(gpu_multi)
            gpu_multi.start() #THERE YOU GO

        #COLLECT THE OUTPUT FROM THE RUNNING PROCESSES AND RESHAPE
        for _ in gpu_multi_list:
            gpu_number, temp_b2_1 = output_cpu.get()
            b2pa[:, :, gpu_number+j*num] = numpy.reshape(temp_b2_1, (p/m, p))
        for gpu_pro in gpu_multi_list:
            gpu_pro.join()

    #REUNITE THE SLICES INTO A (p * p) IMAGE
    for i in range(m):
        b2_a[i*p/m:(i+1)*p/m, :] = b2pa[:, :, i]

    #TRANSPOSE THE OUTPUT AND MAKE IT THE NEW INPUT; THE WHOLE PROCESS REPEATS ONCE MORE (THIS IS THE TRANSPOSE-SPLIT ALGORITHM)
    a2 = b2_a.transpose()

#SAVE THE OUTPUT RESULT
scipy.misc.imsave("sine_fft.bmp", numpy.absolute(nfft.fftshift(a2)))
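
For a quick correctness check, the script can be run with a small problem size (for example p = 1024, m = 2) and the result compared against NumPy's reference 2D FFT. A sketch of such a check, assuming a copy of the generated input was kept in a variable a2_in (not part of the original script) just before the transpose-split loop:

#OPTIONAL SANITY CHECK, ONLY PRACTICAL FOR A SMALL p
#(REQUIRES a2_in = a2.copy() TAKEN BEFORE THE TRANSPOSE-SPLIT LOOP)
ref = nfft.fft2(a2_in)
err = numpy.abs(a2 - ref).max() / numpy.abs(ref).max()
print err  #EXPECT A RELATIVE ERROR ON THE ORDER OF SINGLE-PRECISION ROUND-OFF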
 102 
 103 
 104 
 105 