# Exercise 2 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises

# Generate an array of random numbers between 0 and 1
# List the indices of those numbers that are greater than or equal to a given limit

from __future__ import division
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy

# Define block size and number of elements per thread
block_size = 512
el_per_thread = 1
multiple_block_size = el_per_thread * block_size
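
# Note (added, not in the original): each block handles
# multiple_block_size = el_per_thread * block_size elements, so the grid
# launched below uses amount // multiple_block_size blocks; with these
# defaults that is 256 * 2560 // 512 = 1280 blocks of 512 threads.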

# Create an array of random numbers and set limit
amount = 256*2560
limit = 0.9
assert amount % multiple_block_size == 0
a = numpy.random.rand(amount)
a = a.astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)

# Initialize array for the selection on device
selec = numpy.zeros_like(a)
selec = selec.astype(numpy.int32)
selec.fill(-1)
selec_gpu = gpuarray.to_gpu(selec)

# Initialize a counter on device
counter_gpu = gpuarray.zeros(1, dtype=numpy.int32)

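# (Added for reference, not part of the original exercise:) the selection the
# kernel below computes can also be obtained on the host with a single NumPy
# call; the GPU version yields the same indices, but in an arbitrary order.
host_reference = numpy.nonzero(a >= limit)[0]
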
# Computation on device
mod = SourceModule("""
#define BLOCK_SIZE %(block_size)d
#define EL_PER_THREAD %(el_per_thread)d
// #define USE_LOOPS 1

__global__ void select_them(float *a, int *selec, float limit, int *counter)
{
    __shared__ int selec_smem[EL_PER_THREAD * BLOCK_SIZE];
    __shared__ int counter_smem;
    int *counter_smem_ptr;

    // index of this thread's first element; with EL_PER_THREAD == 1 this
    // reduces to the usual blockIdx.x * BLOCK_SIZE + threadIdx.x
    int jump = 16;
    int idx = EL_PER_THREAD * blockIdx.x * BLOCK_SIZE + threadIdx.x +
              (EL_PER_THREAD - 1) * (threadIdx.x / 16) * jump;

    // a single thread per block resets the shared counter
    if (threadIdx.x == 1)
    {
        counter_smem_ptr = &counter_smem;
        counter_smem = 0;
    }
    #if (EL_PER_THREAD == 1) && !defined(USE_LOOPS)
        selec_smem[threadIdx.x] = -1;
    #else
        for (int i = 0; i <= EL_PER_THREAD - 1; i++)
            selec_smem[threadIdx.x + i * BLOCK_SIZE] = -1;
    #endif

    __syncthreads();

    // each counting thread writes its index to shared memory

    #if (EL_PER_THREAD == 1) && !defined(USE_LOOPS)
        if (a[idx] >= limit)
            selec_smem[atomicAdd(counter_smem_ptr, 1)] = idx;
    #else
        for (int i = 0; i <= EL_PER_THREAD - 1; i++)
        {
            if (a[idx + i * jump] >= limit)
                selec_smem[atomicAdd(counter_smem_ptr, 1)] = idx + i * jump;
        }
    #endif

    __syncthreads();

    // one thread per block reserves space in the global output;
    // counter_smem afterwards holds this block's base offset
    if (threadIdx.x == 1)
        counter_smem = atomicAdd(counter, counter_smem);

    __syncthreads();

    // copy the block-local compacted indices to the reserved range of the
    // global output; the stride must match the initialization above
    // (BLOCK_SIZE) so that every shared-memory slot is copied exactly once
    #if (EL_PER_THREAD == 1) && !defined(USE_LOOPS)
        if (selec_smem[threadIdx.x] >= 0)
            selec[counter_smem + threadIdx.x] = selec_smem[threadIdx.x];
    #else
        for (int i = 0; i <= EL_PER_THREAD - 1; i++)
        {
            if (selec_smem[threadIdx.x + i * BLOCK_SIZE] >= 0)
                selec[counter_smem + threadIdx.x + i * BLOCK_SIZE] =
                    selec_smem[threadIdx.x + i * BLOCK_SIZE];
        }
    #endif
}
""" % {"block_size": block_size, "el_per_thread": el_per_thread})
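
# (Added sketch, not in the original:) for a one-off launch, the kernel could
# also be called through the simpler unprepared interface, e.g.
#   func = mod.get_function("select_them")
#   func(a_gpu, selec_gpu, numpy.float32(limit), counter_gpu,
#        block=(block_size, 1, 1), grid=(amount // multiple_block_size, 1))
# The prepared interface used below avoids re-inspecting the argument types on
# every call, which matters when timing many launches.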

# Prepare function call: "PPfP" describes the argument types
# (pointer, pointer, 32-bit float, pointer)
func = mod.get_function("select_them")
func.prepare("PPfP", block=(block_size, 1, 1))
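
# Note (added): newer PyCUDA releases drop the block argument from prepare();
# with such a version the two calls would look roughly like
#   func.prepare("PPfP")
#   func.prepared_call((amount // multiple_block_size, 1), (block_size, 1, 1),
#                      a_gpu.gpudata, selec_gpu.gpudata,
#                      limit, counter_gpu.gpudata)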

# Warmup
warmup = 2
for i in range(warmup):
    func.prepared_call((amount // multiple_block_size, 1),
                       a_gpu.gpudata, selec_gpu.gpudata,
                       limit, counter_gpu.gpudata)
    # reset the global counter for the next launch
    counter_gpu = gpuarray.zeros(1, dtype=numpy.int32)


# Prepare getting the time
start = cuda.Event()
stop = cuda.Event()

# Call function and get time
cuda.Context.synchronize()
start.record()
count = 10
for i in range(count):
    func.prepared_call((amount // multiple_block_size, 1),
                       a_gpu.gpudata, selec_gpu.gpudata,
                       limit, counter_gpu.gpudata)
    # reset the global counter for the next launch
    counter_gpu = gpuarray.zeros(1, dtype=numpy.int32)
stop.record()

# Copy selection from device to host
selec_gpu.get(selec)

stop.synchronize()
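
# (Added usage note, not in the original:) the selected indices can be read off
# the host copy by dropping the -1 padding; they come back in arbitrary order.
selected_indices = selec[selec >= 0]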

# Evaluate memory bandwidth and verify solution
# (numpy.nonzero returns a tuple of index arrays, so take its first element
# before counting)
elems_in_selec = len(numpy.nonzero(selec >= 0)[0])

# per launch the kernel reads a.nbytes bytes and writes about
# elems_in_selec * 4 bytes; count launches over elapsed_seconds give GB/s
elapsed_seconds = stop.time_since(start) * 1e-3
print "mem bw:", (a.nbytes + elems_in_selec * 4) / elapsed_seconds / 1e9 * count

filtered_set = sorted(item for item in selec if item != -1)
reference_set = sorted(i for i, x in enumerate(a) if x >= limit)
assert filtered_set == reference_set
