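This PyCUDA example transposes a dense single-precision matrix on the GPU. Each thread block stages a tile of the source matrix in shared memory so that both the global loads and the transposed global stores remain coalesced; a second kernel variant processes four tiles per block.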

# Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises

# Transposition of a matrix
# by Hendrik Riedmann <riedmann@dam.brown.edu>

from __future__ import division, print_function

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy
import numpy.linalg as la

from pycuda.tools import context_dependent_memoize

# Edge length of the square tile each thread block handles; 16 floats per
# tile row give fully coalesced 64-byte global memory transactions
block_size = 16

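# context_dependent_memoize caches the compiled module per CUDA context,
# so each kernel below is built once on first use and then reused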
@context_dependent_memoize
def _get_transpose_kernel():
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = blockIdx.x * BLOCK_SIZE +
                           blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = blockIdx.y * BLOCK_SIZE +
                           blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        // Pad the fast dimension by one float so the transposed read
        // below stays free of shared memory bank conflicts
        __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];

        // Stage the submatrix in shared memory
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];

        __syncthreads();

        // Write the submatrix out transposed; swapping the thread
        // indices on the shared read keeps the global stores coalesced
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
    }
    """ % {"block_size": block_size})

    func = mod.get_function("transpose")
    func.prepare("PPii")

    from pytools import Record
    class TransposeKernelInfo(Record): pass

    return TransposeKernelInfo(func=func,
            block_size=block_size,
            granularity=block_size)


@context_dependent_memoize
def _get_big_block_transpose_kernel():
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Each block handles a 2x2 group of BLOCK_SIZE x BLOCK_SIZE tiles,
        // quadrupling the work per thread relative to the kernel above

        // Base indices in A and A_t
        int base_idx_a   = 2 * blockIdx.x * BLOCK_SIZE +
                           2 * blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = 2 * blockIdx.y * BLOCK_SIZE +
                           2 * blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        __shared__ float A_shared[2 * BLOCK_SIZE][2 * BLOCK_SIZE + 1];

        // Stage the four submatrices in shared memory
        A_shared[threadIdx.y][threadIdx.x] =
            A[glob_idx_a];
        A_shared[threadIdx.y][threadIdx.x + BLOCK_SIZE] =
            A[glob_idx_a + A_BLOCK_STRIDE];
        A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x] =
            A[glob_idx_a + BLOCK_SIZE];
        A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x + BLOCK_SIZE] =
            A[glob_idx_a + BLOCK_SIZE + A_BLOCK_STRIDE];

        __syncthreads();

        // Write the four submatrices out transposed
        A_t[glob_idx_a_t] =
            A_shared[threadIdx.x][threadIdx.y];
        A_t[glob_idx_a_t + A_T_BLOCK_STRIDE] =
            A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y];
        A_t[glob_idx_a_t + BLOCK_SIZE] =
            A_shared[threadIdx.x][threadIdx.y + BLOCK_SIZE];
        A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] =
            A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE];
    }
    """ % {"block_size": block_size})

    func = mod.get_function("transpose")
    func.prepare("PPii")

    from pytools import Record
    class TransposeKernelInfo(Record): pass

    return TransposeKernelInfo(func=func,
            block_size=block_size,
            granularity=2*block_size)


def _transpose(tgt, src):
    krnl = _get_transpose_kernel()

    h, w = src.shape  # h: rows of src (a_height), w: columns (a_width)
    assert tgt.shape == (w, h)
    assert w % krnl.granularity == 0
    assert h % krnl.granularity == 0

    krnl.func.prepared_call(
            (w // krnl.granularity, h // krnl.granularity),
            (krnl.block_size, krnl.block_size, 1),
            tgt.gpudata, src.gpudata, w, h)


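# A sketch (not part of the original example) of the same driver routed
# through the 2x-granularity kernel above, which is otherwise unused here;
# the helper name _transpose_big_block is hypothetical.
def _transpose_big_block(tgt, src):
    krnl = _get_big_block_transpose_kernel()

    h, w = src.shape
    assert tgt.shape == (w, h)
    # both edges must be divisible by 2 * block_size
    assert w % krnl.granularity == 0
    assert h % krnl.granularity == 0

    krnl.func.prepared_call(
            (w // krnl.granularity, h // krnl.granularity),
            (krnl.block_size, krnl.block_size, 1),
            tgt.gpudata, src.gpudata, w, h)

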
def transpose(src):
    h, w = src.shape

    result = gpuarray.empty((w, h), dtype=src.dtype)
    _transpose(result, src)
    return result


def check_transpose():
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        print(size)

        source = rand((size, size), dtype=numpy.float32)

        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        source.gpudata.free()
        result.gpudata.free()

        # a transpose only moves values around, so the result must be exact
        assert err_norm == 0, (size, err_norm)


def run_benchmark():
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []
    for i in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**i) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = cuda.Event()
        stop = cuda.Event()

        warmup = 2

        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        # time_since() reports milliseconds
        elapsed_seconds = stop.time_since(start)*1e-3
        # each transpose reads and writes the matrix once, so 2*nbytes of
        # memory traffic per call, with count calls in the timed loop
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])

    from matplotlib.pyplot import semilogx, loglog, savefig, clf
    semilogx(sizes, bandwidths)
    savefig("transpose-bw.png")
    clf()
    loglog(sizes, times)
    savefig("transpose-times.png")


#check_transpose()
run_benchmark()
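
# Example interactive use, as a sketch (comment out run_benchmark() above
# before importing this module):
#
#   a = gpuarray.to_gpu(numpy.random.randn(128, 64).astype(numpy.float32))
#   a_t = transpose(a)
#   assert (a_t.get() == a.get().T).all()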
