# Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises

# Transposition of a matrix
# by Hendrik Riedmann <riedmann@dam.brown.edu>

from __future__ import division

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy
import numpy.linalg as la

from pycuda.tools import context_dependent_memoize

block_size = 16

@context_dependent_memoize
def _get_transpose_kernel():
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = blockIdx.x * BLOCK_SIZE +
                           blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = blockIdx.y * BLOCK_SIZE +
                           blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];
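        // The extra "+1" column pads each row of A_shared so that the
        // column-wise accesses in the write phase below fall into different
        // shared-memory banks, avoiding bank conflicts.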

        // Store transposed submatrix to shared memory
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];

        __syncthreads();

        // Write transposed submatrix to global memory
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
    }
    """% {"block_size": block_size})

    func = mod.get_function("transpose")
    func.prepare("PPii", block=(block_size, block_size, 1))
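    # "PPii" declares the prepared argument types: two device pointers
    # followed by two ints.  (This uses the older PyCUDA interface in which
    # prepare() also fixes the block size; newer releases generally expect
    # the block to be passed to prepared_call() instead.)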

    from pytools import Record
    class TransposeKernelInfo(Record): pass

    return TransposeKernelInfo(func=func,
            block_size=block_size,
            granularity=block_size)



def _get_big_block_transpose_kernel():
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = 2 * blockIdx.x * BLOCK_SIZE +
                           2 * blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = 2 * blockIdx.y * BLOCK_SIZE +
                           2 * blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        __shared__ float A_shared[2 * BLOCK_SIZE][2 * BLOCK_SIZE + 1];
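        // Each thread block now covers a (2*BLOCK_SIZE) x (2*BLOCK_SIZE)
        // tile, so every thread moves four elements.  As above, the "+ 1"
        // column avoids shared-memory bank conflicts.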

        // Store transposed submatrix to shared memory
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
        A_shared[threadIdx.y][threadIdx.x + BLOCK_SIZE] =
            A[glob_idx_a + A_BLOCK_STRIDE];
        A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x] =
            A[glob_idx_a + BLOCK_SIZE];
        A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x + BLOCK_SIZE] =
            A[glob_idx_a + BLOCK_SIZE + A_BLOCK_STRIDE];

        __syncthreads();

        // Write transposed submatrix to global memory
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
        A_t[glob_idx_a_t + A_T_BLOCK_STRIDE] =
            A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y];
        A_t[glob_idx_a_t + BLOCK_SIZE] =
            A_shared[threadIdx.x][threadIdx.y + BLOCK_SIZE];
        A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] =
            A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE];
    }
    """% {"block_size": block_size})

    func = mod.get_function("transpose")
    func.prepare("PPii", block=(block_size, block_size, 1))

    from pytools import Record
    class TransposeKernelInfo(Record): pass

    return TransposeKernelInfo(func=func,
            block_size=block_size,
            granularity=2*block_size)




def _transpose(tgt, src):
    krnl = _get_transpose_kernel()
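    # (Swap in _get_big_block_transpose_kernel() here to try the variant
    # above that moves a 2x2 group of blocks per thread block; its
    # granularity field keeps the grid computation below correct.)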

    # src.shape is (rows, columns); the kernel expects a_width to be the
    # row stride (number of columns) of src.
    h, w = src.shape
    assert tgt.shape == (w, h)
    assert w % krnl.granularity == 0
    assert h % krnl.granularity == 0

    krnl.func.prepared_call(
            (w // krnl.granularity, h // krnl.granularity),
            tgt.gpudata, src.gpudata, w, h)




def transpose(src):
    h, w = src.shape

    result = gpuarray.empty((w, h), dtype=src.dtype)
    _transpose(result, src)
    return result

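# A possible convenience wrapper (not part of the original example): pad a
# host matrix so that both dimensions are multiples of the kernel
# granularity, transpose it on the GPU, and trim the padding afterwards.
# The name transpose_padded and the zero-padding approach are illustrative
# only.
def transpose_padded(a_host):
    g = _get_transpose_kernel().granularity
    rows, cols = a_host.shape
    padded_rows = ((rows + g - 1) // g) * g
    padded_cols = ((cols + g - 1) // g) * g

    # Copy into a zero-padded float32 buffer of conforming size
    padded = numpy.zeros((padded_rows, padded_cols), dtype=numpy.float32)
    padded[:rows, :cols] = a_host

    # Transpose on the GPU and cut the result back down to (cols, rows)
    result = transpose(gpuarray.to_gpu(padded))
    return result.get()[:cols, :rows]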




def check_transpose():
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        print(size)

        source = rand((size, size), dtype=numpy.float32)

        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        source.gpudata.free()
        result.gpudata.free()

        assert err_norm == 0, (size, err_norm)




def run_benchmark():
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []
    for i in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**i) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = cuda.Event()
        stop = cuda.Event()

        warmup = 2

        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start)*1e-3
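        # Each transpose reads the matrix once and writes it once, so the
        # factor of 2 below turns the measured time into an effective
        # read+write bandwidth.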
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])
    from matplotlib.pyplot import semilogx, loglog, show, savefig, clf
    semilogx(sizes, bandwidths)
    savefig("transpose-bw.png")
    clf()
    loglog(sizes, times)
    savefig("transpose-times.png")




#check_transpose()
run_benchmark()
