Differences between revisions 3 and 4
Revision 3 as of 2014-04-24 18:36:18
Size: 6263
Editor: ::ffff:66
Comment:
Revision 4 as of 2014-09-04 23:33:14
Size: 6511
Editor: AndreasKloeckner
Comment:
Deletions are marked with a leading '-'. Additions are marked with a leading '+'.
Line 7: Line 7:
- from __future__ import division
+ from __future__ import division, print_function
Line 25: Line 25:
-     #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width) #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
+     #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
+     #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
Line 30: Line 31:
(whitespace only: the wrapped base_idx_a / base_idx_a_t initializers were re-indented)
Line 43: Line 44:
(a whitespace-only line was removed)
Line 52: Line 53:
-     func.prepare("PPii", block=(block_size, block_size, 1))
+     func.prepare("PPii")
Line 57: Line 58:
      return TransposeKernelInfo(func=func,
+             block=(block_size, block_size, 1),
Line 72: Line 74:
(whitespace only: the wrapped base_idx_a / base_idx_a_t initializers were re-indented)
Line 85: Line 87:
(whitespace only: the wrapped A_shared[...] = A[...] store statements were re-indented)
Line 91: Line 93:
(a whitespace-only line was removed)
Line 96: Line 98:
(whitespace only: the wrapped A_t[...] = A_shared[...] write-back statements were re-indented)
Line 106: Line 108:
-     func.prepare("PPii", block=(block_size, block_size, 1))
+     func.prepare("PPii")
Line 111: Line 113:
      return TransposeKernelInfo(func=func,
+             block=(block_size, block_size, 1),
              block_size=block_size,
Line 127: Line 130:
-                     (w // krnl.granularity, h // krnl.granularity),
+                     (w // krnl.granularity, h // krnl.granularity), krnl.block,
Line 149: Line 152:
-         print size
+         print(size)
Line 169: Line 172:
-     sizes = []
+     powers = numpy.arange(10, 13, 2**(-6))
+     sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
Line 172: Line 176:
-     for i in numpy.arange(10, 13, 2**(-6)):
-         size = int(((2**i) // 16) * 16)
+     for size in sizes:
Line 183: Line 187:
-  for i in range(warmup):
-   _transpose(target, source)
+         for i in range(warmup):
+             _transpose(target, source)
Line 200: Line 204:
-         sizes.append(size)
Line 205: Line 208:
-     print slow_sizes
-     print [s % 64 for s in slow_sizes]
-     from matplotlib.pyplot import semilogx, loglog, show, savefig, clf
+     print("Sizes for which bandwidth was low:", slow_sizes)
+     print("Ditto, mod 64:", [s % 64 for s in slow_sizes])
+     from matplotlib.pyplot import semilogx, loglog, show, savefig, clf, xlabel, ylabel
+     xlabel('matrix size')
+     ylabel('bandwidth')
Line 211: Line 216:
+     xlabel('matrix size')
+     ylabel('time')
Line 220: Line 227:

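The substantive change in this revision is the switch to the newer PyCUDA prepared-invocation convention: the CUDA block size is no longer bound when the kernel is prepared, but is passed with every prepared_call(). As a schematic before/after sketch (func, grid, tgt and src stand for the prepared kernel, its launch grid and the two gpuarrays; not runnable on its own):

    # Revision 3 (older PyCUDA API): block size fixed at prepare() time
    #     func.prepare("PPii", block=(block_size, block_size, 1))
    #     func.prepared_call(grid, tgt.gpudata, src.gpudata, w, h)
    #
    # Revision 4 (newer PyCUDA API): prepare() only fixes the argument types,
    # and the block size is passed explicitly on each call
    #     func.prepare("PPii")
    #     func.prepared_call(grid, (block_size, block_size, 1),
    #                        tgt.gpudata, src.gpudata, w, h)

The remaining changes are Python 3 compatible printing via print_function, precomputing the benchmark sizes, axis labels for the two plots, and whitespace cleanup.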
   1 # Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises
   2 
   3 # Transposition of a matrix
   4 # by Hendrik Riedmann <riedmann@dam.brown.edu>
   5 
   6 from __future__ import division, print_function
   7 
   8 import pycuda.driver as cuda
   9 import pycuda.gpuarray as gpuarray
  10 import pycuda.autoinit
  11 from pycuda.compiler import SourceModule
  12 
  13 import numpy
  14 import numpy.linalg as la
  15 
  16 from pycuda.tools import context_dependent_memoize
  17 
  18 block_size = 16
  19 
  20 @context_dependent_memoize
  21 def _get_transpose_kernel():
  22     mod = SourceModule("""
  23     #define BLOCK_SIZE %(block_size)d
  24     #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
  25     #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
  26 
  27     __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
  28     {
  29         // Base indices in A and A_t
  30         int base_idx_a   = blockIdx.x * BLOCK_SIZE +
  31     blockIdx.y * A_BLOCK_STRIDE;
  32         int base_idx_a_t = blockIdx.y * BLOCK_SIZE +
  33     blockIdx.x * A_T_BLOCK_STRIDE;
  34 
  35         // Global indices in A and A_t
  36         int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
  37         int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;
  38 
  39         __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];  // +1 padding column avoids shared memory bank conflicts
  40 
  41         // Store transposed submatrix to shared memory
  42         A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
  43 
  44         __syncthreads();
  45 
  46         // Write transposed submatrix to global memory
  47         A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
  48     }
  49     """% {"block_size": block_size})
  50 
  51     func = mod.get_function("transpose")
  52     func.prepare("PPii")
  53 
  54     from pytools import Record
  55     class TransposeKernelInfo(Record): pass
  56 
  57     return TransposeKernelInfo(func=func,
  58             block=(block_size, block_size, 1),
  59             block_size=block_size,
  60             granularity=block_size)
  61 
  62 
  63 
  64 def _get_big_block_transpose_kernel():
  65     mod = SourceModule("""
  66     #define BLOCK_SIZE %(block_size)d
  67     #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
  68     #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
  69 
  70     __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
  71     {
  72         // Base indices in A and A_t
  73         int base_idx_a   = 2 * blockIdx.x * BLOCK_SIZE +
  74     2 * blockIdx.y * A_BLOCK_STRIDE;
  75         int base_idx_a_t = 2 * blockIdx.y * BLOCK_SIZE +
  76     2 * blockIdx.x * A_T_BLOCK_STRIDE;
  77 
  78         // Global indices in A and A_t
  79         int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
  80         int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;
  81 
  82         __shared__ float A_shared[2 * BLOCK_SIZE][2 * BLOCK_SIZE + 1];  // +1 padding column avoids shared memory bank conflicts
  83 
  84         // Store transposed submatrix to shared memory
  85         A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
  86         A_shared[threadIdx.y][threadIdx.x + BLOCK_SIZE] =
  87     A[glob_idx_a + A_BLOCK_STRIDE];
  88         A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x] =
  89     A[glob_idx_a + BLOCK_SIZE];
  90         A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x + BLOCK_SIZE] =
  91         A[glob_idx_a + BLOCK_SIZE + A_BLOCK_STRIDE];
  92 
  93         __syncthreads();
  94 
  95         // Write transposed submatrix to global memory
  96         A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
  97         A_t[glob_idx_a_t + A_T_BLOCK_STRIDE] =
  98     A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y];
  99         A_t[glob_idx_a_t + BLOCK_SIZE] =
 100     A_shared[threadIdx.x][threadIdx.y + BLOCK_SIZE];
 101         A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] =
 102     A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE];
 103     }
 104       """% {"block_size": block_size})
 105 
 106     func = mod.get_function("transpose")
 107     func.prepare("PPii")
 108 
 109     from pytools import Record
 110     class TransposeKernelInfo(Record): pass
 111 
 112     return TransposeKernelInfo(func=func,
 113             block=(block_size, block_size, 1),
 114             block_size=block_size,
 115             granularity=2*block_size)
 116 
 117 
 118 
 119 
 120 def _transpose(tgt, src):
 121     krnl = _get_transpose_kernel()
 122 
 123     h, w = src.shape                  # shape is (rows, cols): h = a_height, w = a_width of A
 124     assert tgt.shape == (w, h)
 125     assert w % krnl.granularity == 0
 126     assert h % krnl.granularity == 0
 127 
 128     krnl.func.prepared_call(
 129                     (w // krnl.granularity, h // krnl.granularity), krnl.block,
 130                     tgt.gpudata, src.gpudata, w, h)
 131 
 132 
 133 
 134 
 135 def transpose(src):
 136     h, w = src.shape
 137 
 138     result = gpuarray.empty((w, h), dtype=src.dtype)
 139     _transpose(result, src)
 140     return result
 141 
 142 
 143 
 144 
 145 
 146 def check_transpose():
 147     from pycuda.curandom import rand
 148 
 149     for i in numpy.arange(10, 13, 0.125):
 150         size = int(((2**i) // 32) * 32)
 151         print(size)
 152 
 153         source = rand((size, size), dtype=numpy.float32)
 154 
 155         result = transpose(source)
 156 
 157         err = source.get().T - result.get()
 158         err_norm = la.norm(err)
 159 
 160         source.gpudata.free()
 161         result.gpudata.free()
 162 
 163         assert err_norm == 0, (size, err_norm)
 164 
 165 
 166 
 167 
 168 def run_benchmark():
 169     from pycuda.curandom import rand
 170 
 171     powers = numpy.arange(10, 13, 2**(-6))
 172     sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
 173     bandwidths = []
 174     times = []
 175 
 176     for size in sizes:
 177 
 178         source = rand((size, size), dtype=numpy.float32)
 179         target = gpuarray.empty((size, size), dtype=source.dtype)
 180 
 181         start = pycuda.driver.Event()
 182         stop = pycuda.driver.Event()
 183 
 184         warmup = 2
 185 
 186         for i in range(warmup):
 187             _transpose(target, source)
 188 
 189         count = 10
 190 
 191         cuda.Context.synchronize()
 192         start.record()
 193 
 194         for i in range(count):
 195             _transpose(target, source)
 196 
 197         stop.record()
 198         stop.synchronize()
 199 
 200         elapsed_seconds = stop.time_since(start)*1e-3
 201         mem_bw = source.nbytes / elapsed_seconds * 2 * count
 202 
 203         bandwidths.append(mem_bw)
 204         times.append(elapsed_seconds)
 205 
 206     slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
 207     print("Sizes for which bandwidth was low:", slow_sizes)
 208     print("Ditto, mod 64:", [s % 64 for s in slow_sizes])
 209     from matplotlib.pyplot import semilogx, loglog, show, savefig, clf, xlabel, ylabel
 210     xlabel('matrix size')
 211     ylabel('bandwidth')
 212     semilogx(sizes, bandwidths)
 213     savefig("transpose-bw.png")
 214     clf()
 215     xlabel('matrix size')
 216     ylabel('time')
 217     loglog(sizes, times)
 218     savefig("transpose-times.png")
 219 
 220 
 221 
 222 
 223 #check_transpose()
 224 run_benchmark()
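A minimal usage sketch for the transpose() helper defined above. It assumes the listing is saved as a module (the file name matrix_transpose.py is made up here), that the run_benchmark() call at the bottom of the file is commented out so importing has no side effects, and that both matrix dimensions are multiples of the kernel granularity (16):

    import numpy
    import pycuda.autoinit                   # create a CUDA context on the default device
    import pycuda.gpuarray as gpuarray

    from matrix_transpose import transpose   # hypothetical module name for the listing above

    a = numpy.random.randn(1024, 1024).astype(numpy.float32)
    a_gpu = gpuarray.to_gpu(a)               # copy the matrix to the GPU

    a_t_gpu = transpose(a_gpu)               # tiled shared-memory transpose on the GPU
    assert (a_t_gpu.get() == a.T).all()      # exact match: the kernel only moves data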

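For reference, the bandwidth figure computed in run_benchmark() counts every element as read once and written once over the count timed transposes. A worked example with made-up timing numbers:

    size = 1024
    nbytes = size * size * 4                        # one float32 matrix: about 4.2 MB
    count = 10                                      # number of timed transposes
    elapsed_seconds = 1.0e-3                        # suppose the 10 transposes took 1 ms in total
    mem_bw = nbytes / elapsed_seconds * 2 * count   # same formula as in run_benchmark()
    print(mem_bw)                                   # about 8.4e10 bytes/s, i.e. roughly 84 GB/s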
PyCuda/Examples/MatrixTranspose (last edited 2014-09-04 23:33:14 by AndreasKloeckner)