Attachment 'demo_meta_matrixmul_cheetah.template.cu'

Download

   1 #raw
   2 #include <stdio.h>
   3 #end raw
   4 
   5 // =============================================================================
   6 // PyCuda Optimized Matrix Multiplication 
   7 // Template Meta-programming Example using Cheetah 
   8 // -----------------------------------------------------------------------------
   9 // These are the five performance-tuning parameters at your disposal
  10 // -----------------------------------------------------------------------------
  11 
  12 // Thread block size 
  13 // (available values are 1, 2, 4, 8, 12 and 16)
  14 #if $BLOCK_SIZE not in [1, 2, 4, 8, 12, 16]
  15 #raise ValueError, "$BLOCK_SIZE not in [1, 2, 4, 8, 12, 16]"
  16 #end if
  17 
  18 // Work size, or number of matrix N tiles per thread 
  19 // (available values are 1, 2 and 4)
  20 #if $WORK_SIZE not in [1, 2, 4]
  21 #raise ValueError, "$WORK_SIZE not in [1, 2, 4]"
  22 #end if
  23 
  24 // Dot product loop unrolling factor
  25 // (available values are 0, 1, 3, 7 and 15; the generated dot-product loop steps by UNROLL + 1)
  26 #if $UNROLL not in [0, 1, 3, 7, 15]
  27 #raise ValueError, "$UNROLL not in [0, 1, 3, 7, 15]"
  28 #end if
  29 
  30 // Register spilling
  31 // (boolean)
  32 #if type($SPILL) != bool
  33 #raise ValueError, "type($SPILL) != bool"
  34 #end if
  35 
  36 // Prefetching 
  37 // (boolean)
  38 #if type($PREFETCH) != bool
  39 #raise ValueError, "type($PREFETCH) != bool"
  40 #end if
  41 
  42 // =============================================================================
  43 __global__ void
  44 matrixMul(float* C, float* A, float* B)
  45 {
  46   // Block index
  47   const unsigned bx = blockIdx.x;
  48   const unsigned by = blockIdx.y;
  49   
  50   // Thread index
  51   const unsigned tx = threadIdx.x;
  52   const unsigned ty = threadIdx.y;
  53   
  54   // Index of the first sub-matrix of A processed by this block
  55   const unsigned aBegin = $A_WIDTH * $BLOCK_SIZE * by;
  56   // Per-thread loop bound: one full A row (A_WIDTH elements) past this thread's start index, so the tile loop below executes A_WIDTH / BLOCK_SIZE times
  57   const unsigned aEnd = aBegin + $A_WIDTH * (ty + 1) + tx;
  58   // Step size used to iterate through the sub-matrices of A
  59   const unsigned aStep = $BLOCK_SIZE;
  60 
  61   // Index of the first sub-matrix of B processed by this block
  62   const unsigned bBegin = $BLOCK_SIZE * $WORK_SIZE * bx;  
  63   // Step size used to iterate through the sub-matrices of B
  64   const unsigned bStep = $BLOCK_SIZE * $B_WIDTH;
  65 
  66   // Index of the output value for this thread
  67 #if $SPILL
  68   // Create a shared-memory buffer to spill a register value
  69   // into shared memory, hopefully reducing the total required
  70   // register count.  Each thread writes and later reads only its own slot c[tx][ty].
  71   __shared__ int c[$BLOCK_SIZE][$BLOCK_SIZE];
  72   c[tx][ty] = bBegin + $B_WIDTH * $BLOCK_SIZE * by + $B_WIDTH * ty + tx;
  73 #else
  74   const unsigned c =  bBegin + $B_WIDTH * $BLOCK_SIZE * by + $B_WIDTH * ty + tx;
  75 #end if // $SPILL
  76 
  77   // Initialize (sub)result(s) to 0.
  78   float sub[$WORK_SIZE];
  79 #for w in xrange($WORK_SIZE)
  80   sub[$w] = 0;
  81 #end for
  82 
  83   // Current indexes
  84   unsigned a = aBegin + $A_WIDTH * ty + tx;
  85   unsigned b = bBegin + $B_WIDTH * ty + tx;
  86 
  87 #if not $PREFETCH
  88   // ---------------------------------------------------------------------------
  89   // Code *without* prefetching
  90   // ---------------------------------------------------------------------------  
  91   // Loop over all the sub-matrices of A and B
  92   // required to compute the block sub-matrix
  93   while (a < aEnd) {
  94 
  95     // Shared memory for the sub-matrix of A
  96     __shared__ float As[$BLOCK_SIZE][$BLOCK_SIZE];
  97     // Shared memory for the sub-matrix of B
  98     __shared__ float Bs[$BLOCK_SIZE][$BLOCK_SIZE * $WORK_SIZE];
  99 
 100     // Load the matrices from device memory
 101     // directly to shared memory
 102     As[ty][tx] = A[a];
 103 #for w in xrange($WORK_SIZE)
 104     Bs[ty][tx + $BLOCK_SIZE * $w] = B[b + $BLOCK_SIZE * $w];
 105 #end for // w in xrange($WORK_SIZE)
 106     
 107     // Update for next loop
 108     a += aStep;
 109     b += bStep;
 110 
 111     // Synchronize to make sure the shared memory
 112     // tiles are ready
 113     __syncthreads();
 114 
 115     // Compute dot-product (with easy unroll ;-)
 116     for (int i = 0; i < $BLOCK_SIZE; i += $UNROLL + 1)
 117       {
 118 #for u in xrange(min($BLOCK_SIZE, $UNROLL + 1))
 119 #for w in xrange($WORK_SIZE)
 120 	sub[$w] += As[ty][i+$u] * Bs[i+$u][tx + $BLOCK_SIZE * $w];
 121 #end for // w in xrange($WORK_SIZE)
 122 #end for // u in xrange($UNROLL + 1)
 123       }
 124 
 125     // Synchronize to make sure that the preceding
 126     // computation is done before overwriting new
 127     // shared memory sub-matrices of A and B in the next iteration
 128     __syncthreads();
 129   }
 130 
 131 #else
 132 
 133   // ---------------------------------------------------------------------------
 134   // Code *with* prefetching
 135   // ---------------------------------------------------------------------------  
 136   // Initial prefetch.  Issues loads to main memory and store 
 137   // in temporary variables which will later be stored to shared memory
 138   float fa = A[a];
 139   float fb[$WORK_SIZE];
 140 #for w in xrange($WORK_SIZE)
 141   fb[$w] = B[b + $BLOCK_SIZE * $w];
 142 #end for // w in xrange($WORK_SIZE)
 143   
 144   // Shared memory for the sub-matrix of A
 145   __shared__ float As[$BLOCK_SIZE][$BLOCK_SIZE];
 146   // Shared memory for the sub-matrix of B
 147   __shared__ float Bs[$BLOCK_SIZE][$BLOCK_SIZE * $WORK_SIZE];
 148 
 149   // ---------------------------------------------------------------------------
 150   // Loop over all the sub-matrices of A and B
 151   // required to compute the block sub-matrix
 152   while (a < (aEnd-aStep)) {
 153 
 154     // When performing prefetching, the values are already loaded
 155     // from memory, and the temporary variables holding the loaded
 156     // values are stored to shared memory.
 157     As[ty][tx] = fa;
 158 #for w in xrange($WORK_SIZE)
 159     Bs[ty][tx + $BLOCK_SIZE * $w] = fb[$w];
 160 #end for // w in xrange($WORK_SIZE)
 161     
 162     // Update for next loop
 163     a += aStep;
 164     b += bStep;
 165 
 166     // Synchronize to make sure the shared memory
 167     // tiles are ready
 168     __syncthreads();
 169 
 170     // Issue the loads for the next tiles preemptively.
 171     // The loads will complete and be stored into these temporary
 172     // variables while the current shared memory tiles
 173     // are being operated on.
 174     fa = A[a];
 175 #for w in xrange($WORK_SIZE)
 176     fb[$w] = B[b + $BLOCK_SIZE * $w];
 177 #end for // w in xrange($WORK_SIZE)
 178 
 179     // Compute dot-product (with easy unroll ;-)
 180     for (int i = 0; i < $BLOCK_SIZE; i += $UNROLL + 1)
 181       {
 182 #for u in xrange(min($BLOCK_SIZE, $UNROLL + 1))
 183 #for w in xrange($WORK_SIZE)
 184 	sub[$w] += As[ty][i+$u] * Bs[i+$u][tx + $BLOCK_SIZE * $w];
 185 #end for // w in xrange($WORK_SIZE)
 186 #end for // u in xrange($UNROLL + 1)
 187       }
 188 
 189     // Synchronize to make sure that the preceding
 190     // computation is done before overwriting new
 191     // shared memory sub-matrices of A and B in the next iteration
 192     __syncthreads();
 193   }
 194 
 195   // Last iteration (with no pre-emptive loading) 
 196   As[ty][tx] = fa;
 197 #for w in xrange($WORK_SIZE)
 198   Bs[ty][tx + $BLOCK_SIZE * $w] = fb[$w];
 199 #end for // w in xrange($WORK_SIZE)
 200     
 201   // Update for next loop
 202   a += aStep;
 203   b += bStep;
 204 
 205   // Synchronize to make sure the shared memory
 206   // tiles are ready
 207   __syncthreads();
 208 
 209   // Compute dot-product (with easy unroll ;-)
 210   for (int i = 0; i < $BLOCK_SIZE; i += $UNROLL + 1)
 211     {
 212 #for u in xrange(min($BLOCK_SIZE, $UNROLL + 1))
 213 #for w in xrange($WORK_SIZE)
 214       sub[$w] += As[ty][i+$u] * Bs[i+$u][tx + $BLOCK_SIZE * $w];
 215 #end for // w in xrange($WORK_SIZE)
 216 #end for // u in xrange($UNROLL + 1)
 217     }
 218 
 219   // Synchronize to make sure that the preceding
 220   // computation is done before overwriting new
 221   // shared memory sub-matrices of A and B in the next iteration
 222   __syncthreads();
 223 
 224 #end if // not $PREFETCH 
 225 
 226   // ---------------------------------------------------------------------------
 227   // Output the final result(s) for each thread.
 228 #for w in xrange($WORK_SIZE)
 229 #if $SPILL
 230   // If we spilled the output index at the beginning, load it back
 231   // from the shared memory array.
 232   C[c[tx][ty] + $BLOCK_SIZE * $w] = sub[$w];
 233 #else
 234   C[c + $BLOCK_SIZE * $w] = sub[$w];
 235 #end if // $SPILL
 236 #end for // w in xrange($WORK_SIZE)
 237 
 238 }

New Attachment

File to upload
Rename to
Overwrite existing attachment of same name
SPAM prevention question: Opposite of left?

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2011-02-08 23:10:11, 7.5 KB) [[attachment:demo_meta_matrixmul_cheetah.template.cu]]
 All files | Selected Files: delete move to page copy to page