1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 
   4 """ 
   5 Multiples two square matrices together using multiple blocks and shared memory. 
   6 Each thread block is assigned a "tile" of the resulting matrix and is responsible
   7 for generating the elements in that tile.  Each thread in a block computes one element 
   8 of the tile.
   9 """
  10 
  11 import numpy as np
  12 from numpy import linalg as la
  13 from pycuda import driver, compiler, gpuarray, tools
  14 
  15 # -- initialize the device
  16 import pycuda.autoinit
  17 
  18 kernel_code_template = """
  19 __global__ void MatrixMulKernel(float *A, float *B, float *C)
  20 {
  21 
  22   const uint wA = %(MATRIX_SIZE)s;
  23   const uint wB = %(MATRIX_SIZE)s;  
  24   
  25   // Block index
  26   const uint bx = blockIdx.x;
  27   const uint by = blockIdx.y;
  28 
  29   // Thread index
  30   const uint tx = threadIdx.x;
  31   const uint ty = threadIdx.y;
  32 
  33   // Index of the first sub-matrix of A processed by the block
  34   const uint aBegin = wA * %(BLOCK_SIZE)s * by;
  35   // Index of the last sub-matrix of A processed by the block
  36   const uint aEnd = aBegin + wA - 1;
  37   // Step size used to iterate through the sub-matrices of A
  38   const uint aStep = %(BLOCK_SIZE)s;
  39 
  40   // Index of the first sub-matrix of B processed by the block
  41   const uint bBegin = %(BLOCK_SIZE)s * bx;
  42   // Step size used to iterate through the sub-matrices of B
  43   const uint bStep = %(BLOCK_SIZE)s * wB;
  44 
  45   // The element of the block sub-matrix that is computed
  46   // by the thread
  47   float Csub = 0;
  48   // Loop over all the sub-matrices of A and B required to
  49   // compute the block sub-matrix
  50   for (int a = aBegin, b = bBegin;
  51        a <= aEnd;
  52        a += aStep, b += bStep) 
  53     {
  54       // Shared memory for the sub-matrix of A
  55       __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
  56       // Shared memory for the sub-matrix of B
  57       __shared__ float Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
  58 
  59       // Load the matrices from global memory to shared memory;
  60       // each thread loads one element of each matrix
  61       As[ty][tx] = A[a + wA * ty + tx];
  62       Bs[ty][tx] = B[b + wB * ty + tx];
  63       // Synchronize to make sure the matrices are loaded
  64       __syncthreads();
  65 
  66       // Multiply the two matrices together;
  67       // each thread computes one element
  68       // of the block sub-matrix
  69       for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
  70         Csub += As[ty][k] * Bs[k][tx];
  71 
  72       // Synchronize to make sure that the preceding
  73       // computation is done before loading two new
  74       // sub-matrices of A and B in the next iteration
  75       __syncthreads();
  76     }
  77 
  78   // Write the block sub-matrix to global memory;
  79   // each thread writes one element
  80   const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
  81   C[c + wB * ty + tx] = Csub;
  82 }
  83 """
  84 
  85 # define the (square) matrix size
  86 MATRIX_SIZE = 4
  87 
  88 # define size of blocks and tiles sub-matrix 
  89 # (we assume that the block size is same as tile size)
  90 TILE_SIZE = 2
  91 BLOCK_SIZE = TILE_SIZE
  92 
  93 # create two random square matrices
  94 a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
  95 b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
  96 
  97 # compute reference on the CPU to verify GPU computation
  98 c_cpu = np.dot(a_cpu, b_cpu)
  99 
 100 # transfer host (CPU) memory to device (GPU) memory 
 101 a_gpu = gpuarray.to_gpu(a_cpu) 
 102 b_gpu = gpuarray.to_gpu(b_cpu)
 103 
 104 # create empty gpu array for the result (C = A * B)
 105 c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
 106 
 107 # get the kernel code from the template 
 108 # by specifying the constants MATRIX_SIZE and BLOCK_SIZE
 109 kernel_code = kernel_code_template % { 
 110     'MATRIX_SIZE': MATRIX_SIZE,
 111     'BLOCK_SIZE': BLOCK_SIZE,
 112     }
 113 
 114 # compile the kernel code
 115 mod = compiler.SourceModule(kernel_code)
 116 
 117 # get the kernel function from the compiled module
 118 matrixmul = mod.get_function("MatrixMulKernel")
 119 
 120 # call the kernel on the card
 121 matrixmul(
 122     # inputs
 123     a_gpu, b_gpu, 
 124     # output
 125     c_gpu, 
 126     # grid of multiple blocks
 127     grid = (MATRIX_SIZE / TILE_SIZE, MATRIX_SIZE / TILE_SIZE),
 128     # block of multiple threads
 129     block = (TILE_SIZE, TILE_SIZE, 1), 
 130     )
 131 
 132 # print the results
 133 print "-" * 80
 134 print "Matrix A (GPU):"
 135 print a_gpu.get()
 136 
 137 print "-" * 80
 138 print "Matrix B (GPU):"
 139 print b_gpu.get()
 140 
 141 print "-" * 80
 142 print "Matrix C (GPU):"
 143 print c_gpu.get()
 144 
 145 print "-" * 80
 146 print "CPU-GPU difference:"
 147 print c_cpu - c_gpu.get()
 148 print "L2 norm:", la.norm(c_cpu - c_gpu.get())
 149 np.allclose(c_cpu, c_gpu.get())