#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

   6 """ 
   7 Multiples two square matrices together using multiple blocks and shared memory. 
   8 Each thread block is assigned a "tile" of the resulting matrix and is responsible
   9 for generating the elements in that tile.  Each thread in a block computes one element 
  10 of the tile.
  11 """

import numpy as np
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools

# -- initialize the device
import pycuda.autoinit

kernel_code_template = """
__global__ void MatrixMulKernel(float *A, float *B, float *C)
{

  const uint wA = %(MATRIX_SIZE)s;
  const uint wB = %(MATRIX_SIZE)s;

  // Block index
  const uint bx = blockIdx.x;
  const uint by = blockIdx.y;

  // Thread index
  const uint tx = threadIdx.x;
  const uint ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  const uint aBegin = wA * %(BLOCK_SIZE)s * by;
  // Index of the last sub-matrix of A processed by the block
  const uint aEnd = aBegin + wA - 1;
  // Step size used to iterate through the sub-matrices of A
  const uint aStep = %(BLOCK_SIZE)s;

  // Index of the first sub-matrix of B processed by the block
  const uint bBegin = %(BLOCK_SIZE)s * bx;
  // Step size used to iterate through the sub-matrices of B
  const uint bStep = %(BLOCK_SIZE)s * wB;
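  // Worked example with the sizes used below (MATRIX_SIZE = 4, BLOCK_SIZE = 2,
  // so wA = wB = 4): the block at (bx, by) = (1, 1) gets
  //   aBegin = 4 * 2 * 1 = 8  -> element (2, 0) of A in row-major order,
  //   aEnd   = 8 + 4 - 1 = 11 -> element (2, 3) of A,
  //   bBegin = 2 * 1     = 2  -> element (0, 2) of B,
  // and each loop iteration moves 2 columns right in A (aStep) and
  // 2 rows down in B (bStep = 2 * 4 = 8).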

  // The element of the block sub-matrix that is computed
  // by the thread
  float Csub = 0;
  // Loop over all the sub-matrices of A and B required to
  // compute the block sub-matrix
  for (int a = aBegin, b = bBegin;
       a <= aEnd;
       a += aStep, b += bStep)
    {
      // Shared memory for the sub-matrix of A
      __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
      // Shared memory for the sub-matrix of B
      __shared__ float Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

      // Load the matrices from global memory to shared memory
      // each thread loads one element of each matrix
      As[ty][tx] = A[a + wA * ty + tx];
      Bs[ty][tx] = B[b + wB * ty + tx];
      // Synchronize to make sure the matrices are loaded
      __syncthreads();

      // Multiply the two matrices together;
      // each thread computes one element
      // of the block sub-matrix
      for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
        Csub += As[ty][k] * Bs[k][tx];

      // Synchronize to make sure that the preceding
      // computation is done before loading two new
      // sub-matrices of A and B in the next iteration
      __syncthreads();
    }

  // Write the block sub-matrix to global memory;
  // each thread writes one element
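  // (in row/column terms, thread (tx, ty) of block (bx, by) stores
  //  C[BLOCK_SIZE * by + ty][BLOCK_SIZE * bx + tx])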
  const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
  C[c + wB * ty + tx] = Csub;
}
"""

# define the (square) matrix size
MATRIX_SIZE = 4

# define the size of the thread blocks and of the tiles of the result matrix
# (we assume that the block size is the same as the tile size)
TILE_SIZE = 2
BLOCK_SIZE = TILE_SIZE
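
# the kernel above has no bounds checks, so this simple version requires the
# matrix size to be an exact multiple of the tile size (as it is here: 4 and 2)
assert MATRIX_SIZE % TILE_SIZE == 0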

# create two random square matrices
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)

# compute reference on the CPU to verify GPU computation
c_cpu = np.dot(a_cpu, b_cpu)

# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# create empty gpu array for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

# get the kernel code from the template
# by specifying the constants MATRIX_SIZE and BLOCK_SIZE
kernel_code = kernel_code_template % {
    'MATRIX_SIZE': MATRIX_SIZE,
    'BLOCK_SIZE': BLOCK_SIZE,
    }
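# (substituting the sizes as literal text makes them compile-time constants in
# the CUDA source, which lets the __shared__ tile arrays be statically sized)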

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")

# call the kernel on the card
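# (with MATRIX_SIZE = 4 and TILE_SIZE = 2 this launches a 2 x 2 grid of blocks,
# each a 2 x 2 array of threads, i.e. one thread per element of C)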
matrixmul(
    # inputs
    a_gpu, b_gpu,
    # output
    c_gpu,
    # grid of multiple blocks
    grid = (MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE),
    # block of multiple threads
    block = (TILE_SIZE, TILE_SIZE, 1),
    )

# print the results
print "-" * 80
print "Matrix A (GPU):"
print a_gpu.get()

print "-" * 80
print "Matrix B (GPU):"
print b_gpu.get()

print "-" * 80
print "Matrix C (GPU):"
print c_gpu.get()

print "-" * 80
print "CPU-GPU difference:"
print c_cpu - c_gpu.get()
print "L2 norm:", la.norm(c_cpu - c_gpu.get())
print "allclose(c_cpu, c_gpu.get()):", np.allclose(c_cpu, c_gpu.get())
