Auto-tuned matrix multiplication via Cheetah template

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
PyCuda Optimized Matrix Multiplication
Template Meta-programming Example using Cheetah
(modified from SciPy09 Advanced Tutorial)
"""

from __future__ import division, print_function

# ------------------------------------------------------------------------------
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
from Cheetah.Template import Template

import pycuda.autoinit

# -- default parameters
DEFAULT_BLOCK_SIZE = 16
DEFAULT_WORK_SIZE = 1
DEFAULT_UNROLL = 0
DEFAULT_SPILL = False
DEFAULT_PREFETCH = False

from os import path
MYPATH = path.dirname(path.abspath(__file__))
TEMPLATE_FILENAME = path.join(MYPATH, "demo_meta_matrixmul_cheetah.template.cu")

# ------------------------------------------------------------------------------
def matrixmul_opt(mat_a, mat_b,
                  block_size=DEFAULT_BLOCK_SIZE,
                  work_size=DEFAULT_WORK_SIZE,
                  unroll=DEFAULT_UNROLL,
                  spill=DEFAULT_SPILL,
                  prefetch=DEFAULT_PREFETCH):

    ah, aw = mat_a.shape
    bh, bw = mat_b.shape

    assert aw == bh

    # -- pad input matrices appropriately
    ah_padded = int(np.ceil(ah / block_size)) * block_size
    aw_padded = int(np.ceil(aw / block_size)) * (block_size * work_size)
    mat_a_padded = np.zeros((ah_padded, aw_padded), np.float32)
    mat_a_padded[:ah, :aw] = mat_a

    bh_padded = aw_padded
    bw_padded = int(np.ceil(bw / (block_size * work_size))) * (block_size * work_size)
    mat_b_padded = np.zeros((bh_padded, bw_padded), np.float32)
    mat_b_padded[:bh, :bw] = mat_b

    ch_padded = ah_padded
    cw_padded = bw_padded
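    # (zero padding is safe: the extra zero columns of A line up with the
    #  extra zero rows of B, so they contribute nothing to the product)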

    # -- upload padded input matrices to the GPU
    mat_a_gpu = gpuarray.to_gpu(mat_a_padded)
    mat_b_gpu = gpuarray.to_gpu(mat_b_padded)

    # -- create empty container matrix for the result (C = A * B)
    mat_c_gpu = gpuarray.zeros((ch_padded, cw_padded), np.float32)

    # -- generate and compile the code
    # prepare the template parameters
    template_params = {
        'BLOCK_SIZE': block_size,
        'WORK_SIZE': work_size,
        'UNROLL': unroll,
        'SPILL': spill,
        'PREFETCH': prefetch,
        'A_WIDTH': aw_padded,
        'A_HEIGHT': ah_padded,
        'B_WIDTH': bw_padded,
        }
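    # (Cheetah exposes each key of this dict as a $PLACEHOLDER inside the
    #  template source, e.g. $BLOCK_SIZE and $A_WIDTH)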

    # run the template engine to get the code
    kernel_code = Template(
        file=TEMPLATE_FILENAME,
        searchList=[template_params],
        )

    # compile the code (str() renders the Cheetah template to CUDA source)
    module = compiler.SourceModule(str(kernel_code))

    # get the kernel from the module
    matrixmul_func = module.get_function("matrixMul")

    # some info about the module
    print("number of registers used:", matrixmul_func.num_regs)

    # block of threads
    # ATTENTION: block is (blockDim.x, blockDim.y, blockDim.z)
    #            and not (blockDim.z, blockDim.y, blockDim.x)
    block = block_size, block_size, 1

    # grid of blocks
    # ATTENTION: it's (gridDim.x, gridDim.y)
    #            and not (gridDim.y, gridDim.x)
    grid = int(cw_padded / block_size / work_size), int(ch_padded / block_size)
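    # (for example, with 1024x1024 inputs and the default block_size=16,
    #  work_size=1, this yields block = (16, 16, 1) and grid = (64, 64))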

    # -- call the kernel on the GPU
    # Note that with time_kernel=True pycuda will automatically synchronize
    # to make sure that the timing is correct. If you time the code yourself,
    # you'll have to synchronize the current Context.
    gpu_time = matrixmul_func(
        # -- output
        mat_c_gpu,
        # -- inputs
        mat_a_gpu, mat_b_gpu,
        # -- grid of blocks
        grid=grid,
        # -- block of threads
        block=block,
        # -- time the kernel (approx.)
        time_kernel=True,
        )
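
    # -- a sketch of timing the kernel manually with CUDA events instead of
    #    time_kernel=True (illustrative; Event.time_till() reports milliseconds):
    #
    #    start, end = driver.Event(), driver.Event()
    #    start.record()
    #    matrixmul_func(mat_c_gpu, mat_a_gpu, mat_b_gpu, grid=grid, block=block)
    #    end.record()
    #    end.synchronize()
    #    gpu_time = start.time_till(end) * 1e-3  # seconds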

    # get the GPU matrix back to CPU memory
    mat_c_padded = mat_c_gpu.get()
    mat_c = mat_c_padded[:ah, :bw]

    return mat_c, gpu_time

# ------------------------------------------------------------------------------
if __name__ == "__main__":

    # matrix sizes
    a_height = 1024
    a_width = 1024
    b_height = a_width
    b_width = 1024

    # create random square matrices
    np.random.seed(0)
    mat_a = np.random.randn(a_height, a_width).astype(np.float32)
    mat_b = np.random.randn(b_height, b_width).astype(np.float32)

    # compute the reference result on the CPU to verify the GPU computation
    mat_ref = np.dot(mat_a, mat_b)

    # -- this is a good place to auto-tune the code (using the optimization kwargs)
    # (note that you may need more than one iteration to get accurate timing estimates)
    mat_c, gpu_time = matrixmul_opt(mat_a, mat_b)
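
    # e.g. a minimal auto-tuning sketch over the optimization kwargs (the
    # candidate values are illustrative, not tuned recommendations; average
    # several runs per configuration for stable timings):
    #
    #    timings = {}
    #    for bs in (8, 16, 32):
    #        for ws in (1, 2, 4):
    #            _, t = matrixmul_opt(mat_a, mat_b, block_size=bs, work_size=ws)
    #            timings[bs, ws] = t
    #    best_bs, best_ws = min(timings, key=timings.get)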

    # check for correctness
    diff = mat_c - mat_ref
    error = np.absolute(diff).max()
    assert error <= 1e-2
    l2norm = np.linalg.norm(diff)
    print("l2norm:", l2norm)

    # print some stats
    print("gpu time:", gpu_time)
    gflop = mat_c.size * (a_width * 2.) / (1000**3.)
    gflops = gflop / gpu_time
    print("gflops:", gflops)

This example also requires the Cheetah template file demo_meta_matrixmul_cheetah.template.cu (the CUDA kernel source containing the Cheetah placeholders), placed in the same directory as the script.
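
The template file itself is not reproduced here. For orientation only, a heavily simplified sketch of the kind of Cheetah-templated CUDA source it contains might look like the following; this hypothetical version only substitutes BLOCK_SIZE, A_WIDTH, and B_WIDTH, assumes WORK_SIZE is 1, and ignores the UNROLL, SPILL, and PREFETCH knobs that the real template uses to generate its tuned variants:

    ## hypothetical, simplified template -- NOT the real template file
    #set $BS = $BLOCK_SIZE

    __global__ void matrixMul(float *C, float *A, float *B)
    {
        // each thread block computes one $BS x $BS tile of C; the Python
        // wrapper pads every dimension to a multiple of $BS, so no bounds
        // checks are needed here
        __shared__ float As[$BS][$BS];
        __shared__ float Bs[$BS][$BS];

        int tx = threadIdx.x, ty = threadIdx.y;
        int row = blockIdx.y * $BS + ty;
        int col = blockIdx.x * $BS + tx;

        float sub = 0.0f;
        for (int tile = 0; tile < ($A_WIDTH / $BS); ++tile) {
            // stage one tile of A and one tile of B in shared memory
            As[ty][tx] = A[row * $A_WIDTH + tile * $BS + tx];
            Bs[ty][tx] = B[(tile * $BS + ty) * $B_WIDTH + col];
            __syncthreads();
            for (int k = 0; k < $BS; ++k)
                sub += As[ty][k] * Bs[k][tx];
            __syncthreads();
        }
        C[row * $B_WIDTH + col] = sub;
    }

Cheetah fills in the $-placeholders from the template_params dictionary above, so each parameter configuration compiles to a distinct, fully specialized kernel.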
