Auto-tuned matrix multiplication via Cheetah template
#!python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
"""
PyCuda Optimized Matrix Multiplication
Template Meta-programming Example using Cheetah
(modified from SciPy09 Advanced Tutorial)
"""
# ------------------------------------------------------------------------------
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
from Cheetah.Template import Template
import pycuda.autoinit
# -- default parameters
DEFAULT_BLOCK_SIZE = 16
DEFAULT_WORK_SIZE = 1
DEFAULT_UNROLL = 0
DEFAULT_SPILL = False
DEFAULT_PREFETCH = False
from os import path
MYPATH = path.dirname(path.abspath(__file__))
TEMPLATE_FILENAME = path.join(MYPATH, "demo_meta_matrixmul_cheetah.template.cu")
# ------------------------------------------------------------------------------
def matrixmul_opt(mat_a, mat_b,
block_size = DEFAULT_BLOCK_SIZE,
work_size = DEFAULT_WORK_SIZE,
unroll = DEFAULT_UNROLL,
spill = DEFAULT_SPILL,
prefetch = DEFAULT_PREFETCH):
ah, aw = mat_a.shape
bh, bw = mat_b.shape
assert aw == bh
# -- pad input matrices appropriately
ah_padded = int(np.ceil(ah/block_size)) * block_size
aw_padded = int(np.ceil(aw/block_size)) * (block_size*work_size)
mat_a_padded = np.zeros((ah_padded, aw_padded), np.float32)
mat_a_padded[:ah,:aw] = mat_a
bh_padded = aw_padded
bw_padded = int(np.ceil(bw/(block_size*work_size))) * (block_size*work_size)
mat_b_padded = np.zeros((bh_padded, bw_padded), np.float32)
mat_b_padded[:bh, :bw] = mat_b
ch_padded = ah_padded
cw_padded = bw_padded
# -- upload padded input matrices to the GPU
mat_a_gpu = gpuarray.to_gpu(mat_a_padded)
mat_b_gpu = gpuarray.to_gpu(mat_b_padded)
# -- create empty container matrix for the result (C = A * B)
mat_c_gpu = gpuarray.zeros((ch_padded, cw_padded), np.float32)
# -- generate and compile the code
# prepare the template parameters
template_params = {
'BLOCK_SIZE': block_size,
'WORK_SIZE': work_size,
'UNROLL': unroll,
'SPILL': spill,
'PREFETCH': prefetch,
'A_WIDTH': aw_padded,
'A_HEIGHT': ah_padded,
'B_WIDTH': bw_padded,
}
# run the template engine to get the code
kernel_code = Template(
file = TEMPLATE_FILENAME,
searchList = [template_params],
)
# compile the code
module = compiler.SourceModule(kernel_code)
# get the kernel from the module
matrixmul_func = module.get_function("matrixMul")
# some info about the module
print "number of registers used:", matrixmul_func.num_regs
# block of threads
# ATTENTION: block is (threadDim.x, threadDim.y, threadDim.z)
# and not (threadDim.z, threadDim.y, threadDim.x)
block = block_size, block_size, 1
# grid of blocks
# ATTENTION: it's (blockDim.x, blockDim.y)
# and not (blockDim.y, blockDim.x)
grid = int(cw_padded/block_size/work_size), int(ch_padded/block_size)
# -- call the kernel on the GPU
# Note that when we use time_kernel=True pycuda will automatically synchronize the kernel
# to make sure that the timing is correct. If you time the code yourself, you'll have to
# synchronize the current Context.
gpu_time = matrixmul_func(
# -- output
mat_c_gpu,
# -- inputs
mat_a_gpu, mat_b_gpu,
# -- grid of blocks
grid = grid,
# -- block of threads
block = block,
# -- time the kernel (approx.)
time_kernel = True,
)
# get the GPU matrix back to CPU memory
mat_c_padded = mat_c_gpu.get()
mat_c = mat_c_padded[:ah, :bw]
return mat_c, gpu_time
# ------------------------------------------------------------------------------
if __name__ == "__main__":
# matrix sizes
a_height = 1024
a_width = 1024
b_height = a_width
b_width = 1024
# create random square matrices
np.random.seed(0)
mat_a = np.random.randn(a_height, a_width).astype(np.float32)
mat_b = np.random.randn(b_height, b_width).astype(np.float32)
# compute reference on the cpu to verify GPU computation
mat_ref = np.dot(mat_a, mat_b)
# -- this is a good place to auto-tune the code (using the optimization kwargs)
# (note that you may need more that one iteration to get accurate timing estimates)
mat_c, gpu_time = matrixmul_opt(mat_a, mat_b)
# check for correctness
diff = mat_c - mat_ref
error = np.absolute(diff).max()
assert error <= 1e-2
l2norm = np.linalg.norm(diff)
print "l2norm: ", l2norm
# print some stats
print "gpu time:", gpu_time
gflop = mat_c.size * (a_width * 2.) / (1000**3.)
gflops = gflop / gpu_time
print "gflops:", gflops
Also needs this file: demo_meta_matrixmul_cheetah.template.cu