Auto-tuned matrix multiplication via Cheetah template
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import division
5
6 """
7 PyCuda Optimized Matrix Multiplication
8 Template Meta-programming Example using Cheetah
9 (modified from SciPy09 Advanced Tutorial)
10 """
11
12 # ------------------------------------------------------------------------------
13 import numpy as np
14 from pycuda import driver, compiler, gpuarray, tools
15 from Cheetah.Template import Template
16
17 import pycuda.autoinit
18
19 # -- default parameters
20 DEFAULT_BLOCK_SIZE = 16
21 DEFAULT_WORK_SIZE = 1
22 DEFAULT_UNROLL = 0
23 DEFAULT_SPILL = False
24 DEFAULT_PREFETCH = False
25
26 from os import path
27 MYPATH = path.dirname(path.abspath(__file__))
28 TEMPLATE_FILENAME = path.join(MYPATH, "demo_meta_matrixmul_cheetah.template.cu")
29
30 # ------------------------------------------------------------------------------
31 def matrixmul_opt(mat_a, mat_b,
32 block_size = DEFAULT_BLOCK_SIZE,
33 work_size = DEFAULT_WORK_SIZE,
34 unroll = DEFAULT_UNROLL,
35 spill = DEFAULT_SPILL,
36 prefetch = DEFAULT_PREFETCH):
37
38 ah, aw = mat_a.shape
39 bh, bw = mat_b.shape
40
41 assert aw == bh
42
43 # -- pad input matrices appropriately
44 ah_padded = int(np.ceil(ah/block_size)) * block_size
45 aw_padded = int(np.ceil(aw/block_size)) * (block_size*work_size)
46 mat_a_padded = np.zeros((ah_padded, aw_padded), np.float32)
47 mat_a_padded[:ah,:aw] = mat_a
48
49 bh_padded = aw_padded
50 bw_padded = int(np.ceil(bw/(block_size*work_size))) * (block_size*work_size)
51 mat_b_padded = np.zeros((bh_padded, bw_padded), np.float32)
52 mat_b_padded[:bh, :bw] = mat_b
53
54 ch_padded = ah_padded
55 cw_padded = bw_padded
56
57 # -- upload padded input matrices to the GPU
58 mat_a_gpu = gpuarray.to_gpu(mat_a_padded)
59 mat_b_gpu = gpuarray.to_gpu(mat_b_padded)
60
61 # -- create empty container matrix for the result (C = A * B)
62 mat_c_gpu = gpuarray.zeros((ch_padded, cw_padded), np.float32)
63
64 # -- generate and compile the code
65 # prepare the template parameters
66 template_params = {
67 'BLOCK_SIZE': block_size,
68 'WORK_SIZE': work_size,
69 'UNROLL': unroll,
70 'SPILL': spill,
71 'PREFETCH': prefetch,
72 'A_WIDTH': aw_padded,
73 'A_HEIGHT': ah_padded,
74 'B_WIDTH': bw_padded,
75 }
76
77 # run the template engine to get the code
78 kernel_code = Template(
79 file = TEMPLATE_FILENAME,
80 searchList = [template_params],
81 )
82
83 # compile the code
84 module = compiler.SourceModule(kernel_code)
85
86 # get the kernel from the module
87 matrixmul_func = module.get_function("matrixMul")
88
89 # some info about the module
90 print "number of registers used:", matrixmul_func.num_regs
91
92 # block of threads
93 # ATTENTION: block is (threadDim.x, threadDim.y, threadDim.z)
94 # and not (threadDim.z, threadDim.y, threadDim.x)
95 block = block_size, block_size, 1
96
97 # grid of blocks
98 # ATTENTION: it's (blockDim.x, blockDim.y)
99 # and not (blockDim.y, blockDim.x)
100 grid = int(cw_padded/block_size/work_size), int(ch_padded/block_size)
101
102 # -- call the kernel on the GPU
103 # Note that when we use time_kernel=True pycuda will automatically synchronize the kernel
104 # to make sure that the timing is correct. If you time the code yourself, you'll have to
105 # synchronize the current Context.
106 gpu_time = matrixmul_func(
107 # -- output
108 mat_c_gpu,
109 # -- inputs
110 mat_a_gpu, mat_b_gpu,
111 # -- grid of blocks
112 grid = grid,
113 # -- block of threads
114 block = block,
115 # -- time the kernel (approx.)
116 time_kernel = True,
117 )
118
119 # get the GPU matrix back to CPU memory
120 mat_c_padded = mat_c_gpu.get()
121 mat_c = mat_c_padded[:ah, :bw]
122
123 return mat_c, gpu_time
124
125 # ------------------------------------------------------------------------------
126 if __name__ == "__main__":
127
128 # matrix sizes
129 a_height = 1024
130 a_width = 1024
131 b_height = a_width
132 b_width = 1024
133
134 # create random square matrices
135 np.random.seed(0)
136 mat_a = np.random.randn(a_height, a_width).astype(np.float32)
137 mat_b = np.random.randn(b_height, b_width).astype(np.float32)
138
139 # compute reference on the cpu to verify GPU computation
140 mat_ref = np.dot(mat_a, mat_b)
141
142 # -- this is a good place to auto-tune the code (using the optimization kwargs)
143 # (note that you may need more that one iteration to get accurate timing estimates)
144 mat_c, gpu_time = matrixmul_opt(mat_a, mat_b)
145
146 # check for correctness
147 diff = mat_c - mat_ref
148 error = np.absolute(diff).max()
149 assert error <= 1e-2
150 l2norm = np.linalg.norm(diff)
151 print "l2norm: ", l2norm
152
153 # print some stats
154 print "gpu time:", gpu_time
155 gflop = mat_c.size * (a_width * 2.) / (1000**3.)
156 gflops = gflop / gpu_time
157 print "gflops:", gflops
Also needs this file: demo_meta_matrixmul_cheetah.template.cu
