1
2
3
4 """
5 Multiples two square matrices together using multiple blocks and shared memory.
6 Each thread block is assigned a "tile" of the resulting matrix and is responsible
7 for generating the elements in that tile. Each thread in a block computes one element
8 of the tile.
9 """
10
11 import numpy as np
12 from numpy import linalg as la
13 from pycuda import driver, compiler, gpuarray, tools
14
15
16 import pycuda.autoinit
17
18 kernel_code_template = """
19 __global__ void MatrixMulKernel(float *A, float *B, float *C)
20 {
21
22 const uint wA = %(MATRIX_SIZE)s;
23 const uint wB = %(MATRIX_SIZE)s;
24
25 // Block index
26 const uint bx = blockIdx.x;
27 const uint by = blockIdx.y;
28
29 // Thread index
30 const uint tx = threadIdx.x;
31 const uint ty = threadIdx.y;
32
33 // Index of the first sub-matrix of A processed by the block
34 const uint aBegin = wA * %(BLOCK_SIZE)s * by;
35 // Index of the last sub-matrix of A processed by the block
36 const uint aEnd = aBegin + wA - 1;
37 // Step size used to iterate through the sub-matrices of A
38 const uint aStep = %(BLOCK_SIZE)s;
39
40 // Index of the first sub-matrix of B processed by the block
41 const uint bBegin = %(BLOCK_SIZE)s * bx;
42 // Step size used to iterate through the sub-matrices of B
43 const uint bStep = %(BLOCK_SIZE)s * wB;
44
45 // The element of the block sub-matrix that is computed
46 // by the thread
47 float Csub = 0;
48 // Loop over all the sub-matrices of A and B required to
49 // compute the block sub-matrix
50 for (int a = aBegin, b = bBegin;
51 a <= aEnd;
52 a += aStep, b += bStep)
53 {
54 // Shared memory for the sub-matrix of A
55 __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
56 // Shared memory for the sub-matrix of B
57 __shared__ float Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
58
59 // Load the matrices from global memory to shared memory;
60 // each thread loads one element of each matrix
61 As[ty][tx] = A[a + wA * ty + tx];
62 Bs[ty][tx] = B[b + wB * ty + tx];
63 // Synchronize to make sure the matrices are loaded
64 __syncthreads();
65
66 // Multiply the two matrices together;
67 // each thread computes one element
68 // of the block sub-matrix
69 for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
70 Csub += As[ty][k] * Bs[k][tx];
71
72 // Synchronize to make sure that the preceding
73 // computation is done before loading two new
74 // sub-matrices of A and B in the next iteration
75 __syncthreads();
76 }
77
78 // Write the block sub-matrix to global memory;
79 // each thread writes one element
80 const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
81 C[c + wB * ty + tx] = Csub;
82 }
83 """
84
85
86 MATRIX_SIZE = 4
87
88
89
90 TILE_SIZE = 2
91 BLOCK_SIZE = TILE_SIZE
92
93
94 a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
95 b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
96
97
98 c_cpu = np.dot(a_cpu, b_cpu)
99
100
101 a_gpu = gpuarray.to_gpu(a_cpu)
102 b_gpu = gpuarray.to_gpu(b_cpu)
103
104
105 c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
106
107
108
109 kernel_code = kernel_code_template % {
110 'MATRIX_SIZE': MATRIX_SIZE,
111 'BLOCK_SIZE': BLOCK_SIZE,
112 }
113
114
115 mod = compiler.SourceModule(kernel_code)
116
117
118 matrixmul = mod.get_function("MatrixMulKernel")
119
120
121 matrixmul(
122
123 a_gpu, b_gpu,
124
125 c_gpu,
126
127 grid = (MATRIX_SIZE / TILE_SIZE, MATRIX_SIZE / TILE_SIZE),
128
129 block = (TILE_SIZE, TILE_SIZE, 1),
130 )
131
132
133 print "-" * 80
134 print "Matrix A (GPU):"
135 print a_gpu.get()
136
137 print "-" * 80
138 print "Matrix B (GPU):"
139 print b_gpu.get()
140
141 print "-" * 80
142 print "Matrix C (GPU):"
143 print c_gpu.get()
144
145 print "-" * 80
146 print "CPU-GPU difference:"
147 print c_cpu - c_gpu.get()
148 print "L2 norm:", la.norm(c_cpu - c_gpu.get())
149 np.allclose(c_cpu, c_gpu.get())