Matrix multiplication

This used to be in the PyOpenCL distribution, but was moved here for license concerns.

License of this example:

(unclear, likely non-free)

Date:

2013-09-16

PyOpenCL version:

2013.1

OpenCL implementations (and versions) tried:

all but Apple CPU

   1 # example provided by Eilif Muller
   2 
   3 from __future__ import division
   4 
   5 KERNEL_CODE = """
   6 
   7 // Thread block size
   8 #define BLOCK_SIZE %(block_size)d
   9 
  10 // Matrix dimensions
  11 // (chosen as multiples of the thread block size for simplicity)
  12 #define WA %(w_a)d // Matrix A width
  13 #define HA %(h_a)d // Matrix A height
  14 #define WB %(w_b)d // Matrix B width
  15 #define HB WA  // Matrix B height
  16 #define WC WB  // Matrix C width
  17 #define HC HA  // Matrix C height
  18 
  19 
  20 /*
  21  * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
  22  *
  23  * NVIDIA Corporation and its licensors retain all intellectual property and
  24  * proprietary rights in and to this software and related documentation.
  25  * Any use, reproduction, disclosure, or distribution of this software
  26  * and related documentation without an express license agreement from
  27  * NVIDIA Corporation is strictly prohibited.
  28  *
  29  * Please refer to the applicable NVIDIA end user license agreement (EULA)
  30  * associated with this source code for terms and conditions that govern
  31  * your use of this NVIDIA software.
  32  *
  33  */
  34 
  35 /* Matrix multiplication: C = A * B.
  36  * Device code.
  37  */
  38 
  39 #define AS(j, i) As[i + j * BLOCK_SIZE]
  40 #define BS(j, i) Bs[i + j * BLOCK_SIZE]
  41 
  42 ////////////////////////////////////////////////////////////////////////////////
  43 //! Matrix multiplication on the device: C = A * B
  44 //! WA is A's width and WB is B's width
  45 ////////////////////////////////////////////////////////////////////////////////
  46 __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1))) 
  47 void
  48 matrixMul( __global float* C, __global float* A, __global float* B)
  49 {
  50     __local float As[BLOCK_SIZE*BLOCK_SIZE];
  51     __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
  52 
  53     // Block index
  54     int bx = get_group_id(0);
  55     int by = get_group_id(1);
  56 
  57     // Thread index
  58     int tx = get_local_id(0);
  59     int ty = get_local_id(1);
  60 
  61     // Index of the first sub-matrix of A processed by the block
  62     int aBegin = WA * BLOCK_SIZE * by;
  63 
  64     // Index of the last sub-matrix of A processed by the block
  65     int aEnd   = aBegin + WA - 1;
  66 
  67     // Step size used to iterate through the sub-matrices of A
  68     int aStep  = BLOCK_SIZE;
  69 
  70     // Index of the first sub-matrix of B processed by the block
  71     int bBegin = BLOCK_SIZE * bx;
  72 
  73     // Step size used to iterate through the sub-matrices of B
  74     int bStep  = BLOCK_SIZE * WB;
  75 
  76     // Csub is used to store the element of the block sub-matrix
  77     // that is computed by the thread
  78     float Csub = 0.0f;
  79 
  80     // Loop over all the sub-matrices of A and B
  81     // required to compute the block sub-matrix
  82     for (int a = aBegin, b = bBegin;
  83              a <= aEnd;
  84              a += aStep, b += bStep) {
  85 
  86         // Load the matrices from device memory
  87         // to shared memory; each thread loads
  88         // one element of each matrix
  89         AS(ty, tx) = A[a + WA * ty + tx];
  90         BS(ty, tx) = B[b + WB * ty + tx];
  91 
  92         // Synchronize to make sure the matrices are loaded
  93         barrier(CLK_LOCAL_MEM_FENCE);
  94 
  95         // Multiply the two matrices together;
  96         // each thread computes one element
  97         // of the block sub-matrix
  98         for (int k = 0; k < BLOCK_SIZE; ++k)
  99             Csub += AS(ty, k) * BS(k, tx);
 100 
 101         // Synchronize to make sure that the preceding
 102         // computation is done before loading two new
 103         // sub-matrices of A and B in the next iteration
 104         barrier(CLK_LOCAL_MEM_FENCE);
 105     }
 106 
 107     // Write the block sub-matrix to device memory;
 108     // each thread writes one element
 109     C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
 110 
 111 }
 112 
 113 """
 114 
 115 import pyopencl as cl
 116 from time import time
 117 import numpy
 118 
 119 block_size = 16
 120 
 121 ctx = cl.create_some_context()
 122 
 123 for dev in ctx.devices:
 124     assert dev.local_mem_size > 0
 125 
 126 queue = cl.CommandQueue(ctx,
 127         properties=cl.command_queue_properties.PROFILING_ENABLE)
 128 
 129 #queue = cl.CommandQueue(ctx)
 130 
 131 if False:
 132     a_height = 4096
 133     #a_height = 1024
 134     a_width = 2048
 135     #a_width = 256
 136     #b_height == a_width
 137     b_width = a_height
 138 
 139 elif False:
 140     # like PyCUDA
 141     a_height = 2516
 142     a_width = 1472
 143     b_height = a_width
 144     b_width = 2144
 145 
 146 else:
 147     # CL SDK
 148     a_width = 50*block_size
 149     a_height = 100*block_size
 150     b_width = 50*block_size
 151     b_height = a_width
 152 
 153 c_width = b_width
 154 c_height = a_height
 155 
 156 h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
 157 h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32)
 158 h_c = numpy.empty((c_height, c_width)).astype(numpy.float32)
 159 
 160 
 161 kernel_params = {"block_size": block_size,
 162         "w_a":a_width, "h_a":a_height, "w_b":b_width}
 163 
 164 if "NVIDIA" in queue.device.vendor:
 165     options = "-cl-mad-enable -cl-fast-relaxed-math"
 166 else:
 167     options = ""
 168 prg = cl.Program(ctx, KERNEL_CODE % kernel_params,
 169         ).build(options=options)
 170 kernel = prg.matrixMul
 171 #print prg.binaries[0]
 172 
 173 assert a_width % block_size == 0
 174 assert a_height % block_size == 0
 175 assert b_width % block_size == 0
 176 
 177 # transfer host -> device -----------------------------------------------------
 178 mf = cl.mem_flags
 179 
 180 t1 = time()
 181 
 182 d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a)
 183 d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b)
 184 d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes)
 185 
 186 push_time = time()-t1
 187 
 188 # warmup ----------------------------------------------------------------------
 189 for i in range(5):
 190     event = kernel(queue, h_c.shape[::-1], (block_size, block_size), 
 191             d_c_buf, d_a_buf, d_b_buf)
 192     event.wait()
 193 
 194 queue.finish()
 195 
 196 # actual benchmark ------------------------------------------------------------
 197 t1 = time()
 198 
 199 count = 20
 200 for i in range(count):
 201     event = kernel(queue, h_c.shape[::-1], (block_size, block_size),
 202             d_c_buf, d_a_buf, d_b_buf)
 203 
 204 event.wait()
 205 
 206 gpu_time = (time()-t1)/count
 207 
 208 # transfer device -> host -----------------------------------------------------
 209 t1 = time()
 210 cl.enqueue_copy(queue, h_c, d_c_buf)
 211 pull_time = time()-t1
 212 
 213 # timing output ---------------------------------------------------------------
 214 gpu_total_time = gpu_time+push_time+pull_time
 215 
 216 print "GPU push+compute+pull total [s]:", gpu_total_time
 217 print "GPU push [s]:", push_time
 218 print "GPU pull [s]:", pull_time
 219 print "GPU compute (host-timed) [s]:", gpu_time
 220 print "GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9
 221 
 222 gflop = h_c.size * (a_width * 2.) / (1000**3.)
 223 gflops = gflop / gpu_time
 224 
 225 print
 226 print "GFlops/s:", gflops
 227 
 228 # cpu comparison --------------------------------------------------------------
 229 t1 = time()
 230 h_c_cpu = numpy.dot(h_a,h_b)
 231 cpu_time = time()-t1
 232 
 233 print
 234 print "GPU==CPU:",numpy.allclose(h_c, h_c_cpu)
 235 print
 236 print "CPU time (s)", cpu_time
 237 print
 238 
 239 print "GPU speedup (with transfer): ", cpu_time/gpu_total_time
 240 print "GPU speedup (without transfer): ", cpu_time/gpu_time

PyOpenCL/Examples/MatrixMultiply (last edited 2013-09-16 02:38:07 by AndreasKloeckner)