#!/usr/bin/env python
# -*- coding: utf-8 -*-

   4 """ 
   5 Multiples two square matrices together using a *single* block of threads and 
   6 global memory only. Each thread computes one element of the resulting matrix.
   7 """

import numpy as np
from pycuda import driver, compiler, gpuarray, tools

# -- initialize the device
import pycuda.autoinit

kernel_code_template = """
__global__ void MatrixMulKernel(float *a, float *b, float *c)
{
    // 2D Thread ID (assuming that only *one* block will be executed)
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Pvalue is used to store the element of the matrix
    // that is computed by the thread
    float Pvalue = 0;

    // Each thread loads one row of A and one column of B,
    //   to produce one element of C.
    for (int k = 0; k < %(MATRIX_SIZE)s; ++k) {
        float Aelement = a[ty * %(MATRIX_SIZE)s + k];
        float Belement = b[k * %(MATRIX_SIZE)s + tx];
        Pvalue += Aelement * Belement;
    }

    // Write the result to device memory;
    // each thread writes one element
    c[ty * %(MATRIX_SIZE)s + tx] = Pvalue;
}
"""

# define the (square) matrix size
#  note that we'll only use *one* block of threads here
#  as a consequence this number (squared) can't exceed max_threads,
#  see http://documen.tician.de/pycuda/util.html#pycuda.tools.DeviceData
#  for more information on how to get this number for your device
MATRIX_SIZE = 2
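
# A minimal sketch of querying that limit at runtime via
# pycuda.tools.DeviceData (our addition; the original example only links
# to the documentation above):
dev_data = tools.DeviceData()
assert MATRIX_SIZE * MATRIX_SIZE <= dev_data.max_threads, \
    "a single block can't hold MATRIX_SIZE^2 threads on this device"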

# create two random square matrices
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)

# compute reference on the CPU to verify GPU computation
c_cpu = np.dot(a_cpu, b_cpu)
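
# For illustration (our addition, not in the original example): each GPU
# thread accumulates exactly this kind of dot product; here the kernel
# loop is mirrored in plain Python for element (0, 0):
element_00 = sum(a_cpu[0, k] * b_cpu[k, 0] for k in range(MATRIX_SIZE))
assert np.isclose(element_00, c_cpu[0, 0])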

# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# create empty gpu array for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
    'MATRIX_SIZE': MATRIX_SIZE
}

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")

# call the kernel on the card
matrixmul(
    # inputs
    a_gpu, b_gpu,
    # output
    c_gpu,
    # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads
    block=(MATRIX_SIZE, MATRIX_SIZE, 1),
)
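
# Optional (our addition): time the kernel with CUDA events -- a rough
# sketch; absolute timings will of course vary by device:
start, end = driver.Event(), driver.Event()
start.record()
matrixmul(a_gpu, b_gpu, c_gpu, block=(MATRIX_SIZE, MATRIX_SIZE, 1))
end.record()
end.synchronize()
print("Kernel time: %f ms" % start.time_till(end))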

# print the results
print("-" * 80)
print("Matrix A (GPU):")
print(a_gpu.get())

print("-" * 80)
print("Matrix B (GPU):")
print(b_gpu.get())

print("-" * 80)
print("Matrix C (GPU):")
print(c_gpu.get())

print("-" * 80)
print("CPU-GPU difference:")
print(c_cpu - c_gpu.get())

print("CPU and GPU results match:", np.allclose(c_cpu, c_gpu.get()))
