Kernel Concurrency

Demonstrates concurrent execution of two kernels using PyCUDA. To "prove" that both kernels really execute at the same time, comment out line 63 of the listing below (the second `for k in range(n)` loop header, which records the end events in a separate pass). This breaks concurrency, and the runtime roughly doubles.

Jesse Lu, jesselu@stanford.edu

License of this example:

Public Domain

Date:

2011-04-07

PyCUDA version:

2011.01

   1 #! /usr/bin/env python
   2 # A simple program to illustrate kernel concurrency with PyCuda.
   3 # Reference: Chapter 3.2.6.5 in Cuda C Programming Guide Version 3.2.
   4 # Jesse Lu, 2011-04-04
   5 
   6 import numpy as np
   7 import pycuda.autoinit
   8 import pycuda.driver as drv
   9 from pycuda.compiler import SourceModule
  10 
  11 #
  12 # Set up test scenario.
  13 # 
  14 
  15 # Create a simple test kernel.
  16 mod = SourceModule("""
  17 __global__ void my_kernel(float *d) {
  18     const int i = threadIdx.x;
  19     for (int m=0; m<100; m++) {
  20         for (int k=0; k<100 ; k++)
  21             d[i] = d[i] * 2.0;
  22         for (int k=0; k<100 ; k++)
  23             d[i] = d[i] / 2.0;
  24     }
  25     d[i] = d[i] * 2.0;
  26 }
  27 """)
  28 my_kernel = mod.get_function("my_kernel")
  29 
  30 # Create the test data on the host.
  31 N = 400 # Size of datasets.
  32 n = 2 # Number of datasets (and concurrent operations) used.
  33 data, data_check, d_data = [], [], []
  34 for k in range(n):
  35     data.append(np.random.randn(N).astype(np.float32)) # Create random data.
  36     data_check.append(data[k].copy()) # For checking the result afterwards. 
  37     d_data.append(drv.mem_alloc(data[k].nbytes)) # Allocate memory on device.
  38 
  39 #
  40 # Start concurrency test.
  41 #
  42 
  43 # Use this event as a reference point.
  44 ref = drv.Event()
  45 ref.record()
  46 
  47 # Create the streams and events needed.
  48 stream, event = [], []
  49 marker_names = ['kernel_begin', 'kernel_end']
  50 for k in range(n):
  51     stream.append(drv.Stream())
  52     event.append(dict([(marker_names[l], drv.Event()) for l in range(len(marker_names))]))
  53 
  54 # Transfer to device.
  55 for k in range(n):
  56     drv.memcpy_htod(d_data[k], data[k]) 
  57 
  58 # Run kernels many times, we will only keep data from last loop iteration.
  59 for j in range(10):
  60     for k in range(n):
  61         event[k]['kernel_begin'].record(stream[k])
  62         my_kernel(d_data[k], block=(N,1,1), stream=stream[k]) 
  63     for k in range(n): # Commenting out this line should break concurrency.
  64         event[k]['kernel_end'].record(stream[k])
  65 
  66 # Transfer data back to host.
  67 for k in range(n):
  68     drv.memcpy_dtoh(data[k], d_data[k]) 
  69 
  70 # 
  71 # Output results.
  72 #
  73 
# Report the device's capabilities; CONCURRENT_KERNELS must be True for
# this demo to show any overlap at all.
print '\n=== Device attributes'
dev = pycuda.autoinit.device
print 'Name:', dev.name()
print 'Compute capability:', dev.compute_capability()
print 'Concurrent Kernels:', \
    bool(dev.get_attribute(drv.device_attribute.CONCURRENT_KERNELS))

# Verify the results: each kernel launch multiplies the data by exactly 2,
# and j leaks out of the launch loop above (j == 9 after 10 iterations),
# so the expected result is the original data scaled by 2**(j+1) == 2**10.
print '\n=== Checking answers'
for k in range(n):
    print 'Dataset', k, ':',
    if (np.linalg.norm((data_check[k] * 2**(j+1)) - data[k]) == 0.0):
        print 'passed.'
    else:
        print 'FAILED!'

# Print each stream's begin/end marker times relative to the ref event.
# Overlapping [begin, end] intervals across datasets indicate that the
# kernels ran concurrently.
print '\n=== Timing info (for last set of kernel launches)'
for k in range(n):
    print 'Dataset', k 
    for l in range(len(marker_names)):
        print marker_names[l], ':', ref.time_till(event[k][marker_names[l]])