1
2
3
4
5
6 from __future__ import division
7
8 import pycuda.driver as cuda
9 import pycuda.gpuarray as gpuarray
10 import pycuda.autoinit
11 from pycuda.compiler import SourceModule
12
13 import numpy
14 import numpy.linalg as la
15
16 from pycuda.tools import context_dependent_memoize
17
18 block_size = 16
19
20 @context_dependent_memoize
21 def _get_transpose_kernel():
22 mod = SourceModule("""
23 #define BLOCK_SIZE %(block_size)d
24 #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
25 #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
26
27 __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
28 {
29 // Base indices in A and A_t
30 int base_idx_a = blockIdx.x * BLOCK_SIZE +
31 blockIdx.y * A_BLOCK_STRIDE;
32 int base_idx_a_t = blockIdx.y * BLOCK_SIZE +
33 blockIdx.x * A_T_BLOCK_STRIDE;
34
35 // Global indices in A and A_t
36 int glob_idx_a = base_idx_a + threadIdx.x + a_width * threadIdx.y;
37 int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;
38
39 __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];
40
41 // Store transposed submatrix to shared memory
42 A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
43
44 __syncthreads();
45
46 // Write transposed submatrix to global memory
47 A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
48 }
49 """% {"block_size": block_size})
50
51 func = mod.get_function("transpose")
52 func.prepare("PPii", block=(block_size, block_size, 1))
53
54 from pytools import Record
55 class TransposeKernelInfo(Record): pass
56
57 return TransposeKernelInfo(func=func,
58 block_size=block_size,
59 granularity=block_size)
60
61
62
63 def _get_big_block_transpose_kernel():
64 mod = SourceModule("""
65 #define BLOCK_SIZE %(block_size)d
66 #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
67 #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
68
69 __global__ void transpose(float *A, float *A_t, int a_width, int a_height)
70 {
71 // Base indices in A and A_t
72 int base_idx_a = 2 * blockIdx.x * BLOCK_SIZE +
73 2 * blockIdx.y * A_BLOCK_STRIDE;
74 int base_idx_a_t = 2 * blockIdx.y * BLOCK_SIZE +
75 2 * blockIdx.x * A_T_BLOCK_STRIDE;
76
77 // Global indices in A and A_t
78 int glob_idx_a = base_idx_a + threadIdx.x + a_width * threadIdx.y;
79 int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;
80
81 __shared__ float A_shared[2 * BLOCK_SIZE][2 * BLOCK_SIZE + 1];
82
83 // Store transposed submatrix to shared memory
84 A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
85 A_shared[threadIdx.y][threadIdx.x + BLOCK_SIZE] =
86 A[glob_idx_a + A_BLOCK_STRIDE];
87 A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x] =
88 A[glob_idx_a + BLOCK_SIZE];
89 A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x + BLOCK_SIZE] =
90 A[glob_idx_a + BLOCK_SIZE + A_BLOCK_STRIDE];
91
92 __syncthreads();
93
94 // Write transposed submatrix to global memory
95 A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
96 A_t[glob_idx_a_t + A_T_BLOCK_STRIDE] =
97 A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y];
98 A_t[glob_idx_a_t + BLOCK_SIZE] =
99 A_shared[threadIdx.x][threadIdx.y + BLOCK_SIZE];
100 A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] =
101 A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE];
102 }
103 """% {"block_size": block_size})
104
105 func = mod.get_function("transpose")
106 func.prepare("PPii", block=(block_size, block_size, 1))
107
108 from pytools import Record
109 class TransposeKernelInfo(Record): pass
110
111 return TransposeKernelInfo(func=func,
112 block_size=block_size,
113 granularity=2*block_size)
114
115
116
117
118 def _transpose(tgt, src):
119 krnl = _get_transpose_kernel()
120
121 w, h = src.shape
122 assert tgt.shape == (h, w)
123 assert w % krnl.granularity == 0
124 assert h % krnl.granularity == 0
125
126 krnl.func.prepared_call(
127 (w // krnl.granularity, h // krnl.granularity),
128 tgt.gpudata, src.gpudata, w, h)
129
130
131
132
133 def transpose(src):
134 w, h = src.shape
135
136 result = gpuarray.empty((h, w), dtype=src.dtype)
137 _transpose(result, src)
138 return result
139
140
141
142
143
144 def check_transpose():
145 from pycuda.curandom import rand
146
147 for i in numpy.arange(10, 13, 0.125):
148 size = int(((2**i) // 32) * 32)
149 print size
150
151 source = rand((size, size), dtype=numpy.float32)
152
153 result = transpose(source)
154
155 err = source.get().T - result.get()
156 err_norm = la.norm(err)
157
158 source.gpudata.free()
159 result.gpudata.free()
160
161 assert err_norm == 0, (size, err_norm)
162
163
164
165
166 def run_benchmark():
167 from pycuda.curandom import rand
168
169 sizes = []
170 bandwidths = []
171 times = []
172 for i in numpy.arange(10, 13, 2**(-6)):
173 size = int(((2**i) // 16) * 16)
174
175 source = rand((size, size), dtype=numpy.float32)
176 target = gpuarray.empty((size, size), dtype=source.dtype)
177
178 start = pycuda.driver.Event()
179 stop = pycuda.driver.Event()
180
181 warmup = 2
182
183 for i in range(warmup):
184 _transpose(target, source)
185
186 count = 10
187
188 cuda.Context.synchronize()
189 start.record()
190
191 for i in range(count):
192 _transpose(target, source)
193
194 stop.record()
195 stop.synchronize()
196
197 elapsed_seconds = stop.time_since(start)*1e-3
198 mem_bw = source.nbytes / elapsed_seconds * 2 * count
199
200 sizes.append(size)
201 bandwidths.append(mem_bw)
202 times.append(elapsed_seconds)
203
204 slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
205 print slow_sizes
206 print [s % 64 for s in slow_sizes]
207 from matplotlib.pyplot import semilogx, loglog, show, savefig, clf
208 semilogx(sizes, bandwidths)
209 savefig("transpose-bw.png")
210 clf()
211 loglog(sizes, times)
212 savefig("transpose-times.png")
213
214
215
216
217
218 run_benchmark()