PyCuda/Examples/MatrixTranspose

Matrix transposition on the GPU with PyCUDA: a shared-memory tiled kernel, a coarser-grained variant, a correctness check, and a memory-bandwidth benchmark.

```
# Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises

# Transposition of a matrix
# by Hendrik Riedmann <riedmann@dam.brown.edu>

from __future__ import division

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy
import numpy.linalg as la

from pycuda.tools import context_dependent_memoize

block_size = 16

@context_dependent_memoize
def _get_transpose_kernel():
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = blockIdx.x * BLOCK_SIZE +
                           blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = blockIdx.y * BLOCK_SIZE +
                           blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];
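        // (the extra padding column keeps the transposed read below
        // free of shared memory bank conflicts)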

        // Store transposed submatrix to shared memory
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];

        __syncthreads();

        // Write transposed submatrix to global memory
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
    }
    """% {"block_size": block_size})

    func = mod.get_function("transpose")
    func.prepare("PPii", block=(block_size, block_size, 1))

    from pytools import Record
    class TransposeKernelInfo(Record): pass

    return TransposeKernelInfo(func=func,
            block_size=block_size,
            granularity=block_size)

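# Variant kernel: each block transposes a (2*BLOCK_SIZE) x (2*BLOCK_SIZE)
# tile, with every thread moving four elements, hence twice the granularity.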
@context_dependent_memoize
def _get_big_block_transpose_kernel():
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = 2 * blockIdx.x * BLOCK_SIZE +
                           2 * blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = 2 * blockIdx.y * BLOCK_SIZE +
                           2 * blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        __shared__ float A_shared[2 * BLOCK_SIZE][2 * BLOCK_SIZE + 1];
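        // (padded by one column, as above, to avoid bank conflicts)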

        // Store transposed submatrix to shared memory
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
        A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x] =
                A[glob_idx_a + A_BLOCK_STRIDE];
        A_shared[threadIdx.y][threadIdx.x + BLOCK_SIZE] =
                A[glob_idx_a + BLOCK_SIZE];
        A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x + BLOCK_SIZE] =
                A[glob_idx_a + BLOCK_SIZE + A_BLOCK_STRIDE];

        __syncthreads();

        // Write transposed submatrix to global memory
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
        A_t[glob_idx_a_t + A_T_BLOCK_STRIDE] =
                A_shared[threadIdx.x][threadIdx.y + BLOCK_SIZE];
        A_t[glob_idx_a_t + BLOCK_SIZE] =
                A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y];
        A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] =
                A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE];
    }
    """% {"block_size": block_size})

    func = mod.get_function("transpose")
    func.prepare("PPii", block=(block_size, block_size, 1))

    from pytools import Record
    class TransposeKernelInfo(Record): pass

    return TransposeKernelInfo(func=func,
            block_size=block_size,
            granularity=2*block_size)


def _transpose(tgt, src):
    krnl = _get_transpose_kernel()

    # numpy shapes are (rows, cols), i.e. (a_height, a_width)
    h, w = src.shape
    assert tgt.shape == (w, h)
    assert w % krnl.granularity == 0
    assert h % krnl.granularity == 0

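    # launch one thread block per granularity x granularity tile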
    krnl.func.prepared_call(
                    (w // krnl.granularity, h // krnl.granularity),
                    tgt.gpudata, src.gpudata, w, h)


def transpose(src):
    h, w = src.shape

    result = gpuarray.empty((w, h), dtype=src.dtype)
    _transpose(result, src)
    return result


def check_transpose():
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        print size

        source = rand((size, size), dtype=numpy.float32)

        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        source.gpudata.free()
        result.gpudata.free()

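        # transposition only moves values, so the result must match bitwise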
        assert err_norm == 0, (size, err_norm)


def run_benchmark():
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []
    for i in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**i) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = cuda.Event()
        stop = cuda.Event()

        warmup = 2

        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start)*1e-3
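        # each transpose reads and writes the whole matrix once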
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print slow_sizes
    print [s % 64 for s in slow_sizes]
    from matplotlib.pyplot import semilogx, loglog, show, savefig, clf
    semilogx(sizes, bandwidths)
    savefig("transpose-bw.png")
    clf()
    loglog(sizes, times)
    savefig("transpose-times.png")


#check_transpose()
run_benchmark()
```
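
To see why the kernel's index arithmetic yields a transpose, the tile logic can be replayed on the CPU in plain NumPy. This is a minimal sketch for checking the indexing only; the tile size and matrix shape below are illustrative, not taken from the example above:

```
import numpy as np

BLOCK_SIZE = 16
h, w = 64, 48   # rows (a_height) and columns (a_width), multiples of BLOCK_SIZE

A = np.random.rand(h, w).astype(np.float32)
A_t = np.empty((w, h), dtype=np.float32)

# Replay the first kernel, block by block and thread by thread.
for by in range(h // BLOCK_SIZE):            # blockIdx.y
    for bx in range(w // BLOCK_SIZE):        # blockIdx.x
        base_a = bx * BLOCK_SIZE + by * BLOCK_SIZE * w
        base_a_t = by * BLOCK_SIZE + bx * BLOCK_SIZE * h
        tile = np.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=np.float32)
        # "store to shared memory": read a tile of A in row-major order
        for ty in range(BLOCK_SIZE):
            for tx in range(BLOCK_SIZE):
                tile[ty, tx] = A.flat[base_a + tx + w * ty]
        # "write to global memory": emit the tile with indices swapped
        for ty in range(BLOCK_SIZE):
            for tx in range(BLOCK_SIZE):
                A_t.flat[base_a_t + tx + h * ty] = tile[tx, ty]

assert (A_t == A.T).all()
```

The shared-memory padding has no counterpart here: it only affects how the GPU banks the tile, not which elements move where.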
