-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmat_mul_kernel.cl
More file actions
49 lines (39 loc) · 1.53 KB
/
mat_mul_kernel.cl
File metadata and controls
49 lines (39 loc) · 1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// compile option: -D TS=<work_group_size>
// Tiled and coalesced version
__kernel void main(const __global float* A,
const __global float* B,
__global float* C,
const int M,
const int K,
const int N)
{
// Thread identifiers
const int row = get_local_id(0); // Local row ID (max: TS)
const int col = get_local_id(1); // Local col ID (max: TS)
const int globalRow = TS * get_group_id(0) + row; // Row ID of C (0..M)
const int globalCol = TS * get_group_id(1) + col; // Col ID of C (0..N)
// Local memory to fit a tile of TS*TS elements of A and B
__local float Asub[TS][TS];
__local float Bsub[TS][TS];
// Initialise the accumulation register
float acc = 0.0f;
// Loop over all tiles
const int numTiles = K / TS;
for (int t = 0; t < numTiles; t++) {
// Load one tile of A and B into local memory
const int tiledRow = TS * t + row;
const int tiledCol = TS * t + col;
Asub[col][row] = A[tiledCol * M + globalRow];
Bsub[col][row] = B[globalCol * K + tiledRow];
// Synchronise to make sure the tile is loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Perform the computation for a single tile
for (int k = 0; k < TS; k++) {
acc += Asub[k][row] * Bsub[col][k];
}
// Synchronise before loading the next tile
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store the final result in C
C[globalCol * M + globalRow] = acc;
}