Based on an excellent blog post by Cedric Nugteren, I am trying to implement a similar kernel to accelerate matrix multiplication using SYCL instead of OpenCL. His tiled implementation is shown below.
// Tiled matrix multiply: each work-group computes one TS x TS tile of C,
// streaming TS x TS tiles of A and B through fast local memory.
// Thread identifiers
const int row = get_local_id(0); // Local row ID (max: TS)
const int col = get_local_id(1); // Local col ID (max: TS)
const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
// Local memory to fit a tile of TS*TS elements of A and B.
// __local = on-chip memory shared by all work-items of one work-group.
__local float Asub[TS][TS];
__local float Bsub[TS][TS];
// Initialise the accumulation register
float acc = 0.0f;
// Loop over all tiles
// NOTE(review): integer division assumes K is a multiple of TS — confirm;
// otherwise the remainder of the K dimension is silently skipped.
const int numTiles = K/TS;
for (int t=0; t<numTiles; t++) {
// Load one tile of A and B into local memory.
// The flat indexing (A[tiledCol*M + globalRow], B[globalCol*K + tiledRow])
// implies column-major storage; each work-item loads exactly one element
// of each tile.
const int tiledRow = TS*t + row;
const int tiledCol = TS*t + col;
Asub[col][row] = A[tiledCol*M + globalRow];
Bsub[col][row] = B[globalCol*K + tiledRow];
// Synchronise to make sure the tile is loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Perform the computation for a single tile
for (int k=0; k<TS; k++) {
acc += Asub[k][row] * Bsub[col][k];
}
// Synchronise before loading the next tile
// (so no work-item overwrites a tile a neighbour is still reading)
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store the final result in C
C[globalCol*M + globalRow] = acc;
How can I create __local
buffers in SYCL? Is there a specific keyword or construct for it? Also, what are the SYCL equivalents of the barrier(...)
calls, to guarantee that the local-memory tiles are fully loaded before the computation begins?
Currently, my naive matrix multiplication (similar to Cedric’s naive implementation) looks like this:
size_t M = a_host.rows();
size_t N = a_host.cols();
size_t P = b_host.cols();
buffer<double, 2> a(a_host.data(), range<2>{M, N});
buffer<double, 2> b(b_host.data(), range<2>{N, P});
buffer<double, 2> c(c_gpu.data(), range<2>{M, P});
PROFILE_SCOPE("Starting Multiply on GPU");
std::cout << "GPU::Multiplying A and B into C.\n";
auto e = q.submit([&](handler& h) {
auto A = a.get_access<access::mode::read>(h);
auto B = b.get_access<access::mode::read>(h);
auto C = c.get_access<access::mode::write>(h);
// buffer.get_range() returns the dimensions (shape) of the buffer/array
int num_cols_a = a.get_range()[1];
h.parallel_for(range<2>{M, P}, [=](id<2> index) {
// Thread identifiers
int row = index[0];
int col = index[1];
auto sum = 0.0;
// Compute result of ONE element of C
for (int i = 0; i < num_cols_a; i++)
sum += A[row][i] * B[i][col];
C[index] = sum;
});
});
e.wait();
I am new to this. Can you explain what changes I need to make to turn this into a tiled version of matmul?