Hello dear programmers, I'm a beginner in OpenCL and I'm working on matrix multiplication. I'm trying to optimise it, but I'm not really getting any interesting results. Here is my kernel; please help me, I'm stuck. `
/* Edge length of the square work-group and of each local-memory tile. */
#define BLOCK_SIZE 16

/*
 * Value handed to the num_simd_work_items kernel attribute.
 * A definition supplied before this point takes precedence over the default.
 */
#if !defined(SIMD_WORK_ITEMS)
#  define SIMD_WORK_ITEMS 4
#endif
/*
 * Tiled complex matrix product C = A * B.
 *
 * Each matrix is passed as two float planes (real part / imaginary part) and
 * is indexed row-major: A with a row stride of widthA, B and C with a row
 * stride of widthB. Every work-item produces one element of C;
 * get_global_id(0) selects its column and get_global_id(1) its row.
 *
 *   Ar, Ai   real / imaginary planes of A (read)
 *   Br, Bi   real / imaginary planes of B (read)
 *   Cr, Ci   real / imaginary planes of C (written)
 *   widthA   number of columns of A (= number of rows of B)
 *   widthB   number of columns of B (= number of columns of C)
 *
 * No bounds checks are performed: whole BLOCK_SIZE x BLOCK_SIZE tiles are
 * loaded, so the matrix dimensions are expected to be multiples of BLOCK_SIZE.
 *
 * NOTE(review): if the host guarantees that the six buffers never alias,
 * qualifying the pointers with `restrict` (and the inputs with `const`) may
 * let the compiler schedule the global loads/stores more aggressively --
 * confirm against the target compiler's documentation before changing the
 * signature.
 */
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMultiplication(__global float* Ar,__global float* Ai, __global float* Br,__global float* Bi, __global float* Cr, __global float* Ci, int widthA, int widthB )
{
    /* Work-group-local copies of the current tiles of A and B. */
    __local float Ar_local[BLOCK_SIZE][BLOCK_SIZE];
    __local float Br_local[BLOCK_SIZE][BLOCK_SIZE];
    __local float Ai_local[BLOCK_SIZE][BLOCK_SIZE];
    __local float Bi_local[BLOCK_SIZE][BLOCK_SIZE];

    /* Tile coordinates of this work-group and position inside the tile. */
    const int block_x = get_group_id(0);
    const int block_y = get_group_id(1);
    const int local_x = get_local_id(0);
    const int local_y = get_local_id(1);

    /* A is walked along one row of tiles, B down one column of tiles. */
    const int a_start = widthA * BLOCK_SIZE * block_y;
    const int a_end = a_start + widthA - 1;
    const int b_start = BLOCK_SIZE * block_x;

    float sum_re = 0.0f;
    float sum_im = 0.0f;

    for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += BLOCK_SIZE * widthB) {
        /* Real and imaginary planes share the same layout, so one offset per matrix. */
        const int a_idx = a + widthA * local_y + local_x;
        const int b_idx = b + widthB * local_y + local_x;

        Ar_local[local_y][local_x] = Ar[a_idx];
        Ai_local[local_y][local_x] = Ai[a_idx];

        /*
         * The B tiles are stored transposed so that the inner loop indexes
         * both operands by k in the last dimension.
         * Bi uses the same widthB row stride as Br (it was widthA before,
         * which fetched the wrong elements whenever widthA != widthB).
         */
        Br_local[local_x][local_y] = Br[b_idx];
        Bi_local[local_x][local_y] = Bi[b_idx];

        /* All tile elements must be written before any work-item reads them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            const float a_re = Ar_local[local_y][k];
            const float a_im = Ai_local[local_y][k];
            const float b_re = Br_local[local_x][k];
            const float b_im = Bi_local[local_x][k];

            /* (a_re + i*a_im) * (b_re + i*b_im) */
            sum_re += a_re * b_re - a_im * b_im;
            sum_im += a_re * b_im + a_im * b_re;
        }

        /* Nobody may overwrite the tiles until every work-item is done reading. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /*
     * C has widthB columns, so its row stride is widthB (it was widthA before,
     * which scattered the results for non-square inputs).
     */
    const size_t c_idx = get_global_id(1) * (size_t)widthB + get_global_id(0);
    Cr[c_idx] = sum_re;
    Ci[c_idx] = sum_im;
}