Matrix multiplication optimisation in OpenCL

Hello dear programmers, I'm a beginner in OpenCL. I'm working on matrix multiplication and trying to optimise it, but I'm not getting any interesting results. Here is my kernel; please help me, I'm stuck.

// Edge length of the square tile: used both as the required work-group size
// in dimensions 0 and 1 and as the dimensions of the __local tile arrays.
#define BLOCK_SIZE 16
// SIMD_WORK_ITEMS is passed to the num_simd_work_items kernel attribute.
// It can be predefined before this point (e.g. with a -D build option);
// otherwise the fallback below is used.
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif

/*
 * Tiled complex matrix multiplication, C = A * B, with the real and
 * imaginary parts of every matrix stored in separate row-major float buffers.
 *
 * Each work-group of BLOCK_SIZE x BLOCK_SIZE work-items produces one
 * BLOCK_SIZE x BLOCK_SIZE tile of C; each work-item produces one element.
 *
 * Parameters:
 *   Ar, Ai  - real / imaginary parts of A (row stride widthA), read-only
 *   Br, Bi  - real / imaginary parts of B (row stride widthB), read-only
 *   Cr, Ci  - real / imaginary parts of the result (row stride widthB)
 *   widthA  - number of columns of A (and therefore rows of B)
 *   widthB  - number of columns of B (and therefore columns of C)
 *
 * NOTE(review): tiles are loaded without bounds checks and the tile loop
 * advances in steps of BLOCK_SIZE, so this assumes widthA is a multiple of
 * BLOCK_SIZE and that the global size is (widthB, rows of A) - confirm
 * against the host code.
 */
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMultiplication(__global const float* Ar, __global const float* Ai,
                          __global const float* Br, __global const float* Bi,
                          __global float* Cr, __global float* Ci,
                          int widthA, int widthB)
{
    // Work-group-shared copies of the current tile of A and B.
    __local float Ar_local[BLOCK_SIZE][BLOCK_SIZE];
    __local float Ai_local[BLOCK_SIZE][BLOCK_SIZE];
    __local float Br_local[BLOCK_SIZE][BLOCK_SIZE];
    __local float Bi_local[BLOCK_SIZE][BLOCK_SIZE];

    // Tile coordinates of this work-group and position inside the tile.
    int block_x = get_group_id(0);
    int block_y = get_group_id(1);
    int local_x = get_local_id(0);
    int local_y = get_local_id(1);

    // A is walked left-to-right along the tile row selected by block_y;
    // B is walked top-to-bottom along the tile column selected by block_x.
    int a_start = widthA * BLOCK_SIZE * block_y;
    int a_end   = a_start + widthA - 1;
    int b_start = BLOCK_SIZE * block_x;

    float sum_re = 0.0f;
    float sum_im = 0.0f;

    for (int a = a_start, b = b_start; a <= a_end;
         a += BLOCK_SIZE, b += BLOCK_SIZE * widthB)
    {
        // The real and imaginary buffers share a layout, so one index per
        // matrix serves both. B uses row stride widthB for BOTH parts
        // (the imaginary load previously used widthA by mistake).
        int a_idx = a + widthA * local_y + local_x;
        int b_idx = b + widthB * local_y + local_x;

        Ar_local[local_y][local_x] = Ar[a_idx];
        Ai_local[local_y][local_x] = Ai[a_idx];
        // The B tile is stored transposed so the inner loop below reads
        // both tiles along their last index.
        Br_local[local_x][local_y] = Br[b_idx];
        Bi_local[local_x][local_y] = Bi[b_idx];

        // Wait until every work-item has stored its part of the tiles.
        barrier(CLK_LOCAL_MEM_FENCE);

        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            float ar = Ar_local[local_y][k];
            float ai = Ai_local[local_y][k];
            float br = Br_local[local_x][k];
            float bi = Bi_local[local_x][k];

            // (ar + i*ai) * (br + i*bi)
            sum_re += ar * br - ai * bi;
            sum_im += ar * bi + ai * br;
        }

        // Wait until the tiles are fully consumed before overwriting them.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // C has widthB columns, so its row stride is widthB (not widthA).
    size_t c_idx = get_global_id(1) * widthB + get_global_id(0);
    Cr[c_idx] = sum_re;
    Ci[c_idx] = sum_im;
}

Hello, I replied with some matrix multiplication links on a related thread that you may find useful, here: