Hi everyone,

I’ve created a simple OpenCL program that performs a matrix multiplication. The same kernel logic works with CUDA, but with OpenCL it only works for datasets whose result matrix is square (see the results below). Could someone please help me figure out why?

Thanks

HOST CODE

…

/* Work-group (local) size: 16 x 16 work-items. */
size_t bd[] = {16, 16};

/*
 * Global size: each dimension of C rounded UP to the next multiple of the
 * work-group size (ceil-division), so no extra row/column of idle
 * work-groups is launched when the dimension is already a multiple of 16.
 * Dimension 0 covers the columns of C and dimension 1 the rows, matching
 * get_global_id(0) / get_global_id(1) in the kernel; the kernel's bounds
 * check discards the padding work-items.
 */
size_t gd[] = {bd[0] * ((numCColumns + bd[0] - 1) / bd[0]),
               bd[1] * ((numCRows + bd[1] - 1) / bd[1])};

/* NOTE(review): make sure err is compared against CL_SUCCESS after this
 * call if the elided code does not already do so. */
err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gd, bd, 0, NULL, NULL);

…

DEVICE CODE (KERNEL)

/*
 * Naive matrix multiplication C = A * B, one work-item per element of C.
 *
 * Launch layout: 2-D NDRange; get_global_id(0) selects the column of C and
 * get_global_id(1) selects the row. Work-items that fall outside
 * numCColumns x numCRows do nothing, so the global size may be padded up
 * to a multiple of the work-group size.
 *
 * Indexing: A is read with row stride numAColumns, B with row stride
 * numBColumns, and C is written with row stride numCColumns (row-major).
 * The inner-product length is numAColumns. numARows and numBRows are not
 * read by the kernel.
 *
 * NOTE(review): the indexing here does not depend on C being square. If
 * results are wrong only when numCRows != numCColumns, verify the host
 * side: the order/values passed to clSetKernelArg and the buffer sizes.
 */
__kernel void matrixMultiply(__global float * A, __global float * B, __global float * C,
                             int numARows, int numAColumns, int numBRows, int numBColumns,
                             int numCRows, int numCColumns) {
    int i = get_global_id(0);   /* column index into C */
    int j = get_global_id(1);   /* row index into C */
    int len = numAColumns;      /* length of the dot product */

    if (i < numCColumns && j < numCRows) {
        float sum = 0.0f;

        for (int k = 0; k < len; k++) {
            sum += A[j * numAColumns + k] * B[k * numBColumns + i];
        }

        /* Single store after the accumulation loop. */
        C[j * numCColumns + i] = sum;
    }
}

dataset 0

MAT A MAT B

64x64 64x64 OK

dataset 1

128x64 64x128 OK

dataset 2

100x128 128x56 NOT WORKING

dataset 3

128x64 64x128 OK

dataset 4

128x32 32x128 OK

dataset 5

200x100 100x256 NOT WORKING

dataset 6

256x256 256x256 OK

dataset 7

300x256 256x300 OK

dataset 8

128x64 64x128 OK

dataset 9

256x256 256x257 NOT WORKING