Hello,

I have a problem with the performance of writing the results of my kernel function to the output array. The execution time of the whole kernel increases from about 3 ms to about 80 ms because of a single write operation.

Surprisingly, the performance does not decrease if I write a constant value into the output array.

The code of the kernel is the following:

```
/*
 * linearMatching - brute-force matching of list against list2.
 *
 * Work decomposition (1-D NDRange): every work-group handles ONE item of
 * `list` (L1_index) and ONE chunk of up to LOCAL_WORKSIZE consecutive items
 * of `list2`.  Each work-item computes the squared Euclidean distance between
 * the list item and one list2 item; the work-group then reduces those
 * distances to the list2 index with the smallest distance.
 *
 * Parameters:
 *   list        - count  vectors of nDim floats, stored contiguously.
 *   list2       - count2 vectors of nDim floats, stored contiguously.
 *   resultList  - output; work-group g writes two floats at [2*g] and [2*g+1].
 *   tempList    - not used by this kernel.
 *   count       - number of vectors in list.
 *   count2      - number of vectors in list2.
 *   nDim        - number of floats per vector.
 *   nBufferSize - not used by this kernel.
 *
 * ARRAY_SIZE is expected to be defined at build time (it is not visible in
 * this snippet); it bounds the number of work-items per group that take part.
 */
__kernel void linearMatching(
__global float* list,
__global float* list2,
__global float* resultList,
__global float2* tempList,
const unsigned int count,
const unsigned int count2,
const unsigned int nDim,
const unsigned int nBufferSize
)
{
    /* Number of work-items that perform the first reduction stage. */
    enum { MAX_SEARCH_THREADS = 8 };

    const uint group_id = get_group_id(0);
    const uint local_id = get_local_id(0);
    /* Explicit casts: min() needs both arguments to have the same type. */
    const uint LOCAL_WORKSIZE = min((uint)get_local_size(0), (uint)ARRAY_SIZE);

    /* Nothing to match against; also avoids dividing by nWGs_L2 == 0 below.
       This exit is taken by every work-item of the group. */
    if (count2 == 0)
        return;

    //work groups needed for list2 (ceiling division)
    const uint nWGs_L2 = count2 / LOCAL_WORKSIZE + ((count2 % LOCAL_WORKSIZE) ? 1 : 0);
    //current list1 index
    const uint L1_index = group_id / nWGs_L2;
    //index of the list2 chunk handled by this work-group
    const uint chunk = group_id % nWGs_L2;

    /* Depends on group_id only, so the whole work-group leaves together and
       no work-item is left waiting at a barrier. */
    if (L1_index >= count)
        return;

    const uint chunk_base = chunk * LOCAL_WORKSIZE;
    //current list2 index
    const uint L2_index = chunk_base + local_id;
    //l2 items left to check in this chunk (always >= 1 here)
    const uint leftL2 = min(LOCAL_WORKSIZE, count2 - chunk_base);

    __local float2 sums[ARRAY_SIZE];               /* .x = distance, .y = list2 index */
    __local float2 partial[MAX_SEARCH_THREADS];    /* stage-1 minima */

    /* Work-items past the end of the chunk (or past ARRAY_SIZE) compute
       nothing, but they must NOT return: they still have to reach the
       barriers below. */
    if (local_id < leftL2) {
        //builds sums for LOCAL_WORKSIZE items of l2 with one item of l1
        float acc = 0.0f;   /* accumulate privately, store to local memory once */
        for (uint k = 0; k < nDim; k++) {
            const float diff = list[L1_index*nDim + k] - list2[L2_index*nDim + k];
            acc += diff * diff;
        }
        sums[local_id] = (float2)(acc, (float)L2_index);
    }

    /* Make every sums[] entry visible before any work-item reads it. */
    barrier(CLK_LOCAL_MEM_FENCE);

    //get minimum, stage 1: a few work-items each scan a contiguous slice
    const uint nSearchThreads = min((uint)MAX_SEARCH_THREADS, LOCAL_WORKSIZE);
    if (local_id < nSearchThreads) {
        const uint per_thread = (leftL2 + nSearchThreads - 1) / nSearchThreads;
        const uint begin = min(local_id * per_thread, leftL2);
        const uint end = min(begin + per_thread, leftL2);

        /* An empty slice yields +inf, which the strict '<' in stage 2 never picks. */
        float2 best = (float2)(INFINITY, -1.0f);
        if (begin < end)
            best = sums[begin];
        for (uint k = begin + 1; k < end; k++) {
            if (sums[k].x < best.x)
                best = sums[k];
        }
        /* Separate array: writing back into sums[] would race with readers. */
        partial[local_id] = best;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    //stage 2: one work-item combines the partial minima and writes the result
    if (local_id == 0) {
        /* partial[0] always covers sums[0], so it is a valid starting point. */
        float cur_min_error = partial[0].x;
        int best_index = (int)partial[0].y;
        for (uint k = 1; k < nSearchThreads; k++) {
            if (partial[k].x < cur_min_error) {
                cur_min_error = partial[k].x;
                best_index = (int)partial[k].y;
            }
        }

        /* One output pair per (list item, list2 chunk), i.e. per work-group.
           Previously the second term was L2_index/nWGs_L2, which maps
           different work-groups to the same slot unless
           LOCAL_WORKSIZE == nWGs_L2. */
        const uint index = (L1_index*nWGs_L2 + chunk) * 2;

        /* NOTE(review): when only the constant is stored, nothing computed
           above affects the output, so the compiler is presumably free to
           remove the distance loop and the reduction entirely; that would
           explain the timing difference rather than the store itself being
           slow. Confirm by inspecting the generated code. */
        resultList[index] = 11.3f; //<---- with this line, no performance decrease
        resultList[index+1] = best_index; //<- with this line it is like 30x slower
    }
}
```

The last lines with “resultList” are the lines I’m talking about. With the first one, it is fast. With the second one, it is slow.

The work-group size is 256, and there are 256*2048 work-items in total.

Does anybody have an idea why it is so slow and how to fix it?

Another question on the side: is it in any way faster if I split the work-items across two dimensions instead of one?

kind regards,

veio