I am working to learn openCL. I have two buffers, a, and b, each is 256 elements in size. a buffer is all 1s so has a sum of 256. I want a kernel that for each b element, performs a reduce on a and stores in b element. I can get reduce to work on one dimension, but when I try to use two dimensions, one for the a reduce, and one for b, I don’t know what’s going on! Here is the kernel:
kernel void test2(global float* input, global float* output) {
const int tid = get_local_id(0);
const int gid = get_global_id(0);
const int local_size = get_local_size(0);
__local float tmp[256];
// Load input data into local memory
__local float local_data[256];
local_data[tid] = (gid < 256) ? input[gid] : 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Perform parallel reduction on local data
float sum = work_group_reduce_add(local_data[tid]);
barrier(CLK_LOCAL_MEM_FENCE);
// Write result to output buffer
if (tid == 0) {
tmp[get_group_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
float o=0;
for( int i=0; i < get_num_groups(0); ++i){
o+= tmp[i];
}
output[get_global_id(1)] =o;
}
thanks for the help!