Final Sum Strategy

Hello, for my class project I need to implement a sum reduction over an array of 1k or more floats. I would like to compute the final sum by calling the kernel "X" times, but I can't work out how to do it right. If I have 512 work groups, how many times do I have to call it, and why? Which functions should I use to call it again? How can I take the returned values, add them, and send them back in?

Part of the main code:

error = clGetPlatformIDs(2, cp_Platform, NULL);                                  // array with two platform IDs
error = clGetDeviceIDs(cp_Platform[1], CL_DEVICE_TYPE_GPU, 1, &Device_ID, NULL); // cp_Platform[1] = NVIDIA GPU
context = clCreateContext(NULL, 1, &Device_ID, NULL, NULL, &error);              // create the OpenCL context
queue = clCreateCommandQueue(context, Device_ID, 0, &error);                     // create a command queue on that device

program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&source_size, &error); // create a program object from the kernel source
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);                      // compile and link the program executable
kernel = clCreateKernel(program, "GPUfunction", &error);                         // create the kernel object

d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float), NULL, NULL);

error = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, n * sizeof(float), h_a, 0, NULL, NULL); // write the input array from host memory to d_a
error |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, sizeof(float), h_s, 0, NULL, NULL);    // write the initial sum from host memory to d_b

error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); // set the kernel arguments
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
error |= clSetKernelArg(kernel, 2, sizeof(int), &n);

error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // enqueue the kernel for execution on the device
clFinish(queue);

clEnqueueReadBuffer(queue, d_b, CL_TRUE, 0, sizeof(float), h_s, 0, NULL, NULL); // read the result from the device (d_b) back to the host (h_s)

Kernel code:

__kernel void GPUfunction(__global float *vec, __global float *outPutSum, int n)
{
    __local float tempSum[512];

    int i;
    int globalID = get_global_id(0);
    int tid = get_local_id(0);
    int BlockDIM = get_local_size(0);

    // load one element per work item into local memory, padding with zeros past the end of the array
    if (globalID < n)
    {
        tempSum[tid] = vec[globalID];
    }
    else
    {
        tempSum[tid] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction in local memory; assumes the work-group size is a power of two
    for (i = BlockDIM / 2; i > 0; i /= 2)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (tid < i)
        {
            tempSum[tid] += tempSum[tid + i];
        }
    }

    // the first work item of each group writes out the group's partial sum
    if (tid == 0)
    {
        outPutSum[get_group_id(0)] = tempSum[0];
    }
}

If I have 512 work groups, how many times do I have to call it?

Each call reduces the problem size by a factor of the work-group size (512 here), because every group of 512 input values collapses into a single partial sum. At some point the remaining number of values will be <= 512, which means that call is the last one.

  • Hint: you need to think about the case where the remaining size is not divisible by 512 (see the sketch below).
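
For example, here is a small host-side sketch of how to count the passes. It is only an illustration, assuming the work-group size (localSize) is 512; remaining and passes are hypothetical variable names:

size_t remaining = n;   // number of values still to be summed (illustrative name)
int passes = 0;

while (remaining > 1)
{
    remaining = (remaining + localSize - 1) / localSize;  // ceil(remaining / 512): one partial sum per work group
    passes++;
}
// e.g. n = 1,000,000: 1,000,000 -> 1954 -> 4 -> 1, so 3 passes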

Which functions should I use to call it again?

You simply swap the buffers you used on the previous iteration (argument 0 must now point at the buffer holding the partial sums, and argument 1 at the buffer that will receive the next round of partial sums) and do this part again:

error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
error |= clSetKernelArg(kernel, 2, sizeof(int), &n);
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // enqueue the kernel on the device again

Also, the size of the buffer d_b must be proportional to the number of work groups, since the kernel writes one partial sum per work group; a single float is not enough.
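
Putting it together, a minimal host-side sketch of the ping-pong loop could look like the following. This is only an illustration, not the exact code from the question: it assumes both buffers are created with CL_MEM_READ_WRITE and have room for one float per work group, that error checking is omitted, and d_partial, d_in, d_out, remaining, numGroups and count are hypothetical names.

size_t localSize = 512;
size_t remaining = n;                  // values still to be summed
cl_mem d_in  = d_a;                    // starts as the buffer holding the input array (must be read/write here)
cl_mem d_out = d_partial;              // hypothetical buffer with room for one float per work group

while (remaining > 1)
{
    size_t numGroups  = (remaining + localSize - 1) / localSize;  // ceil(remaining / 512)
    size_t globalSize = numGroups * localSize;                    // pad so every work group is full
    int    count      = (int)remaining;

    error  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_in);    // input: values to reduce
    error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_out);   // output: one partial sum per work group
    error |= clSetKernelArg(kernel, 2, sizeof(int), &count);
    error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);

    remaining = numGroups;             // the next pass reduces the partial sums

    cl_mem tmp = d_in;                 // swap the roles of the two buffers
    d_in  = d_out;
    d_out = tmp;
}

clFinish(queue);
clEnqueueReadBuffer(queue, d_in, CL_TRUE, 0, sizeof(float), h_s, 0, NULL, NULL); // after the final swap, d_in holds the total sum

Padding globalSize up to a multiple of localSize is safe because the kernel stores 0 into local memory for work items with globalID >= n.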