GPU cores have no activity

Hey guys,

I’m new to OpenCL (OCL) programming. I think I have understood most of the basics. I wanted to parallelize my code on the GPU to save time and increase performance, but the program is a bit slower than running it multi-threaded on the CPU. When I watch the Windows 10 performance monitor, I can see that one CPU core is busy while the GPU is idle. I’ve chosen the platform “NVIDIA CUDA” and my GPU “GTX 1660” to create the “context” and “queue”. The only thing I can observe is that the memory of my graphics card gets allocated and filled.
Can someone help me please?

Thank you very much for any help!

Here is a schematic outline of my OpenCL initialization (just to show what I did):

// Schematic host-side OpenCL sequence: create context and queue, build the
// program, upload a buffer, launch a 1-D kernel, read the buffer back, and
// release the handles.
// NOTE(review): every create call below passes NULL for errcode_ret, so a
// failure in those calls is not reported through an error code here.

// Create a context for GPU-type devices.
Context = clCreateContextFromType(contextProperties, **CL_DEVICE_TYPE_GPU**, NULL, NULL, NULL);
// Create a command queue for DeviceID with default properties (0).
Queue = clCreateCommandQueue(Context, DeviceID, 0, NULL);
KernelString = ...getting kernel from file...
// NOTE(review): the source pointer passed here is `Kernel`, while the line
// above assigns `KernelString`; the names look swapped in this schematic --
// confirm against the real code.
Program = clCreateProgramWithSource(Context, 1, (const char**)(&Kernel), &KernelLength, NULL);
// Build the program for the single device DeviceID with no build options.
ierr = clBuildProgram(Program, 1, &DeviceID, NULL, NULL, NULL);
// NOTE(review): the second argument of clCreateKernel is the kernel function
// name; `KernelString` is passed here -- verify it holds the name (e.g.
// "MyKernel") and not the whole source text.
Kernel = clCreateKernel(Program, KernelString, NULL);
// Allocate a read/write device buffer of `size` bytes (no host pointer).
DeviceBuf = clCreateBuffer(Context, CL_MEM_READ_WRITE, size, NULL, NULL);
// Copy `size` bytes from HostBuf into DeviceBuf.
ierr = clEnqueueWriteBuffer(Queue, DeviceBuf, blocking, 0, size, HostBuf, 0, NULL, NULL); 
// Bind DeviceBuf as kernel argument 0.
// NOTE(review): the kernel handle is called `MyKernel` here, `Kernel` above and
// `GenHoughThreadKernel` at the launch -- presumably the same object. Only
// argument 0 is shown being set, while the kernel listed further down in this
// post declares 11 parameters.
ierr = clSetKernelArg(MyKernel, 0, sizeof(cl_mem), (void*)(&DeviceBuf));
// Query the number of compute units of the device.
ierr = (int)clGetDeviceInfo(DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &numCompUnits, NULL);
// NOTE(review): the work-group (local) size is set to the compute-unit count.
// These are different quantities: the work-group size limit is reported by
// CL_DEVICE_MAX_WORK_GROUP_SIZE / CL_KERNEL_WORK_GROUP_SIZE. A very small
// work-group size likely leaves most of each compute unit idle -- confirm.
local_item_size = (size_t)numCompUnits;
// Round psi_max up to the next multiple of local_item_size.
global_item_size = ceil((double)psi_max / (double)local_item_size) * local_item_size;
// Launch the kernel over a 1-D range of global_item_size work-items.
ierr = clEnqueueNDRangeKernel(Queue, GenHoughThreadKernel, 1/*workDim*/, NULL, &global_item_size, &local_item_size/*NULL*/, 0, NULL, NULL);
// Wait until all commands queued so far have completed.
ierr = clFinish(Queue);
// Copy `size` bytes from DeviceBuf back into HostBuf.
ierr = clEnqueueReadBuffer(Queue, DeviceBuf, blocking, 0, size, HostBuf, 0, NULL, NULL);
// Release the OpenCL handles.
// NOTE(review): DeviceBuf is not released in this excerpt (clReleaseMemObject).
ierr = clReleaseKernel(Kernel);
ierr = clReleaseProgram(Program);
ierr = clReleaseCommandQueue(Queue);
ierr = (int)clReleaseContext(Context);

And here is my Kernel:

__kernel void MyKernel(__global int *H, __global unsigned char *Pic,
                             __global float *theta, int nx, int ny,
			                       __global int *amountPhi, int rowsPhi,
                             __global double *R, __global double *BETA,
                             int resolution, int psi_max)
	int idx, x_H, y_H;
	double beta, r;
	int nn = nx*ny;
	int psi = get_global_id(0);

  if (psi >= psi_max) // psi_max cannot get bigger than 360°

	for (int j = 0; j < ny; j++)
		for (int i = 0; i < nx; i++)
			if ((float)Pic[i+j*nx] > 0.0)
				idx = (int)(((double)(resolution-1) * (double)theta[i+j*nx])/M_PI);

			 	for (int id_phi = 0; id_phi < amountPhi[idx]; id_phi++)
					r = R[id_phi+(idx*rowsPhi)];
					beta = fabs(BETA[id_phi+(idx*rowsPhi)]);