Very poor OpenCL performance

Hello,

I’m new to OpenCL, so I guess I’m making some silly mistake…

I tried to create a simple program for image thresholding using OpenCL. A simplified version of the source code is attached to this post. The program works correctly, but the execution time is very poor.

When I execute the program on a 5760x3240 RGB image, the output is [times are in milliseconds]:

clCreateBuffer: 25.649242
clSetKernelArg: 0.001205
clEnqueueNDRangeKernel: 0.536059
clFinish: 66.903236
clEnqueueReadBuffer: 17.060545

When I use IPP’s threshold (Intel Performance Primitives) on the same picture, the threshold takes 40 ms.

Why is my OpenCL program running on the GPU so slow? I expected it to be much faster.

My hardware:

Win7 Home, 64-bit
CPU: AMD Phenom II X4 965, 3.4 GHz (4 cores)
6GB RAM
GPU: GIGABYTE, ATI Radeon 5750
AMD APP SDK v2.4

Thanks for any hint…

My program:
//---------------------------------------------------------------------------

#include <stdio.h>
#include <windows.h>
#include <CL/opencl.h>

typedef struct _PICBUF
{
    unsigned  Width;
    unsigned  Height;
    unsigned  BytesPerPixel;
    unsigned  Components;
    unsigned  BitDepth;
    unsigned  Stride;
    unsigned char  *Data;
}PICBUF, *PPICBUF;

#define SIZEOF_PICBUF(picbuf) \
((picbuf).Stride * (picbuf).Height)

#define COUNTER_TO_MS(Count)    ((Count)/1000000.0)

const char *KernelSource =
    "__kernel void threshold(__global uchar *input, __global uchar *output, const uchar thres)\n"
    "{\n"
    "   size_t id = get_global_id(0);\n"
    "   output[id] = input[id] < thres ? input[id] : thres;\n"
    "}\n"
    "\n";

unsigned GetCounter(double *pdTime);

//------------------------------
int main()
{  
   cl_context context;
   cl_context_properties properties[3];
   cl_kernel kernel;
   cl_command_queue command_queue;
   cl_program program;
   cl_int err;
   cl_uint num_of_platforms = 0;
   cl_platform_id platform_id[2];
   cl_device_id device_id;
   cl_uint num_of_devices = 0;
   cl_mem input, output;
   size_t global;   
   cl_uchar thres;        
   PICBUF picIn = {0,};
   PICBUF picOut = {0,};
   double now, prev;        
   int i = 0;

   // The ImageApi* functions come from my own image library (not part of OpenCL).
   ImageApiReadImage(L"Image.jpg", &picIn);
   ImageApiAllocPicBufEx(&picOut, &picIn);

   if(clGetPlatformIDs(2, platform_id, &num_of_platforms) != CL_SUCCESS)
   {
      printf("Could not read the platform id\n");
      return 1;
   }
   
   // I have two platforms; index 1 is AMD Accelerated Parallel Processing.
   if(clGetDeviceIDs(platform_id[1], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
   {
      printf("Could not read the device id\n");
      return 1;
   }

   properties[0] = CL_CONTEXT_PLATFORM;
   properties[1] = (cl_context_properties) platform_id[1];
   properties[2] = 0;

   context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err);
   command_queue = clCreateCommandQueue(context, device_id, 0, &err);

   program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);

   if((err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)) != CL_SUCCESS)
   {
      printf("Could not compile the program.\n");
      return 1;
   }

   kernel = clCreateKernel(program, "threshold", &err);

   GetCounter(&prev);
   input = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, SIZEOF_PICBUF(picIn), picIn.Data, NULL);
   output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, SIZEOF_PICBUF(picOut), NULL, NULL);
   GetCounter(&now);
   printf("clCreateBuffer: %f\n", COUNTER_TO_MS(now - prev));
   
   GetCounter(&prev);
   thres = 128;
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
   err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
   err = clSetKernelArg(kernel, 2, sizeof(cl_uchar), &thres);
   GetCounter(&now);
   printf("clSetKernelArg: %f\n", COUNTER_TO_MS(now - prev));
   
   global = SIZEOF_PICBUF(picIn);
         
   GetCounter(&prev);
   if((err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL)) != CL_SUCCESS)
   {
      printf("clEnqueueNDRangeKernel failed\n");
      return 1;
   }
   GetCounter(&now);
   printf("clEnqueueNDRangeKernel: %f\n", COUNTER_TO_MS(now - prev));
   
   GetCounter(&prev);
   if(clFinish(command_queue)!= CL_SUCCESS)
   {
      printf("clFinish failed\n");
      return 1;
   }
   GetCounter(&now);
   printf("clFinish: %f\n", COUNTER_TO_MS(now - prev));
   
   GetCounter(&prev);
   if(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, SIZEOF_PICBUF(picOut), picOut.Data, 0, NULL, NULL)!=CL_SUCCESS)
   {
      printf("clEnqueueReadBuffer failed\n");
      return 1;
   }   
   GetCounter(&now);
   printf("clEnqueueReadBuffer: %f\n", COUNTER_TO_MS(now - prev));
   
   ImageApiWriteImage(L"ImageOcl.jpg", &picOut, &par);   // par: write parameters; its declaration was dropped from this simplified listing

   ImageApiFinishPicBuf(&picIn);
   ImageApiFinishPicBuf(&picOut);

   clReleaseMemObject(input);
   clReleaseMemObject(output);
   clReleaseProgram(program);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(command_queue);
   clReleaseContext(context);
   return 0;
}

//------------------------------   
unsigned GetCounter(
    double *pdTime   // receives a timestamp in nanoseconds
)
{
    unsigned res = 0;    

    static BOOL perfSupported = TRUE;
    static LARGE_INTEGER freq;
    LARGE_INTEGER count;
    static BOOL first = TRUE;

    if(first)
    {   
        perfSupported = QueryPerformanceFrequency(&freq);
        first = FALSE;
    }
    if(perfSupported)
    {
        BOOL ret = QueryPerformanceCounter(&count);
        if(ret)
        {
            *pdTime = (1000000000. * (count.QuadPart / (double)freq.QuadPart));
        }
        else
        {
            res = -1;
        }
    }
    else
    {
        DWORD ticks = GetTickCount();
        *pdTime = (double)ticks * 1000000.0;   // GetTickCount() returns ms; convert to ns
    }
    return res;
}

The amount of computation is very little compared with the amount of data that is transferred back and forth to the GPU.
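To put rough numbers on it: a 5760x3240 RGB image is about 5760 × 3240 × 3 ≈ 56 MB. Assuming a practical PCIe transfer rate of a few GB/s, just copying that much data to the device and back costs on the order of 10-25 ms each way, which is consistent with the clCreateBuffer and clEnqueueReadBuffer times you measured, while the kernel performs only a single compare per byte.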

There are some things you can try to see how they affect performance, but at the end of the day you are barely doing any computation.

Some ideas from most to least recommended (you can do all of them):

  1. Modify the image in place instead of writing to a new image. Since all you are doing is a simple thresholding algorithm, this should be possible, and it will improve memory access locality.

  2. Use CL_MEM_USE_HOST_PTR instead of CL_MEM_COPY_HOST_PTR when you allocate “input”. You may need picIn.Data to be page-aligned for this flag to have an effect. This can avoid the initial copy from host to device memory; instead, the data stays on the host the whole time (see the host-side sketch after the kernel below).

  3. Instead of clEnqueueReadBuffer(), use clEnqueueMapBuffer(). This can avoid the final copy from device to host memory (also covered in the sketch below).

  4. Use “persistent threads” (in CUDA terminology) to compensate for how little computation there is in your kernel. That means that when you call clEnqueueNDRangeKernel(), you choose a global size that matches exactly the number of work-items your hardware can truly run in parallel.

Your kernel then needs to be modified like this:


// input_len is equal to SIZEOF_PICBUF(picIn)
__kernel void myKernel(__global uchar* input, uchar threshold, uint input_len)
{
    size_t i = get_global_id(0);

    while(i < input_len)
    {
        // Process one pixel here.
        // Notice that consecutive work-items will read and write
        // to consecutive pixels. This coalesced memory access pattern is important.
        input[i] = input[i] < threshold ? input[i] : threshold;

        // Jump to the next block of data
        i += get_global_size(0);
    }
}
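
And for suggestions 2 and 3, here is a minimal host-side sketch, reusing the names from your program (context, command_queue, err, picIn). Error handling is omitted, so treat it as an illustration rather than drop-in code:

// Suggestion 2: let OpenCL use the (page-aligned) host allocation directly.
cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                            SIZEOF_PICBUF(picIn), picIn.Data, &err);

// ... set the kernel arguments, clEnqueueNDRangeKernel(), clFinish() ...

// Suggestion 3: map the buffer instead of reading it back. If the driver
// really used the host pointer, this can be a zero-copy operation.
unsigned char *ptr = (unsigned char *)clEnqueueMapBuffer(command_queue, buf,
    CL_TRUE, CL_MAP_READ, 0, SIZEOF_PICBUF(picIn), 0, NULL, NULL, &err);

// ... read the result through ptr (it should alias picIn.Data) ...

clEnqueueUnmapMemObject(command_queue, buf, ptr, 0, NULL, NULL);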

Thanks for the reply!

  1. If I do the threshold in place, then clFinish takes 87 ms - it’s slower :(… very strange.

  2. If I use CL_MEM_USE_HOST_PTR instead of CL_MEM_COPY_HOST_PTR, then clCreateBuffer takes 0.0075 ms - it’s faster (compared with 25 ms), BUT clFinish takes about 25 ms longer.

  3. clEnqueueMapBuffer takes 41 ms (compared with clEnqueueReadBuffer, which takes 17 ms).

  4. I’m not sure what “the number of work-items that your hardware can truly run in parallel” is. I guess it’s the number of stream processors on my GPU… If so, then it is 720. Using that value doubles clFinish’s execution time.

After applying all your recommendations, I got this output:

clCreateBuffer: 0.0075
clSetKernelArg: 0.001205
clEnqueueNDRangeKernel: 0.158
clFinish: 223.52
clEnqueueMapBuffer: 43.21

Probably thresholding is not the right task for a GPU… as you wrote, the amount of computation is very small.

If I do the threshold in place, then clFinish takes 87 ms - it’s slower… very strange.

Certainly not what I expected. I wonder what’s going on.

If I use CL_MEM_USE_HOST_PTR instead of CL_MEM_COPY_HOST_PTR, then clCreateBuffer takes 0.0075 ms - it’s faster (compared with 25 ms), BUT clFinish takes about 25 ms longer.

Have you made sure that picIn.Data was aligned to a page (4KB) boundary? The fact that clEnqueueMapBuffer() took so long strongly suggests that the OpenCL driver did a copy under the hood instead of actually using the host pointer you gave it.
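
On Windows, a minimal sketch of a page-aligned allocation, assuming you can change how picIn.Data is allocated (_aligned_malloc() is declared in <malloc.h>):

#include <malloc.h>

// Allocate the pixel data on a 4 KB boundary so the driver can use the
// host pointer directly with CL_MEM_USE_HOST_PTR.
picIn.Data = (unsigned char *)_aligned_malloc(SIZEOF_PICBUF(picIn), 4096);

// ... fill the buffer, create the cl_mem with CL_MEM_USE_HOST_PTR ...

_aligned_free(picIn.Data);   // must be released with _aligned_free(), not free()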

I’m not sure what “the number of work-items that your hardware can truly run in parallel” is. I guess it’s the number of stream processors on my GPU… If so, then it is 720. Using that value doubles clFinish’s execution time.

That number is too small. What you need to do to find out the number of work-items that can truly execute concurrently in your hardware is multiply the number of compute units in your device by the maximum work-group size that you can use on the given kernel.

You can query the number of compute units by calling clGetDeviceInfo(…, CL_DEVICE_MAX_COMPUTE_UNITS, …). You can query the maximum work-group size for a given kernel by calling clGetKernelWorkGroupInfo(…, CL_KERNEL_WORK_GROUP_SIZE, …) after compilation.
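
For example, reusing the variables from your program:

cl_uint num_cus;
size_t  max_wg;

// Compute units on the device...
clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS,
                sizeof(num_cus), &num_cus, NULL);
// ...times the largest work-group size this kernel can run with.
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                         sizeof(max_wg), &max_wg, NULL);

global = (size_t)num_cus * max_wg;   // work-items that truly run concurrently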

Have you made sure that picIn.Data was aligned to a page (4KB) boundary? The fact that clEnqueueMapBuffer() took so long strongly suggests that the OpenCL driver did a copy under the hood instead of actually using the host pointer you gave it.

I have now aligned the buffer correctly, but clEnqueueMapBuffer’s time was almost the same.

That number is too small. What you need to do to find out the number of work-items that can truly execute concurrently in your hardware is multiply the number of compute units in your device by the maximum work-group size that you can use on the given kernel.

You can query the number of compute units by calling clGetDeviceInfo(…, CL_DEVICE_MAX_COMPUTE_UNITS, …). You can query the maximum work-group size for a given kernel by calling clGetKernelWorkGroupInfo(…, CL_KERNEL_WORK_GROUP_SIZE, …) after compilation.

It improved clFinish’s execution time from 223 ms to 150 ms.

I have just received a reply on the AMD forum, where I posted the same question. Someone from AMD wrote that thresholding is not a good task for a GPU: there is almost no computation in the kernel, so the GPU’s power has no effect here. But it is a good learning exercise.
As I understood it, running a threshold on a GPU is like computing “1+1” on a calculator.

Thanks very much for your help. You taught me a lot of new things about OpenCL.