OpenCL only reads/writes from/to 1/4 of the buffer memory

I have a problem with OpenCL: it executes the entire command queue, but it reads only 1/4 of the input and writes only 1/4 of the result.
No matter how many iterations I use, it is always 1/4.

It also crashes randomly from time to time. Debugging gives me no information about where it crashes, because there are no debug symbols (0x4c4783f6 in ???, etc.).

Source code:


    #include <cassert>
    #include <cstring>
    #include <iostream>
    #include <vector>
    #include <cl/cl.h>
    
    // OpenCL C source of the "add" kernel, handed to clCreateProgramWithSource.
    // Written as adjacent string literals with explicit "\n" escapes: the
    // #pragma directive must end with a newline, and a raw line break inside a
    // C++ string literal does not compile.
    // The kernel currently stores the work-item id in out[] (the a+b sum is
    // commented out inside the kernel) and prints one line per work-item, so it
    // requires the cl_intel_printf extension.
    const char *progsrc[] = {
        "#pragma OPENCL EXTENSION cl_intel_printf : enable\n"
        "__kernel void add(__global const int *a, __global const int *b, __global int *out)\n"
        "{\n"
        "    int tid = get_global_id(0);\n"
        "    out[tid] = tid/*a[tid]+b[tid]*/;\n"
        "    printf(\"krnl: %d = %d + %d \\n\", out[tid], a[tid], b[tid]);\n"
        "}\n"
    };
    
    // Number of int elements in each host array and the global work size of
    // the kernel launch. This is an element count, not a byte count.
    static const int iterations = 20;
    
    // Checks an OpenCL status code. The argument is evaluated exactly once, so
    // it is safe to pass a call expression such as CLCheck(clFinish(queue)).
    // On any value other than CL_SUCCESS the code and the source line are
    // written to std::cerr and the *enclosing function* returns -1, so the
    // macro may only be used inside functions returning int.
    #define CLCheck(a) \
    do\
    {\
        const cl_int clcheck_status_ = (a);\
        if(clcheck_status_ != CL_SUCCESS)\
        {\
            std::cerr << "OpenCL Error(" << clcheck_status_ << ") at " << __LINE__ << std::endl;\
            return -1;\
        }\
    } while(0)
    
    int main()
    {
        cl_int err = CL_SUCCESS;
    
        int *aH = NULL;
        int *bH = NULL;
        int *outH = NULL;
    
      	cl_uint platnum, devnum;
      	cl_device_id dev;
      	cl_platform_id plat;
      	err = clGetPlatformIDs(0, 0, &platnum);
      	CLCheck(err);
      	cl_platform_id pfids[platnum];
        err = clGetPlatformIDs(platnum, pfids, &platnum);
        CLCheck(err);
    
        if(!platnum)
        {
            std::cerr << "No platform found." << std::endl;
            return -1;
        }
        else
            std::cout << platnum << " OpenCL platform(s) found.
" << std::endl;
    
        for(unsigned int i = 0; i != platnum; i++)
        {
            char buf[4096];
            
            err = clGetDeviceIDs(pfids[i], CL_DEVICE_TYPE_ALL, 0, 0, &devnum);
            CLCheck(err);
            cl_device_id devids[devnum];
            err = clGetDeviceIDs(pfids[i], CL_DEVICE_TYPE_ALL, devnum, devids, &devnum);
            CLCheck(err);
            if(!devnum)
            {
                std::cerr << "No device found." << std::endl;
                return -1;
            }
            else
                std::cout << " " << devnum << " OpenCL device(s) found.
" << std::endl;
    
            for(unsigned int i2 = 0; i2 != devnum; i2++)
            {
                char buf[1024];
                std::cout << ": 
	Name: " << buf;
                err = clGetDeviceInfo(devids[i2], CL_DEVICE_VENDOR, 1024, buf, NULL);
                CLCheck(err);
                if(!strncmp(buf, "Intel", 5))
                {
                    dev = devids[0];
                    plat = pfids[i];
                    std::cout << "
	Found Intel(R) OpenCL device.";
                }
            }
        }
        cl_context_properties ctxprop[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)plat, 0};
        cl_context ctx = clCreateContext(ctxprop, 1, &dev, NULL, NULL, &err);
        CLCheck(err);
    
        cl_program program = clCreateProgramWithSource(ctx, 1, progsrc, NULL, &err);
        CLCheck(err);
        err = clBuildProgram(program, 1, &dev, "", NULL, NULL);
        if(err != CL_SUCCESS)
        {
            size_t bufsz;
    		err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, 0, &bufsz);
    		char buf[bufsz];
    		err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, bufsz, buf, &bufsz);
    		std::cerr << "OpenCL program building failed: " << buf << std::endl;
    		return -1;
    	}
    	err = clUnloadCompiler();
    	CLCheck(err);
    
        aH = new int[iterations];
        bH = new int[iterations];
        outH = new int[iterations];
        memset(outH, 0, iterations*sizeof(int));
        for(int i = 0; i != iterations; i++)
        {
            aH[i] = i;
            bH[i] = i*2;
        }
    
        cl_mem aCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
        cl_mem bCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
        CLCheck(err);
        cl_mem outCL = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, iterations, NULL, &err);
        CLCheck(err);
    
        cl_kernel krnl = clCreateKernel(program, "add", &err);
        CLCheck(err);
    
        err = clSetKernelArg(krnl, 0, sizeof(aCL), &aCL);
        CLCheck(err);
        err = clSetKernelArg(krnl, 1, sizeof(bCL), &bCL);
        CLCheck(err);
        err = clSetKernelArg(krnl, 2, sizeof(outCL), &outCL);
        CLCheck(err);
    
        cl_command_queue cmdqueue = clCreateCommandQueue(ctx, dev, 0, &err);
        cl_event evt;
        size_t global_work_size[1] = { iterations };
        err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations, aH, 0, NULL, NULL);
        err = clEnqueueWriteBuffer(cmdqueue, bCL, CL_TRUE, 0, iterations, bH, 0, NULL, NULL);
        err = clEnqueueNDRangeKernel(cmdqueue, krnl, 1, NULL, global_work_size, NULL, 0, NULL, &evt);
        err = clWaitForEvents(1, &evt);
        err = clEnqueueReadBuffer(cmdqueue, outCL, CL_TRUE, 0, iterations, outH, 0, NULL, &evt);
    
        for(int i = 0; i != iterations; i++)
        {
            std::cout << outH[i] << std::endl;
        }
    
        err = clReleaseEvent(evt);
        err = clReleaseCommandQueue(cmdqueue);
        err = clReleaseKernel(krnl);
        err = clReleaseMemObject(outCL);
        err = clReleaseMemObject(bCL);
        err = clReleaseMemObject(aCL);
        err = clReleaseProgram(program);
        err = clReleaseContext(ctx);
    
        if(aH)
            delete aH;
        if(bH)
            delete bH;
        if(outH)
            delete outH;
        return 0;
    }

output:


    2 OpenCL platform(s) found.
    
    Platform 0 :
            Name: NVIDIA CUDA
            Vendor: NVIDIA Corporation
            Profile: FULL_PROFILE
            Version: OpenCL 1.1 CUDA 4.0.1
            Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing c
    l_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing c
    l_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll
    
     1 OpenCL device(s) found.
    
      Device 0:
            Name: GeForce GT 425M
            Vendor: NVIDIA Corporation
            Profile: FULL_PROFILE
            Driver version: 280.26
            OpenCL version: OpenCL C 1.1
            Version: OpenCL 1.1 CUDA
            Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing c
    l_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing c
    l_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll  cl_khr_g
    lobal_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32
    _base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64
    
    Platform 1 :
            Name: Intel(R) OpenCL
            Vendor: Intel(R) Corporation
            Profile: FULL_PROFILE
            Version: OpenCL 1.1
            Extensions: cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_i
    nt32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extende
    d_atomics cl_khr_byte_addressable_store cl_intel_printf cl_ext_device_fission cl
    _intel_immediate_execution cl_khr_gl_sharing cl_khr_icd
    
     1 OpenCL device(s) found.
    
      Device 0:
            Name: Intel(R) Core(TM) i3 CPU       M 370  @ 2.40GHz
            Found Intel(R) OpenCL device.
            Vendor: Intel(R) Corporation
            Profile: FULL_PROFILE
            Driver version: 1.1
            OpenCL version: OpenCL C 1.1
            Version: OpenCL 1.1 (Build 15293.6650)
            Extensions: cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_i
    nt32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extende
    d_atomics cl_khr_byte_addressable_store cl_intel_printf cl_ext_device_fission cl
    _intel_immediate_execution cl_khr_gl_sharing
    
    krnl: 0 = 0 + 0
    krnl: 1 = 1 + 2
    krnl: 2 = 2 + 4
    krnl: 3 = 3 + 6
    krnl: 4 = 4 + 8
    krnl: 5 = 0 + 0
    krnl: 6 = 0 + 0
    krnl: 7 = 0 + 0
    krnl: 16 = 0 + 492859489
    krnl: 17 = 0 + -1042621749
    krnl: 18 = 0 + 1310105771
    krnl: 19 = 0 + 134230852
    krnl: 8 = 0 + 0
    krnl: 9 = 0 + 0
    krnl: 10 = 0 + -1094462526
    krnl: 11 = 0 + -1094462526
    krnl: 12 = 0 + -1230120245
    krnl: 13 = 0 + 500723958
    krnl: 14 = 0 + 530164160
    krnl: 15 = 0 + 492859489
    0
    1
    2
    3
    4
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0

Thanks :slight_smile:

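Oh, it seems that I forgot to use iterations*sizeof(int) instead of just iterations when calling the buffer functions (clCreateBuffer, clEnqueueWriteBuffer, clEnqueueReadBuffer). These take sizes in bytes, so with 4-byte ints only 1/4 of the data was allocated and transferred. That's solved now, thanks to Mystical at http://stackoverflow.com/questions/7627 … -sometimes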