clCreateKernel returning -46 invalid kernel name

I have modified source from example code to do very simple program but repeatedly running into problem. I can start from original working source but I would rather debug it to see what is wrong.
clCreatekernel is returning -46 which means invalid kernel name. But I double checked everything and even did bcompare original vs. modified one (below).
I can not find problem with the kernel name at all. Perhaps more experience opencl dev-s can look and spot something??
source and output below:
PS. Do not worry about file name referring to CUDA, it has nothing to do with CUDA, at least now so ignore the name:

=~=~=~=~=~=~=~=~=~=~=~= MobaXterm log 2020.06.23 10:29:28 =~=~=~=~=~=~=~=~=~=~=~=
ls
Makefile  ex-code-1  ex-code-1.c  ex-code-1.o  p25-cuda  p25-cuda.c  p25-cuda.o
root@sriov-guest:/git.co/dev-learn/rocm/opencl/opencl-programming-guide/cuda-conversion# cat p25-cuda.c 
//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//

// A minimalist OpenCL program.

#include <CL/cl.h>
#include <stdio.h>

#define printDeviceInfo(X)   printf("\n%s: %s",  (X));
#define declareDeviceInfo(X) char str(X)[] = "(X)";

#define NWITEMS 512
// A simple simple_add kernel
const char *source =
"kernel void simple_add(     global uint *c, global uint a, global uint b)  \n"
"{                                                                      \n"
"        *c = a + b;                                                    \n"
"}                                                                      \n";

int main(int argc, char ** argv) {
    int c;
    int * dev_c;

    int stat;
    char str1[100];
    size_t strLen;
    int i;

    // 1. Get a platform.

    cl_platform_id platform;
    clGetPlatformIDs( 1, &platform, NULL );

    // 2. Find a gpu device.

    cl_device_id device;
    cl_device_info deviceInfos[]={CL_DEVICE_NAME, CL_DEVICE_VENDOR, CL_DEVICE_VERSION, CL_DRIVER_VERSION, CL_DEVICE_EXTENSIONS};

    stat = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    for (int i = 0 ; i < sizeof(deviceInfos)/sizeof(cl_device_info); i ++ ) {
        clGetDeviceInfo(device, deviceInfos[i], sizeof(str1), str1, &strLen);

        if (stat == 0)  {
            printf("\n%s.", str1);
        } else {
            printf("\nclGetDevicesIDs FAIL.");
        return 1;
        }
    }    

    // 3. Create a context and command queue on that device.

    cl_context context = clCreateContext( NULL, 1,  &device, NULL, NULL, NULL);
    cl_command_queue queue = clCreateCommandQueue( context, device, 0, NULL );

    // 4. Perform runtime source compilation, and obtain kernel entry point.

    cl_program program = clCreateProgramWithSource( context, 1, &source,  NULL, NULL );
    clBuildProgram( program, 1, &device, NULL, NULL, NULL );
    cl_kernel kernel = clCreateKernel( program, "simple_add", NULL );

    // 5. Create a data buffer.

    cl_mem buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, NULL );

    // 6. Launch the kernel. Let OpenCL pick the local work size.

    size_t global_work_size = NWITEMS;  
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*)2);
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*)7);
    clEnqueueNDRangeKernel( queue, kernel,  1,  NULL, &global_work_size, NULL, 0,  NULL, NULL);
    clFinish( queue );

    // 7. Look at the results via synchronous buffer map.

    cl_uint *ptr;
    ptr = (cl_uint *) clEnqueueMapBuffer( queue, buffer, CL_TRUE, CL_MAP_READ, 0, NWITEMS * sizeof(cl_uint), 0, NULL, NULL, NULL );

/*

    for(i=0; i < NWITEMS; i++) {
        if (i % 16 == 0) 
            printf("\n");

        printf("%03d: %04d. ", i, ptr[i]);
        
    }
*/
    return 0;
}
root@sriov-guest:/git.co/dev-learn/rocm/opencl/opencl-programming-guide/cuda-conversion# 
=~=~=~=~=~=~=~=~=~=~=~= MobaXterm log 2020.06.26 01:08:33 =~=~=~=~=~=~=~=~=~=~=~=
rm p25-cuda ; make p25-cuda ; ./p25-cuda nano -w p25-cuda.c rm p25-cuda ; make p25-cuda ; ./p25-cuda 
g++ -o p25-cuda.o -c p25-cuda.c -I/opt/rocm/opencl//include
In file included from /opt/rocm/opencl//include/CL/cl.h:32:0,
                 from p25-cuda.c:7:
/opt/rocm/opencl//include/CL/cl_version.h:34:104: note: #pragma message: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)
 #pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
                                                                                                        ^
p25-cuda.c: In function 'int main(int, char**)':
p25-cuda.c:66:77: warning: '_cl_command_queue* clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int*)' is deprecated [-Wdeprecated-declarations]
     cl_command_queue queue = clCreateCommandQueue( context, device, 0, &ret );
                                                                             ^
In file included from p25-cuda.c:7:0:
/opt/rocm/opencl//include/CL/cl.h:1813:1: note: declared here
 clCreateCommandQueue(cl_context                     context,
 ^~~~~~~~~~~~~~~~~~~~
g++ -o p25-cuda p25-cuda.o -lOpenCL -L/opt/rocm/opencl//lib/x86_64

gfx900.
Advanced Micro Devices, Inc..
OpenCL 2.0 AMD-APP (3004.5).
3004.5 (PAL,HSAIL).
cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_kh.
Error: clCreateKernel returned non-zero: -46.
root@sriov-guest:/git.co/dev-learn/rocm/opencl/opencl-programming-guide/cuda-conversion# cd ..
root@sriov-guest:/git.co/dev-learn/rocm/opencl/opencl-programming-guide#

Hello!

When I run your program on my system I’m seeing that the previous call to clBuildProgram fails, and then the call to clCreateKernel fails as well. Here is the output from your program with the OpenCL Intercept Layer, with CallLogging enabled:

>>>> clBuildProgram: program = 0x55eebdedec00, pfn_notify = (nil)
<<<< clBuildProgram -> CL_BUILD_PROGRAM_FAILURE
>>>> clCreateKernel: program = 0x55eebdedec00, kernel_name = simple_add
<<<< clCreateKernel: returned (nil) -> CL_INVALID_PROGRAM_EXECUTABLE

It looks like this is caused by an error in your kernel:

kernel void simple_add(     global uint *c, global uint a, global uint b)

Because the scalar kernel arguments a and b are scalars, they should not include an address space, and instead should be:

kernel void simple_add(     global uint *c, uint a, uint b)

Note that I see a slightly different error message CL_INVALID_PROGRAM_EXECUTABLE than you do, but regardless, the kernel error is the root of the problem.

After fixing this error, the program compiles successfully and the kernel can be created, but there is one more issue to fix when setting the scalar kernel arguments:

    clSetKernelArg(kernel, 0, sizeof(bufer), (void*) &buffer);
    cl_uint a = 2;
    clSetKernelArg(kernel, 1, sizeof(a), &a);
    cl_uint b = 7;
    clSetKernelArg(kernel, 2, sizeof(b), &b);

Also, one more small change when either allocating or mapping the buffer, because the current buffer size is only big enough for a single cl_uint:

    cl_uint *ptr;
    ptr = (cl_uint *) clEnqueueMapBuffer( queue, buffer, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, NULL, NULL, NULL );
    printf("output is: %d\n", ptr[0]);

After making these changes I’m seeing 9 printed, which I believe is the intent.

oh god, the error message are misleading. As such, the OPENCL intercept layer seems quite usable, I will dig on that. Thank you for lifeline, my whole self-study has come screenching halt because of it.

hi [bashbaug], do you mind checking your code one more time?
Now although code compiles and runs OK which itself a big step, but I am getting garbage at ptr return from last call to fetch value from GPU:
here is the code I modified according to your post.
I am bit skeptical about passing pointer and then declaring kernel parameter by-value however, it is not relevant because , even with hard assignment of *c to 100, once back to CPU domain, it prints garbage. So there must be something wrong with fetching the value from GPU to CPU address space.

I will also keep looking at it myself:

=~=~=~=~=~=~=~=~=~=~=~= MobaXterm log 2020.06.30 00:40:14 =~=~=~=~=~=~=~=~=~=~=~=
cat p25-cuda.c
//
// Copyright © 2010 Advanced Micro Devices, Inc. All rights reserved.
//

// A minimalist OpenCL program.

#include <CL/cl.h>
#include <stdio.h>

#define printDeviceInfo(X) printf("\n%s: %s", (X));
#define declareDeviceInfo(X) char str(X)[] = “(X)”;

#define NWITEMS 512
// A simple simple_add kernel
const char *source =

“kernel void simple_add( global uint *c, uint a, uint b) \n”
“{ \n”
" *c = a + b; \n"
" *c = 100; \n"
“} \n”;

int main(int argc, char ** argv) {
int c;
int * dev_c;

int stat;
char str1[100];
size_t strLen;
int i;
cl_int ret; 

// 1. Get a platform.

cl_platform_id platform;
clGetPlatformIDs( 1, &platform, NULL );

// 2. Find a gpu device.

cl_device_id device;
cl_device_info deviceInfos[]={CL_DEVICE_NAME, CL_DEVICE_VENDOR, CL_DEVICE_VERSION, CL_DRIVER_VERSION, CL_DEVICE_EXTENSIONS};

stat = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

for (int i = 0 ; i < sizeof(deviceInfos)/sizeof(cl_device_info); i ++ ) {
    clGetDeviceInfo(device, deviceInfos[i], sizeof(str1), str1, &strLen);

    if (stat == 0)  {
        printf("\n%s.", str1);
    } else {
        printf("\nclGetDevicesIDs FAIL.");
    return 1;
    }
}    

printf("\n");

// 3. Create a context and command queue on that device.

cl_context context = clCreateContext( NULL, 1,  &device, NULL, NULL, &ret);

if (ret) {
printf("Error: clCreateContext returned non-zero: %d.\n", ret);
return 1;
}

cl_command_queue queue = clCreateCommandQueue( context, device, 0, &ret );

if (ret) {
printf("Error: clCreateCommandQueue returned non-zero: %d.\n", ret);
return 1;
}
// 4. Perform runtime source compilation, and obtain kernel entry point.

cl_program program = clCreateProgramWithSource( context, 1, &source,  NULL, &ret);

if (ret) {

printf(“Error: clCreateProgramWithSource returned non-zero: %d.\n”, ret);
return 1;
}

clBuildProgram( program, 1, &device, NULL, NULL, NULL );
cl_kernel kernel = clCreateKernel( program, "simple_add", &ret);

if (ret) {

printf(“Error: clCreateKernel returned non-zero: %d.\n”, ret);
return 1;
}

// 5. Create a data buffer.

cl_mem buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, NULL );

// 6. Launch the kernel. Let OpenCL pick the local work size.

size_t global_work_size = NWITEMS;  
clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
cl_uint a = 2;
cl_uint b = 7;
//clSetKernelArg(kernel, 1, sizeof(a), (void*)a);
//clSetKernelArg(kernel, 2, sizeof(b), (void*)b);
clEnqueueNDRangeKernel( queue, kernel,  1,  NULL, &global_work_size, NULL, 0,  NULL, NULL);
ret = clFinish( queue );

if (ret) {

printf(“Error: clFinish returned non-zero: %u”, ret);
return 1;
}

// 7. Look at the results via synchronous buffer map.

cl_uint *ptr;

// ptr = (cl_uint *) clEnqueueMapBuffer( queue, buffer, CL_TRUE, CL_MAP_READ, 0, NWITEMS * sizeof(cl_uint), 0, NULL, NULL, &ret );
ptr = (cl_uint *) clEnqueueMapBuffer( queue, buffer, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, NULL, NULL, NULL );

printf("output is: %d\n", ptr[0]);
/*
if (ptr) {
    for(i=0; i < 4; i++) {
      if (i % 16 == 0) 
            printf("\n");
        printf("\n%03d: %04d. ", i, ptr[i]);
        
    }
    printf("\n");
    return 0;
} else {
    printf("ERROR: clEnqueueMapBuffer returned error, error code: %d.\n", ret);
} 
*/      

}
root@sriov-guest:/git.co/dev-learn/rocm/opencl/opencl-programming-guide/cuda-conversion# cat p25-cuda.c nano -w p25-cuda.c rm p25-cuda ; make p25-cuda ; ./p25-cuda it push
g++ -o p25-cuda.o -c p25-cuda.c -I/opt/rocm/opencl//include
In file included from /opt/rocm/opencl//include/CL/cl.h:32:0,
from p25-cuda.c:7:
/opt/rocm/opencl//include/CL/cl_version.h:34:104: note: #pragma message: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)
#pragma message(“cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)”)
^
p25-cuda.c: In function ‘int main(int, char**)’:
p25-cuda.c:67:77: warning: ‘_cl_command_queue* clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int*)’ is deprecated [-Wdeprecated-declarations]
cl_command_queue queue = clCreateCommandQueue( context, device, 0, &ret );
^
In file included from p25-cuda.c:7:0:
/opt/rocm/opencl//include/CL/cl.h:1813:1: note: declared here
clCreateCommandQueue(cl_context context,
^~~~~~~~~~~~~~~~~~~~
g++ -o p25-cuda p25-cuda.o -lOpenCL -L/opt/rocm/opencl//lib/x86_64

gfx900.
Advanced Micro Devices, Inc…
OpenCL 2.0 AMD-APP (3004.5).
3004.5 (PAL,HSAIL).
cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_kh.
output is: 742154543
root@sriov-guest:/git.co/dev-learn/rocm/opencl/opencl-programming-guide/cuda-conversion#

Also tried printing up to 32 adjacent values and getting following, I am not seeing assigned value there. I also tried index from sub-zero which results in segfault.

// 7. Look at the results via synchronous buffer map.

cl_uint *ptr;
ptr = (cl_uint *) clEnqueueMapBuffer( queue, buffer, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, NULL, NULL, &ret);

if (ret == 0) {
    for (int i = 0; i < 32; i++)
        printf("output is: idx: %d, %d\n", i, ptr[i]);
} else {
    printf("ERROR: clEnqueueMapBuffer returned error, error code: %d.\n", ret);
    printf("output is: %d\n", ptr[0]);
}

cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_kh.
output is: idx: 0, 742154543
output is: idx: 1, 1
output is: idx: 2, 1
output is: idx: 3, 0
output is: idx: 4, 256
output is: idx: 5, 0
output is: idx: 6, 0
output is: idx: 7, 0
output is: idx: 8, 0
output is: idx: 9, 0
output is: idx: 10, 0
output is: idx: 11, 0
output is: idx: 12, 11272256
output is: idx: 13, 16
output is: idx: 14, 655401
output is: idx: 15, 0
output is: idx: 16, 0
output is: idx: 17, 0
output is: idx: 18, 64
output is: idx: 19, 0
output is: idx: 20, 0
output is: idx: 21, 131085
output is: idx: 22, 2
output is: idx: 23, 11
output is: idx: 24, 0
output is: idx: 25, 100926468
output is: idx: 26, 0
output is: idx: 27, 0
output is: idx: 28, 0
output is: idx: 29, 0
output is: idx: 30, -1997035120
output is: idx: 31, 22047

Well, i found all the mistakes I made and pretty much figured out everything, thank you!!!

1 Like