Hi,
I’m working on an application, which compares two bitmaps on GPU and it uses a main kernel to enqueue a second kernel (as multiple work-items), for the actual comparison. The first call to enqueue_kernel seems to be fine, but any subsequent call returns error -10. If the call is set to generate an event, the function appears to hang. The application is running on an Nvidia GPU with OpenCL 3.0, with the latest drivers.
__kernel void MatCmp(
__global uchar* ABackgroundBmp,
__global uchar* ASubBmp,
__global int* AResultedErrCount,
const unsigned int ABackgroundWidth,
const unsigned int ASubBmpWidth,
const unsigned int ASubBmpHeight,
const unsigned int AXOffset,
const unsigned int AYOffset,
const uchar AColorError,
const long ASlaveQueue) //Without passing SlaveQueue to this kernel, it seems that the variable is optimized out in main kernel, and cannot be used properly.
{
int YIdx = get_global_id(0);
//int YIdx = get_local_id(0);
__global uchar const * BGRow = &ABackgroundBmp[((YIdx + AYOffset) * ABackgroundWidth + AXOffset) * 3];
__global uchar const * SubRow = &ASubBmp[(YIdx * ASubBmpWidth) * 3];
int ErrCount = 0;
for (int x = 0; x < ASubBmpWidth; x++)
{
int x0_BG = x * 3 + 0;
int x1_BG = x * 3 + 1;
int x2_BG = x * 3 + 2;
int x0_Sub = x * 3 + 0;
int x1_Sub = x * 3 + 1;
int x2_Sub = x * 3 + 2;
short SubPxB = SubRow[x0_Sub];
short BGPxB = BGRow[x0_BG];
short SubPxG = SubRow[x1_Sub];
short BGPxG = BGRow[x1_BG];
short SubPxR = SubRow[x2_Sub];
short BGPxR = BGRow[x2_BG];
if ((abs(SubPxR - BGPxR) > AColorError) ||
(abs(SubPxG - BGPxG) > AColorError) ||
(abs(SubPxB - BGPxB) > AColorError))
{
ErrCount++;
} //if
} //for
AResultedErrCount[YIdx] = ErrCount;
//AResultedErrCount[YIdx] = get_work_dim(); //Uncomment, to get the value of get_work_dim on a slave kernel.
}
__kernel void SlideSearch(
__global uchar* ABackgroundBmp,
__global uchar* ASubBmp,
__global int* AResultedErrCount,
__global int* ADebuggingInfo,
const unsigned int ABackgroundWidth,
const unsigned int ASubBmpWidth,
const unsigned int ASubBmpHeight,
const unsigned int AXOffset,
const unsigned int AYOffset,
const uchar AColorError,
const long ASlaveQueue,
const unsigned ATotalErrorCount)
{
queue_t SlaveQueue = (queue_t)ASlaveQueue; //get_default_queue() requies OpenCL >= 2.0 and __opencl_c_device_enqueue (so... it may not be available)
clk_event_t AllKernelsEvent;
clk_event_t FinalEvent;
ndrange_t ndrange = ndrange_1D(1, ASubBmpHeight); //defined as ndrange_1D(global, local)
kernel_enqueue_flags_t MyFlags;
MyFlags = CLK_ENQUEUE_FLAGS_NO_WAIT;
int i, j, k = 0;
bool Found = false;
int EnqKrnErr = -1234;
int EnqMrkErr = -4567;
int XOffset = AXOffset;
int YOffset = AYOffset;
int DifferentCount = 0;
for (i = 0; i < YOffset; i++)
{
for (j = 0; j < XOffset; j++)
{
EnqKrnErr = enqueue_kernel(
SlaveQueue,
MyFlags,
ndrange,
0, //comment for err -10
NULL, //comment for err -10
&AllKernelsEvent, //comment for err -10
^{MatCmp(ABackgroundBmp, ASubBmp, AResultedErrCount, ABackgroundWidth, ASubBmpWidth, ASubBmpHeight, i, j, AColorError, ASlaveQueue);});
ADebuggingInfo[0] = EnqKrnErr;
EnqMrkErr = enqueue_marker(SlaveQueue, 1, &AllKernelsEvent, &FinalEvent);
ADebuggingInfo[1] = EnqMrkErr;
release_event(AllKernelsEvent); //should be called here, right after enqueue_marker?
release_event(FinalEvent); //should be called here, right after enqueue_marker?
DifferentCount = 0;
for (k = 0; k < ASubBmpHeight; k++)
DifferentCount += AResultedErrCount[k];
int TotalErrorCount = ATotalErrorCount;
if ((DifferentCount < TotalErrorCount) || (EnqKrnErr < 0))
{
Found = true;
break;
}
} //for j
if (Found || EnqKrnErr < 0)
break;
} // for i
ADebuggingInfo[2] = i;
ADebuggingInfo[3] = j;
ADebuggingInfo[4] = DifferentCount;
ADebuggingInfo[5] = (int)Found;
ADebuggingInfo[6] = get_work_dim();
ADebuggingInfo[7] = get_global_size(1);
ADebuggingInfo[8] = get_local_size(1);
ADebuggingInfo[9] = get_enqueued_local_size(1);
ADebuggingInfo[10] = ATotalErrorCount;
release_event(AllKernelsEvent);
release_event(FinalEvent);
} //func
Full app source code here: GitHub - VCC02/NestedCLKernels: OpenCL experiment with nested kernels
My questions are:
- Why the device appears to hang when lines, marked with “comment for err -10”, are not commented in the .cl file?
In this case, enqueue_kernel should generate an event, and update the “AllKernelsEvent” variable.
On host, clFinish returns CL_INVALID_COMMAND_QUEUE. Subsequent runs cause error -9999 to be returned by clCreateContext.
app log:
3/20/2025 10:54:26 PM Running kernel…
3/20/2025 10:54:30 PM Error CL_INVALID_COMMAND_QUEUE " at “clFinish CmdQueue (Before clEnqueueReadBuffer)” OpenCL API call.
3/20/2025 10:54:30 PM Done running kernel.
3/20/2025 10:56:30 PM Running kernel…
3/20/2025 10:56:30 PM Error Bad OpenCL state. Please restart application. Or maybe release and reload OpenCL. " at “clCreateContext” OpenCL API call. Error is -9999
3/20/2025 10:56:30 PM Done running kernel.
3/20/2025 10:57:05 PM Running kernel…
3/20/2025 10:57:05 PM Error Bad OpenCL state. Please restart application. Or maybe release and reload OpenCL. " at “clCreateContext” OpenCL API call. Error is -9999
3/20/2025 10:57:05 PM Done running kernel.
- What is error -10, returned by enqueue_kernel?
To reproduce, please comment lines, marked with “comment for err -10”, then run. Notice that the kernel exits with j = 1, meaning that at least one enqueue_kernel call returned success (reproducible on 32-bit only, so far).
To understand the log, “ResultedErrCount[]” shows the “AResultedErrCount” array, from kernels, where each work-item updates a single array item.
The other values come from the “ADebuggingInfo” array, as an output from the “SlideSearch” kernel.
app log:
3/20/2025 10:58:45 PM Running kernel…
3/20/2025 10:58:45 PM ErrCount:
ResultedErrCount[0] = 0
ResultedErrCount[1] = 0
ResultedErrCount[2] = 0
ResultedErrCount[n - 4] = 0
ResultedErrCount[n - 3] = 0
ResultedErrCount[n - 2] = 0
ResultedErrCount[n - 1] = 0
3/20/2025 10:58:45 PM Misc info:
enqueue_kernel = -10
enqueue_marker = 0
i = 0
j = 1
DifferentCount = 154
Found = 1
get_work_dim on “SlideSearch” = 1
get_global_size on “SlideSearch” = 1
get_local_size on “SlideSearch” = 1
get_enqueued_local_size on “SlideSearch” = 1
3/20/2025 10:58:45 PM Done running kernel.
- Why get_work_dim() returns 3, instead of 1 in a slave kernel (one that is enqueued with ndrange_1D)?
To reproduce, please uncomment the last line from “MatCmp” kernel, to return get_work_dim, instead of “ErrCount” (reproducible on 32-bit only, so far).
Keep the lines marked with “comment for err -10”, still commented.
On the main kernel (“SlideSearch”), get_work_dim() returns 1, as expected (visible in log).
app log:
3/20/2025 11:00:30 PM Running kernel…
3/20/2025 11:00:30 PM ErrCount:
ResultedErrCount[0] = 3
ResultedErrCount[1] = 3
ResultedErrCount[2] = 3
ResultedErrCount[n - 4] = 3
ResultedErrCount[n - 3] = 3
ResultedErrCount[n - 2] = 3
ResultedErrCount[n - 1] = 3
3/20/2025 11:00:30 PM Misc info:
enqueue_kernel = -10
enqueue_marker = 0
i = 0
j = 1
DifferentCount = 120
Found = 1
get_work_dim on “SlideSearch” = 1
get_global_size on “SlideSearch” = 1
get_local_size on “SlideSearch” = 1
get_enqueued_local_size on “SlideSearch” = 1
3/20/2025 11:00:30 PM Done running kernel.
-
As a continuation of question 3, if get_work_dim() returns 3, does this mean that there are multiple work-items in every dimension, instead of a single dimension?
If yes, are all these additional work-items overwriting the output array, “AResultedErrCount”, at the same index? This array is only ASubBmpHeight items long.
See “YIdx = get_global_id(0);” line. -
In the “SlideSearch” kernel, when should release_event(AllKernelsEvent) and release_event(FinalEvent) be called, right after enqueue_marker, at every iteration, or once at the end of the kernel?
Thank you.