Error : CL_BUILD_PROGRAM_FAILURE, the source code seems ok

skdysch · August 26, 2014, 1:48am

Hi all

I’m relatively new to the OpenCL, therefore producing tons of errors and have lots of questions.
would really appreciate if you’d help me

I am using OpenCL for Nvidia NVS 5200m quadro graphic card

basically I want to go through the grey scaled image of siye 352x288 and check if the value of it is equal to 15.0.
here’s my host code:

#include <stdio.h>	
#include <stdlib.h>
 
#include <CL/cl.h>	
#include <cv.h>

#include <create2dmap.h>
#include <Node.h>
#include <RandomTree.h>

 
#define MAX_SOURCE_SIZE (0x100000)	
 
void runopencl(std::vector<Node*> Forest, cv::Mat DepthImage, int width, int height)
{	
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem resultBuf = NULL;	
	cl_mem depthBuf = NULL;
	cl_mem matrixBuf = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;	
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret;
 
	int i, j;
	float *result = new float[101376];
	float *depthArray = new float[101376];
	float Matrix[1024][16];
	float *MatrixInLine = new float[1024*16];
	
	cl_float16 Matrix16[1024];


	for (int y = 0; y < height; y++)
		for (int x = 0; x < width; x++)
			{depthArray[x+y*width]=DepthImage.at<float>(y,x);
			result[x+y*width]=0;}

	converttree(Forest, Matrix);

	for (int y = 0; y < 1024; y++)
		for (int x = 0; x < 16; x++)
			MatrixInLine[y*16+x] = Matrix[y][x];

	FILE *fp;
	const char fileName[] = "D:\\USERDATA\\aevaus2\
ewADTF\\ADTFDevelopment\\src\\oclPersonSegmentationFilter\\classifypixel.cl";
	size_t source_size;
	char *source_str;
 
	// Load kernel source file 
	fp = fopen(fileName, "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.
");	
		exit(1);
	}	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
 

 
	// Get Platform/Device Information
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);	
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
 	// Create OpenCL Context 
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
 	// Create command queue 
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	
	// Create Buffer Object 
	depthBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, 101376*sizeof(float), NULL, &ret);
	matrixBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, 1024*16*sizeof(float), NULL, &ret); 
	resultBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, 101376*sizeof(float), NULL, &ret);

 
	/// Copy input data to the memory buffer 
	ret = clEnqueueWriteBuffer(command_queue,  depthBuf, CL_TRUE, 0, 101376*sizeof(float), depthArray, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, matrixBuf, CL_TRUE, 0, 1024*16*sizeof(float), &MatrixInLine, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, resultBuf, CL_TRUE, 0, 101376*sizeof(float), result, 0, NULL, NULL);
 
	// Create kernel program from source file
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);	
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
	// Create data parallel OpenCL kernel 	
	kernel = clCreateKernel(program, "dataParallel", &ret);
			
	// Set OpenCL kernel arguments 
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&depthBuf);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&matrixBuf);
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&resultBuf);
	
 
	size_t global_item_size[2] = {352,288};
	size_t local_item_size[2] = {1,1};
			
	// Execute OpenCL kernel as data parallel 
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_item_size, local_item_size, 0, NULL, NULL);
 
	// Transfer result to host 
	ret = clEnqueueReadBuffer(command_queue, resultBuf, CL_TRUE, 0, 101376*sizeof(float), result, 0, NULL, NULL);
 
	//Finalization 
	ret = clFlush(command_queue);	
	ret = clFinish(command_queue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(resultBuf);
	ret = clReleaseMemObject(depthBuf);
	ret = clReleaseMemObject(matrixBuf);

	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);	
 
	free(source_str);
 	
}

and my simple kernel code

__kernel void dataParallel(__global float* depthImage, __global float* MatrixInLinePtr, __global float* result)
{
	int x = get_gloabal_id(0);
	int y = get_gloabal_id(1);
	
	if ((x>=352 || y>=288) return ;
	
	if (depthImage[x+y*width]==15.0) result[x+y*width] = 14;	
}

as a result I receive error 11, error during building the program, but I don’t get what could be wrong with the kernel

thank you for your attention!

skdysch · August 26, 2014, 2:16am

okay, I’ve fixed the kernel, now it looks like that


__kernel void dataParallel(__constant float* depthImage, __constant float* MatrixInLinePtr, __global float* result)
{
	int x = get_global_id(0);
	int y = get_global_id(1);
	
	if (x>=352 || y>=288) return ;
	
	if (depthImage[x+y*352]==15.0) result[x+y*352] = 14;
}

and I receive error 52 CL_INVALID_KERNEL_ARGS

utnapishtim · September 1, 2014, 12:38am

Check the max size of a constant buffer with CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE.
It is generally 64KB on a GPU, so your buffer is probably too big to fit into a constant buffer.