help on image processing

Stiliyan · September 16, 2014, 6:50am

Hi, this is my first experience with OpenCL. I successfully wrote a simple OpenCL example that adds two vectors using multiple GPUs, and now I am trying to run a Gaussian blur. I found an example on the Internet:
gaussian-blur-using-opencl-and-the-built-in-images-textures by Erik Smistad (I can’t post links).
I am reusing his kernel and createBlurMask function, and this is my host code:

int GaussianBlur(OpenCLass& OCLass)
{
	cl_int errNum;
	cl_device_id * devices = OCLass.getDevices();
	cl_uint devicesCount = OCLass.getDevicesCount();
	cl_image_format format;
	_cl_image_desc desc;
	cv::Mat img = cv::imread("R9.jpg", CV_LOAD_IMAGE_ANYCOLOR | CV_LOAD_IMAGE_ANYDEPTH);
	if (img.empty())
	{
		std::cout << "Couldn't load the image
";
		return 1;
	}
	std::cout << std::endl << img.type() << std::endl;
	std::cout << img.channels();
	cv::Mat transformedImg;

	cv::cvtColor(img, transformedImg, CV_BGR2RGBA);
	std::cout << "
 Transformed image: " << transformedImg.type();
	std::cout << "
 Transformed image channels: " << transformedImg.channels() << std::endl;
	char *buffer = reinterpret_cast<char *>(transformedImg.data);

//	imwrite("RGBA_Image.jpg", transformedImg);
	//defining image format and desc
	format.image_channel_data_type = CL_UNSIGNED_INT8;
	format.image_channel_order = CL_RGBA;
	desc.image_width = transformedImg.cols;
	desc.image_height = transformedImg.rows;

	size_t globalWorkSize = desc.image_width*desc.image_height;
	// Compile OpenCL code
	cl_program gaussianBlur = createProgram(OCLass.getContext(), devicesCount, devices, "GaussianBlur.cl");

	cl_command_queue queue = clCreateCommandQueue(OCLass.getContext(), devices[0], NULL, NULL);

	cl_mem memoryObjects[3];
	// Create an OpenCL Image / texture and transfer data to the device
	memoryObjects[0] = clCreateImage(OCLass.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format, &desc, buffer, &errNum);

	// Create a buffer for the result
	memoryObjects[2] = clCreateBuffer(OCLass.getContext(), CL_MEM_WRITE_ONLY, sizeof(int)*globalWorkSize, NULL, &errNum);
	 
	// Create Gaussian mask
	int maskSize;
	float * mask = createBlurMask(10.0f, &maskSize);

	// Create buffer for mask and transfer it to the device
	memoryObjects[1] = clCreateBuffer(OCLass.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*(maskSize * 2 + 1)*(maskSize * 2 + 1), mask, &errNum);

	// Run Gaussian kernel
	cl_kernel kernel = clCreateKernel(gaussianBlur, "gaussian_blur", &errNum);
	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]);
	errNum = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]);
	errNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]);
	errNum = clSetKernelArg(kernel, 3, sizeof(int), (void*)&maskSize);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Error setting kernel Arguments.
" << std::endl;
		Cleanup(gaussianBlur, kernel, memoryObjects, 3);
		return 1;
	}
	size_t localWorkSize = chooseLocalWorkSize(kernel, devices, 1, globalWorkSize);
	errNum = clEnqueueNDRangeKernel(OCLass.getCommandQueue(0), kernel, 2, 0, &globalWorkSize, NULL, 0, NULL, NULL);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Error queuing kernel for execution." << std::endl;
		Cleanup(gaussianBlur, kernel, memoryObjects, 3);
		return 1;
	}
	// Transfer image back to host
	uchar* data = new uchar[globalWorkSize*4];
	
	clEnqueueReadBuffer(queue, memoryObjects[2], CL_TRUE, 0, sizeof(uchar)*globalWorkSize*4, data, 0, NULL, NULL);
for (unsigned i = 0; i < 100; ++i)
	{
		std::cout << "data[" << i << "] = " << data[i] << std::endl;
	}
	//create cv::Size object for creating a Mat object with proper size
	cv::Size s;
	s.height = desc.image_height;
	s.width = desc.image_width;
	//create cv::Mat object with data
	cv::Mat outputImageRGBA(s, CV_8UC4, data);
	cv::Mat outputImageBGR;
	//convert to BGR because imwrite doc: "Only 8-bit (or 16-bit unsigned (CV_16U) in case of PNG, JPEG 2000, and TIFF) single-channel or 3-channel (with ‘BGR’ channel order) images can be saved using this function."
	cv::cvtColor(outputImageRGBA, outputImageBGR, CV_RGBA2BGR);
	//save to disc with imwrite
	cv::imwrite("BlurredPic.jpg", outputImageBGR);
	Cleanup(gaussianBlur, kernel, memoryObjects, 3);
	delete[] data;
}

The problem is that BlurredPic.jpg is completely black.
The author of the tutorial uses a single-channel image (CL_R), and I am trying to do it with a CL_RGBA memory object. Do I need to change something in the kernel to reflect this difference? Or is the problem in reading back the result buffer, since none of the elements of the data array prints a value?
Thanks in advance!

Stiliyan · September 24, 2014, 5:19am

Hi, I’ve found the mistakes in my code and I am going to share them in case somebody has a similar problem.
1.

// Incorrect setting: CL_UNSIGNED_INT8 is an unnormalized integer channel type, and the kernel samples the image with read_imagef.
inFormat.image_channel_data_type = CL_UNSIGNED_INT8;

must be

// Normalized 8-bit channels: read_imagef returns each channel as a float in [0.0, 1.0].
inFormat.image_channel_data_type = CL_UNORM_INT8;

2. I also started using the clCreateImage2D function instead of clCreateImage, which helped me find out that I had not initialized the row_pitch parameter. (This may not be a fatal error, because the OpenCL documentation says that, when it is 0, it is calculated automatically: image_row_pitch is calculated as image_width * size of element in bytes.)

// Upload the host pixels as a read-only 2D image; the row pitch passed is cols * 4 bytes (presumably four 8-bit channels per pixel).
memoryObjects[0] = clCreateImage2D(OCLass.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &inFormat, transformedImg.cols, transformedImg.rows, transformedImg.cols * 4, transformedImg.data, &errNum);

3. My third memory object is no longer a regular buffer. I am using a second image2d with float data:

// Host array that later receives the blurred pixels
float * data = new float[arrSize];
	// Create an image for the result. The kernel only writes to it, so nothing
	// is copied from the host: with a NULL host pointer the row pitch must be 0.
	outFormat.image_channel_data_type = CL_FLOAT;
	outFormat.image_channel_order = CL_RGBA;
	memoryObjects[2] = clCreateImage2D(OCLass.getContext(), CL_MEM_WRITE_ONLY, &outFormat, transformedImg.cols, transformedImg.rows, 0, NULL, &errNum);

4. The global work size must be set for each dimension:

	// One work-item per pixel: dimension 0 spans the columns, dimension 1 the rows.
	// The casts avoid an int -> size_t narrowing conversion inside the braces.
	size_t glWork[2] = { static_cast<size_t>(transformedImg.cols), static_cast<size_t>(transformedImg.rows) };
	// No global offset and no explicit local size; myEvent signals kernel completion.
	errNum = clEnqueueNDRangeKernel(OCLass.getCommandQueue(0), kernel, 2, NULL, glWork, NULL, 0, NULL, &myEvent);

5.Reading back the result:

cl::size_t<3> origin;
	origin[0] = 0; origin[1] = 0, origin[2] = 0;
	cl::size_t<3> region;
	region[0] = transformedImg.cols; region[1] = transformedImg.rows; region[2] = 1;
	errNum = clEnqueueReadImage(OCLass.getCommandQueue(0), memoryObjects[2], CL_TRUE, origin, region, transformedImg.cols*16, 0 , data, 0,  NULL, NULL);

6.Saving the blurred image:

for (int i = 0; i < arrSize; ++i)
	{
		data[i] *= 255;
	}
...
// Wrap the result array in a four-channel 32-bit float cv::Mat header; the pixel data itself is not copied.
cv::Mat outputImageRGBA(size, CV_32FC4, data);

7.Kernel code:

// Unnormalized pixel coordinates, reads outside the image clamp to the edge, no interpolation.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

/*
 * Gaussian blur of an RGBA image; each work-item produces one output pixel.
 *
 * image        - source image, sampled with read_imagef
 * mask         - (2*maskSize+1) x (2*maskSize+1) weights, indexed as
 *                mask[(a+maskSize) + (b+maskSize)*(2*maskSize+1)]
 * blurredImage - destination image; R, G and B receive the weighted sums,
 *                alpha is written as 1.0
 * maskSize     - mask radius in pixels
 */
__kernel void gaussian_blur(
        __read_only image2d_t image,
        __constant float * mask,
        __write_only image2d_t blurredImage,
        __private int maskSize
    )
{
    const int2 pos = (int2)(get_global_id(0), get_global_id(1));

    // Work-items outside the image (e.g. a padded global size) must not write.
    const int2 dim = get_image_dim(image);
    if (pos.x >= dim.x || pos.y >= dim.y) {
        return;
    }

    const int maskWidth = maskSize * 2 + 1;

    // Sample each neighbor once and accumulate all channels together,
    // instead of re-reading the whole neighborhood for R, G and B.
    float4 sum = (float4)(0.0f);
    for (int a = -maskSize; a <= maskSize; a++) {
        for (int b = -maskSize; b <= maskSize; b++) {
            const float weight = mask[(a + maskSize) + (b + maskSize) * maskWidth];
            sum += weight * read_imagef(image, sampler, pos + (int2)(a, b));
        }
    }

    sum.w = 1.0f;
    write_imagef(blurredImage, pos, sum);
}

akashsrikanth2310 · July 20, 2020, 11:10pm

I was trying out your code, but read_imagef always seems to return 0 in the kernel. Any idea why?