Problem using image on OPENCL

Hello, I’m new to this type of forum (and not a native English speaker, so sorry for any errors or mistakes).

I’m starting with OpenCL for a college course, and I’m having a problem.
I want to open an image and then apply an effect to it, but first I’m just trying to put the image in the buffer and have the kernel copy it to the output buffer, to see whether the image can be opened and saved.
The problem I have right now is that the image can be opened (and if I just do fread and fwrite, it works), but with OpenCL only the first line of the image is processed, not the whole image, so the result image has the first line of pixels and the rest is just black.
(I’m trying to do this on a 1280x720 image, by the way.)
The code is this one:

#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "main.h"

#define MAX_SOURCE_SIZE (0x100000)

int main(void) {

	unsigned int image_width = 1280;
	unsigned int image_height = 720;

	printf("Program Started

	FILE *FicheiroInput;
	FicheiroInput = fopen("leo.rgba", "rb");

	unsigned int *sourceImage =
		(unsigned int*)malloc(sizeof(unsigned int) * image_width * image_height);

	fread(&sourceImage[0], sizeof(unsigned int), image_width * image_height, FicheiroInput);


	unsigned int *outputImage =
		(unsigned int*)malloc(sizeof(unsigned int) * image_width * image_height);

	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;

	fp = fopen("", "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	printf("kernel loading done

	// Get platform and device information
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;

	cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
	cl_platform_id *platforms = NULL;
	platforms = (cl_platform_id*)malloc(ret_num_platforms * sizeof(cl_platform_id));

	ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
	printf("ret at %d is %d
", __LINE__, ret);

	ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 1,
		&device_id, &ret_num_devices);
	printf("ret at %d is %d
", __LINE__, ret);

	// Create an OpenCL context
	cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	printf("ret at %d is %d
", __LINE__, ret);

	// Create a command queue
	cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	printf("ret at %d is %d
", __LINE__, ret);


	cl_image_desc desc;
	desc.image_type = CL_MEM_OBJECT_IMAGE2D;
	desc.image_width = image_width;
	desc.image_height = image_height;
	desc.image_depth = 0;
	desc.num_samples = 0;
	desc.image_array_size = 0;
	desc.image_row_pitch = 0;
	desc.image_slice_pitch = 0;
	desc.image_row_pitch = 0;
	desc.num_mip_levels = 0;
	desc.num_samples = 0;
	desc.buffer = NULL;

	cl_image_format format;
	format.image_channel_order = CL_RGBA;
	format.image_channel_data_type = CL_UNORM_INT8;

	//create memory buffer on the device for each image
	cl_mem input_image = clCreateImage(context, CL_MEM_READ_ONLY,
		&format, &desc, NULL, &ret);
	printf("ret at %d is %d
", __LINE__, ret);

	cl_mem output_image = clCreateImage(context, CL_MEM_WRITE_ONLY,
		&format, &desc, NULL, &ret);
	printf("ret at %d is %d
", __LINE__, ret);

	size_t origin_image[3] = { 0,0,0 };

	size_t size_image[3] = { image_width, image_height, 1 };

	ret = clEnqueueWriteImage(command_queue, input_image, CL_FALSE,
		origin_image, size_image, 0, 0, sourceImage, 0, 0, 0);
	printf("ret at %d is %d
", __LINE__, ret);

	cl_sampler sampler = clCreateSampler(

	printf("before building
	// Create a program from the kernel source
	cl_program program = clCreateProgramWithSource(context, 1,
		(const char **)&source_str, (const size_t *)&source_size, &ret);
	printf("ret at %d is %d
", __LINE__, ret);

	// Build the program
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
	printf("ret at %d is %d
", __LINE__, ret);

	printf("after building

	cl_kernel kernel = clCreateKernel(program, "convolution", &ret);
	printf("ret at %d is %d
", __LINE__, ret);

	// Set the arguments of the kernel

	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
	printf("ret at %d is %d
", __LINE__, ret);

	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
	printf("ret at %d is %d
", __LINE__, ret);

	ret = clSetKernelArg(kernel, 2, sizeof(image_height), &image_height);
	printf("ret at %d is %d
", __LINE__, ret);

	ret = clSetKernelArg(kernel, 3, sizeof(image_width), &image_width);
	printf("ret at %d is %d
", __LINE__, ret);

	ret = clSetKernelArg(kernel, 4, sizeof(cl_sampler), (void *)&sampler);
	printf("ret at %d is %d
", __LINE__, ret);

	printf("before execution
	// Execute the OpenCL kernel on the list

	size_t global_item_size[2] = { image_width , image_height };//Process // ; the entire lists

	size_t local_item_size[2] = { 80,80 }; // Divide work items into groups of 80

	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
		global_item_size, local_item_size, 0, NULL, NULL);

	printf("after execution

	ret = clEnqueueReadImage(command_queue, output_image, CL_TRUE,
		origin_image, size_image, 0, 0, outputImage, 0, 0, 0);

	FILE *FicheiroOutput = fopen("leopard2.rgba", "wb");

	fwrite(&outputImage[0], sizeof(unsigned int), image_height * image_width, FicheiroOutput);


	return 0;
/*
 * Copy kernel: each work-item reads the pixel at its (x, y) global ID from
 * sourceImage through the given sampler and writes it unchanged to the same
 * position in outputImage.
 *
 * rows and cols are part of the signature but are not used in the body.
 *
 * NOTE(review): the __kernel qualifier, the braces around the body and the
 * start of the two print statements were missing from the listing and are
 * restored here; the exact format strings are a guess.
 */
__kernel void convolution(
	__read_only image2d_t sourceImage,
	__write_only image2d_t outputImage,
	int rows,
	int cols,
	sampler_t sampler)
{
	int x = get_global_id(0);
	int y = get_global_id(1);

	float4 pixel = { 0.0f, 0.0f, 0.0f, 0.0f };

	pixel = read_imagef(sourceImage,  sampler, (int2)(x , y ));
	// Debug output of the work-item coordinates.
	printf("%d\n", x);
	printf("%d\n", y);

	write_imagef(outputImage, (int2)(x, y), pixel);
}


I’ve tried a lot of things, but I’m new to this and I have no teacher or anyone else to help me, so please help :frowning:

I suspect it’s your local_item_size of {80,80} – that’s 6400 items in a workgroup, and most hardware can’t do that big (check the device info for the hardware max, it might be something like 128, 256, or 1024). To test this, in the clEnqueueNDRangeKernel call, try passing NULL for the local workgroup size. If that fixes it, the too-large workgroup size was the problem.

I did what you suggested: I passed NULL to clEnqueueNDRangeKernel for the local workgroup size, and it does the same as before (but thanks for the info, I’ll leave it at NULL :slight_smile:).
I even have prints in the kernel to print the values of X and Y; the X value goes through all the values from 1 to 1280 (in random order), but the Y value is always 0.

Check your third parameter to clEnqueueNDRangeKernel (“work_dim”) – you have it set to 1, but it should be 2.

Thanks a lot, it worked :smiley: It does what I wanted, and now I can work with it :smiley: