Hello,
I’ve just started with OpenCL and a problem I can’t figure out how to solve:
I want to implement a Gauss-Filter in the kernel. I use Image2D-objects for the source- and destination-image.
My problem is, when I repeatedly execute the same exact code (of course execution is after compiling) I get three different output images:
- the source image
- the blurred image (that’s what always should be the output)
- a fully black image
I would really appreciate your help. Thank you very much.
Here comes the code:
device_gauss.cl:
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void gauss(__read_only image2d_t src, __write_only image2d_t dst, __global float *weighter,
int offset, int rows, int cols) {
int i,j;
float sum = 0.0f;
float norm = 0.0f;
//Get coordinates
int gid0 = get_global_id(0);
int gid1 = get_global_id(1);
float4 pixel;
float4 pixeltemp;
int dist;
pixel=read_imagef(src,sampler,(int2)(gid0,gid1));
for (i = -offset; i <= offset; i++) {
for (j = -offset; j <= offset; j++) {
// sum up the color values of the neighbour pixels
pixeltemp = read_imagef(src,sampler,(int2)(gid0+i,gid1 +j));
dist = (int) (sqrt((i*i)+(j*j))+0.5);
sum += weighter[dist] * pixeltemp.x;
norm += weighter[dist];
}
}
pixel.x = (sum)/(norm);
if(norm>0.0f)
write_imagef (dst,(int2)(gid0, gid1),pixel);
}
host_gauss.c
#include <CL/cl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <math.h>
#include "load_write_pgm.h"
#define DIMENSION 2
#define WEIGHTER_SIZE 13
#define MILLION 1000000.0
void check(cl_int*);
void copyimg_to_linear(float *src, float *dst, struct imageMatrix* img);
void save_1dimage(float *src, struct imageMatrix * img, char* filename);
void do2Dto1D(struct imageMatrix* source, float* destination);
void do1Dto2D(float* source, struct imageMatrix* destination);
void init_weight_array(size_t size, float* weighter);
int main() {
cl_int err;
cl_event event;
cl_uint amount;
int i;
//Get platforms
clGetPlatformIDs(NULL, NULL, &amount);
cl_platform_id *platform = (cl_platform_id*) malloc(amount
* sizeof(cl_platform_id));
err = clGetPlatformIDs(amount, platform, NULL);
check(&err);
//Get devices of first platform
clGetDeviceIDs(*platform, CL_DEVICE_TYPE_GPU, NULL, NULL, &amount);
cl_device_id *devices = (cl_device_id*) malloc(amount
* sizeof(cl_device_id));
err = clGetDeviceIDs(*platform, CL_DEVICE_TYPE_GPU, amount, devices, NULL);
check(&err);
//Get MaxWorkGroupSize
size_t workgroup = 0;
err = clGetDeviceInfo(*devices, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(workgroup), &workgroup, NULL);
check(&err);
int workgroupsize = (int) workgroup;
//Create a context
cl_context ctx;
cl_context_properties props[] = { CL_CONTEXT_PLATFORM,
(cl_context_properties) *platform, NULL };
ctx = clCreateContext(props, amount, devices, NULL, NULL, &err);
check(&err);
//Create a command queue for all devices associated with the first platform
cl_command_queue *queues = (cl_command_queue*) malloc(amount * sizeof(cl_command_queue));
for (i = 0; i < amount; i++) {
queues[i] = clCreateCommandQueue(ctx, devices[i],
CL_QUEUE_PROFILING_ENABLE, &err);
}
//Read source from file
FILE *f;
f = fopen("kernels/device_gauss.cl", "r");
struct stat finfo;
if (f == NULL) {
printf("ERROR: Could not load file.
");
exit(EXIT_FAILURE);
}
stat("kernels/device_gauss.cl", &finfo);
char *buffer = (char*) malloc(finfo.st_size + 1);
char c;
i = 0;
while ((c = getc(f)) != EOF) {
buffer[i] = c;
i++;
}
buffer[i] = '\0';
//Create program by source
cl_program program;
program = clCreateProgramWithSource(ctx, 1, &buffer, NULL, &err);
check(&err);
//Build program
err = clBuildProgram(program, NULL, NULL, NULL, NULL, NULL);
check(&err);
//Create kernel object
cl_kernel gauss_kernel;
gauss_kernel = clCreateKernel(program, "gauss", &err);
check(&err);
//Start measuring
cl_ulong start;
clEnqueueMarker(queues[0], &event);
clFinish(queues[0]);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start, NULL);
//Create buffers
cl_mem d_weighter;
float* h_src, *h_dst, *weighter;
struct imageMatrix img;
int dim[2];
char* filename = "pictures/pgm.pgm";
int offset = WEIGHTER_SIZE / 2;
//method to load .pgm-file
load_pgm_image(&img, filename);
dim[0] = img.i_rows;
dim[1] = img.i_cols;
//Host-Source-Image
h_src = (float*) malloc(dim[0] * dim[1] * sizeof(float));
//Host-Destination-Image
h_dst = (float*) malloc(dim[0] * dim[1] * sizeof(float));
//Copy 2D-Image to a 1D-Array
copyimg_to_linear(h_src, h_dst, &img);
//fill destination-image with zeros
h_dst = (float*) calloc(dim[0] * dim[1], sizeof(float));
//********Create IMG Objects**********
cl_mem d_src_2d;
cl_mem d_dst_2d;
cl_image_format format;
format.image_channel_order = CL_R;
format.image_channel_data_type = CL_FLOAT;
printf("%i
", (int) sizeof(CL_UNSIGNED_INT8));
int num_channels_per_pixel = 1;
int channel_size = (int) sizeof(CL_UNSIGNED_INT8);
int pixel_size = num_channels_per_pixel * channel_size;
//Create Image2D-Objects für source and destination
d_src_2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &format, dim[1], dim[0],
0, NULL, &err);
check(&err);
d_dst_2d = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &format, dim[1], dim[0],
0, NULL, &err);
check(&err);
//create weight for neighbor-pixel
weighter = (float *) malloc(WEIGHTER_SIZE * sizeof(float));
init_weight_array(WEIGHTER_SIZE, weighter);
d_weighter = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR
| CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, WEIGHTER_SIZE
* sizeof(float), weighter, &err);
check(&err);
//Set kernel parameter
err = clSetKernelArg(gauss_kernel, 0, sizeof(cl_mem), (void*) &d_src_2d);
check(&err);
err = clSetKernelArg(gauss_kernel, 1, sizeof(cl_mem), (void*) &d_dst_2d);
check(&err);
err = clSetKernelArg(gauss_kernel, 2, sizeof(cl_mem), &d_weighter);
check(&err);
err = clSetKernelArg(gauss_kernel, 3, sizeof(int), &offset);
check(&err);
err = clSetKernelArg(gauss_kernel, 4, sizeof(int), &dim[1]);
check(&err);
err = clSetKernelArg(gauss_kernel, 5, sizeof(int), &dim[0]);
check(&err);
//Copy image to device
size_t origin[] = { 0, 0, 0 };
size_t region[] = { dim[1], dim[0], 1 };
err = clEnqueueWriteImage(queues[0], d_src_2d, CL_TRUE, origin, region, 0,
0, h_src, 0, NULL, &event);
check(&err);
err = clEnqueueWriteImage(queues[0], d_dst_2d, CL_TRUE, origin, region, 0,
0, h_dst, 0, NULL, &event);
check(&err);
//Enqueue kernel execution command in command queue
int remainder1 = dim[1] % workgroupsize;
int remainder2 = dim[0] % workgroupsize;
size_t global_size[] = { dim[1] + workgroup - remainder1, dim[0]
+ workgroup - remainder2 };
err = clEnqueueNDRangeKernel(queues[0], gauss_kernel, 2, NULL, global_size,
NULL, NULL, NULL, &event);
check(&err);
//Wait for execution of the gauss algorithm
err = clWaitForEvents(1, &event);
check(&err);
//Download result from device memory
err = clEnqueueReadImage(queues[0], d_dst_2d, CL_TRUE, origin, region, 0,
0, h_dst, NULL, NULL, NULL);
check(&err);
//Measure elapsed time
cl_ulong end;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
&end, NULL);
clFinish(queues[0]);
float elapsed = (end - start) / MILLION;
printf("Elapsed time: %f ms
", elapsed);
//Write result image to disk
save_1dimage(h_dst, &img, "pictures/result.pgm");
clReleaseMemObject(d_dst_2d);
clReleaseMemObject(d_src_2d);
free(h_src);
free(h_dst);
free(weighter);
destroyImageMatrix(&img);
return 0;
}
void copyimg_to_linear(float *src, float *dst, struct imageMatrix* img) {
do2Dto1D(img, src);
do2Dto1D(img, dst);
}
void save_1dimage(float *src, struct imageMatrix* img, char* filename) {
do1Dto2D(src, img);
write_pgm_image(img, filename);
}
/*Transform a 2D array in a 1D Array*/
void do2Dto1D(struct imageMatrix* source, float* destination) {
int i, j;
for (i = 0; i < source->i_rows; i++) {
for (j = 0; j < source->i_cols; j++) {
destination[j + (i * source->i_cols)] = source->imageMatrix[i][j];
}
}
}
/**
*Transform a 1D array in a 2D Array
*/
void do1Dto2D(float* source, struct imageMatrix* destination) {
int i, j;
for (i = 0; i < destination->i_rows; i++) {
for (j = 0; j < destination->i_cols; j++) {
destination->imageMatrix[i][j] = source[j + (i
* destination->i_cols)];
}
}
}
/**
*
*/
void init_weight_array(size_t size, float* weighter) {
int offset = size / 2;
int fwhm = 5;
/*
* Given as parameter
* FWHM = 2 sqrt(2 ln2) sigma ~ 2.35 sigma
*/
float a = (fwhm / 2.354);
int i;
/* set up kernel to weight the pixels */
/* (KERNEL_SIZE - offset -1) is the CORRECT version */
for (i = -offset; i <= (size - offset - 1); i++) {
weighter[i + offset] = exp(-i * i / (2 * a * a));
}
}
void check(cl_int *err) {
switch (*err) {
case CL_SUCCESS:
return;
break;
case CL_DEVICE_NOT_FOUND:
printf("Device not found.");
break;
case CL_DEVICE_NOT_AVAILABLE:
printf("Device not available");
break;
case CL_COMPILER_NOT_AVAILABLE:
printf("Compiler not available");
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
printf("Memory object allocation failure");
break;
case CL_OUT_OF_RESOURCES:
printf("Out of resources");
break;
case CL_OUT_OF_HOST_MEMORY:
printf("Out of host memory");
break;
case CL_PROFILING_INFO_NOT_AVAILABLE:
printf("Profiling information not available");
break;
case CL_MEM_COPY_OVERLAP:
printf("Memory copy overlap");
break;
case CL_IMAGE_FORMAT_MISMATCH:
printf("Image format mismatch");
break;
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
printf("Image format not supported");
break;
case CL_BUILD_PROGRAM_FAILURE:
printf("Program build failure");
break;
case CL_MAP_FAILURE:
printf("Map failure");
break;
case CL_INVALID_VALUE:
printf("Invalid value");
break;
case CL_INVALID_DEVICE_TYPE:
printf("Invalid device type");
break;
case CL_INVALID_PLATFORM:
printf("Invalid platform");
break;
case CL_INVALID_DEVICE:
printf("Invalid device");
break;
case CL_INVALID_CONTEXT:
printf("Invalid context");
break;
case CL_INVALID_QUEUE_PROPERTIES:
printf("Invalid queue properties");
break;
case CL_INVALID_COMMAND_QUEUE:
printf("Invalid command queue");
break;
case CL_INVALID_HOST_PTR:
printf("Invalid host pointer");
break;
case CL_INVALID_MEM_OBJECT:
printf("Invalid memory object");
break;
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
printf("Invalid image format descriptor");
break;
case CL_INVALID_IMAGE_SIZE:
printf("Invalid image size");
break;
case CL_INVALID_SAMPLER:
printf("Invalid sampler");
break;
case CL_INVALID_BINARY:
printf("Invalid binary");
break;
case CL_INVALID_BUILD_OPTIONS:
printf("Invalid build options");
break;
case CL_INVALID_PROGRAM:
printf("Invalid program");
break;
case CL_INVALID_PROGRAM_EXECUTABLE:
printf("Invalid program executable");
break;
case CL_INVALID_KERNEL_NAME:
printf("Invalid kernel name");
break;
case CL_INVALID_KERNEL_DEFINITION:
printf("Invalid kernel definition");
break;
case CL_INVALID_KERNEL:
printf("Invalid kernel");
break;
case CL_INVALID_ARG_INDEX:
printf("Invalid argument index");
break;
case CL_INVALID_ARG_VALUE:
printf("Invalid argument value");
break;
case CL_INVALID_ARG_SIZE:
printf("Invalid argument size");
break;
case CL_INVALID_KERNEL_ARGS:
printf("Invalid kernel arguments");
break;
case CL_INVALID_WORK_DIMENSION:
printf("Invalid work dimension");
break;
case CL_INVALID_WORK_GROUP_SIZE:
printf("Invalid work group size");
break;
case CL_INVALID_WORK_ITEM_SIZE:
printf("Invalid work item size");
break;
case CL_INVALID_GLOBAL_OFFSET:
printf("Invalid global offset");
break;
case CL_INVALID_EVENT_WAIT_LIST:
printf("Invalid event wait list");
break;
case CL_INVALID_EVENT:
printf("Invalid event");
break;
case CL_INVALID_OPERATION:
printf("Invalid operation");
break;
case CL_INVALID_GL_OBJECT:
printf("Invalid OpenGL object");
break;
case CL_INVALID_BUFFER_SIZE:
printf("Invalid buffer size");
break;
case CL_INVALID_MIP_LEVEL:
printf("Invalid mip-map level");
break;
default:
printf("Unknown");
break;
}
printf("
");
}