Swapping OpenCL memory buffer does not work

I need to swap OpenCL buffer objects. Basically what I am doing is implementing leapfrog scheme

leapf

but in 3D.

My kernel looks something like

p_tf[i] = p_tp[i] - u*C*(p_tn[i+1] - p_tn[i-1]);

Note that p_tf is some property ‘u’ at time step t+1, p_tn is at time step t, and p_tp is at time step t-1.

I’ve created three 3D arrays to consider the array of time step t+1, t and t-1. And for every time step I need to swap them in order to update them like the following

p_tp = p_tn;
p_tn = p_tf;

So I’ve created three kernels and used clSetKernelArg() to set the arguments in different order, and enqueued kernel calls like the following

kernel1 = clCreateKernel(program, "leapfrog3d", ... );
kernel2 = clCreateKernel(program, "leapfrog3d", ... );
kernel3 = clCreateKernel(program, "leapfrog3d", ... );


clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tp);

clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tn);

clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tf);

for (t=0;t<1000;t++)
{
   clEnqueueNDRangeKernel(cmdQueue, kernel1, 3, ...);
   clEnqueueNDRangeKernel(cmdQueue, kernel2, 3, ...);
   clEnqueueNDRangeKernel(cmdQueue, kernel3, 3, ...);
}

So, when I copy the buffer buf_p_tf and look at it, it should at least show some noise due to numerical instabilities, but I am getting initial values. I can’t find what I am doing wrong.

Here is the full code

#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>

#define CL_TARGET_OPENCL_VERSION 120 
#include <CL/cl.h>
#include "cl_err.h"

// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define clerrchk(arg, e) {printf("  %-40s : %s\n",arg, geterrstr(e));}
#define debug(line) {printf("Here? Line : %d\n",line);}

#define fname "leap3d.nc"

// Variable sizes and dimensions (constants)
#define ndims 4


void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);


int main()
{
int      i,j,k,t;

int      p_xi, p_yi, p_zi,
         c_xi, c_yi, c_zi,
         m_xi, m_yi, m_zi;

int      nx       =  256,
         ny       =  256,
         nz       =  16,
         nt       =  1000;

float    u        =  0.0,
         v        =  1.0,
         w        =  0.0,
         c        =  0.01;

//p_tf : p at future
//p_tn : p at now
//p_tp : p at past

float    p_tf[nz*ny*nx],
         p_tn[nz*ny*nx],
         p_tp[nz*ny*nx];

size_t   n_siz =  sizeof(int),
         p_siz =  sizeof(float) * nx * ny * nz;

int      ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int      dimids[ndims];
size_t   start[ndims], count[ndims];


// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
   err(retval);

// Define dimensions
if ((retval = nc_def_dim(ncid, "z", nz, &z_dimid)))
   err(retval);
if ((retval = nc_def_dim(ncid, "y", ny, &y_dimid)))
   err(retval);
if ((retval = nc_def_dim(ncid, "x", nx, &x_dimid)))
   err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
   err(retval);


// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;

// Variable for writing netCDF data one timestep at a time
count[0] = 1;       // For time dimension : 1 timestep
count[1] = nz;      // For z              : write everything
count[2] = ny;      // For y              : write everything
count[3] = nx;      // For x              : write everything

start[1] = 0;       // For z              : don't do anything
start[2] = 0;       // For y              : don't do anything
start[3] = 0;       // For x              : don't do anything


if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
   err(retval);

if ((retval = nc_enddef(ncid)))
   err(retval);


data_init(nx,ny,nz,(float*)p_tf); 
data_init(nx,ny,nz,(float*)p_tn); 
data_init(nx,ny,nz,(float*)p_tp); 

start[0] = 0;

if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0]))
   err(retval);

// Euler scheme for the first time step
for(k=0;k<nz;k++)
   for(j=0;j<ny;j++)
      for(i=0;i<nx;i++)
      {
         c_xi     = i % nx;
         c_yi     = j % ny;
         c_zi     = k % nz;
         m_xi     = (i+nx-1)%nx;
         m_yi     = (j+ny-1)%ny;
         m_zi     = (k+nz-1)%nz;

         p_tf[c_xi + c_yi * nx + c_zi * nx * ny] = 
         p_tn[c_xi + c_yi * nx + c_zi * nx * ny]
        - u * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[m_xi + c_yi * nx + c_zi * nx * ny])
        - v * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[c_xi + m_yi * nx + c_zi * nx * ny])
        - w * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[c_xi + c_yi * nx + m_zi * nx * ny]);
      }

start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0]))
   err(retval);



// OpenCL part //

// Use this to check the output of each API call
cl_int status;

// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);

// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);

// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
clerrchk("get number of devices", status);

// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));

// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
clerrchk("get device ids", status);

// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
clerrchk("create context", status);

// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
clerrchk("create cmd queue", status);

cl_mem buf_p_tf    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tf", status);
cl_mem buf_p_tn    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tn", status);
cl_mem buf_p_tp    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tp", status);
cl_mem buf_nx      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nx", status);
cl_mem buf_ny      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_ny", status);
cl_mem buf_nz      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nz", status);


status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tf", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tn  , CL_FALSE, 0, p_siz, &p_tn,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tn", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tp  , CL_FALSE, 0, p_siz, &p_tp,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tp", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_nx    , CL_FALSE, 0, n_siz, &nx  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nx", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_ny    , CL_FALSE, 0, n_siz, &ny  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_ny", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_nz    , CL_FALSE, 0, n_siz, &nz  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nz", status);



// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog2.cl","r");

fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '\0';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);

program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
clerrchk("create program", status);


// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

if(status != CL_SUCCESS)
{
   size_t log_size;
   clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

   char *log = (char *) malloc(log_size);

   clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

   printf("%s\n", log);
}

// Create a kernel
cl_kernel kernel1, kernel2, kernel3;
kernel1 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel1", status);
kernel2 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel2", status);
kernel3 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel3", status);


// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel1, 0, sizeof(cl_int), &buf_nx  );
clerrchk("set kerne1l buf_nx", status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_int), &buf_ny  );
clerrchk("set kernel1 buf_ny", status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_int), &buf_nz  );
clerrchk("set kernel1 buf_nz", status);
status = clSetKernelArg(kernel1, 3, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel1 buf_p_tf", status);
status = clSetKernelArg(kernel1, 4, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel1 buf_p_tn", status);
status = clSetKernelArg(kernel1, 5, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel1 buf_p_tp", status);


// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel2, 0, sizeof(cl_int), &buf_nx  );
clerrchk("set kernel2 buf_nx", status);
status = clSetKernelArg(kernel2, 1, sizeof(cl_int), &buf_ny  );
clerrchk("set kernel2 buf_ny", status);
status = clSetKernelArg(kernel2, 2, sizeof(cl_int), &buf_nz  );
clerrchk("set kernel2 buf_nz", status);
status = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel2 buf_p_tp", status);
status = clSetKernelArg(kernel2, 4, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel2 buf_p_tf", status);
status = clSetKernelArg(kernel2, 5, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel2 buf_p_tn", status);


// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel3, 0, sizeof(cl_int), &buf_nx  );
clerrchk("set kernel3 buf_nx", status);
status = clSetKernelArg(kernel3, 1, sizeof(cl_int), &buf_ny  );
clerrchk("set kernel3 buf_ny", status);
status = clSetKernelArg(kernel3, 2, sizeof(cl_int), &buf_nz  );
clerrchk("set kernel3 buf_nz", status);
status = clSetKernelArg(kernel3, 3, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel3 buf_p_tn", status);
status = clSetKernelArg(kernel3, 4, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel3 buf_p_tp", status);
status = clSetKernelArg(kernel3, 5, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel3 buf_p_tf", status);


// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t lclwksiz[3] = {16,16,4}; 
size_t glbwksiz[3] = {nx,ny,nz};


for(t=0;t<200;t++)
{
// Execute the kernel1 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel1", status);

// Execute the kernel2 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel2, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel2", status);

// Execute the kernel3 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel3, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel3", status);
}


// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tn, CL_FALSE, 0, p_siz, &p_cc, 0, NULL, NULL);
clerrchk("enqueue read buffer", status);

// Writing the data into netcdf file
start[0] = 2;

if (retval = nc_put_vara_float(ncid, varid, start, count, &p_cc[0]))
   err(retval);

// Close netcdf data
if ((retval = nc_close(ncid)))
   err(retval);


clFinish(cmdQueue);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseContext(context); 
clReleaseKernel(kernel1);
clReleaseKernel(kernel2);
clReleaseKernel(kernel3);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);

printf("\nDone. . .\n");

return 0;                     
}                             


// Data initialization
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;

int i_min   =  199,
    i_max   =  229,
    j_min   =  127,
    j_max   =  159;

for(k=0;k<in_z_siz;k++)
   for(j=0;j<in_y_siz;j++)
      for(i=0;i<in_x_siz;i++)
         in_arr[k * in_y_siz * in_x_siz + j * in_x_siz +i] = 0.0;

 for(k=0;k<in_z_siz;k++)
   for(j=j_min;j<j_max;j++)
      for(i=i_min;i<i_max;i++)
         in_arr[k * in_y_siz * in_x_siz + j * in_x_siz +i] = 3.0;
}

here is the kernel code

kernel void leapfrog3d(
                        const  int x_siz,
                        const  int y_siz,
                        const  int z_siz,
                        global float *in_p_tf,
                        global float *in_p_tn,
                        global float *in_p_tp
                      )
{
   int nx   =  x_siz;
   int ny   =  y_siz;
   int nz   =  z_siz;
   float u  =  0.0;
   float v  =  1.0;
   float w  =  0.0;
   float C  =  0.01;
   int i    =  get_global_id(0);
   int j    =  get_global_id(1);
   int k    =  get_global_id(2);

   int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;

   int p_xi = (i+1)%nx,
       p_yi = (j+1)%ny, 
       p_zi = (k+1)%nz,
       c_xi = i % nx, 
       c_yi = j % ny, 
       c_zi = k % nz,
       m_xi = (i+nx-1)%nx, 
       m_yi = (j+ny-1)%ny, 
       m_zi = (k+nz-1)%nz;


   idx0     =  c_xi + c_yi * nx + c_zi * nx * ny;

   idx_i0   =  p_xi + c_yi * nx + c_zi * nx * ny;
   idx_j0   =  c_xi + p_yi * nx + c_zi * nx * ny;
   idx_k0   =  c_xi + c_yi * nx + p_zi * nx * ny;
   
   idx_i1   =  m_xi + c_yi * nx + c_zi * nx * ny;
   idx_j1   =  c_xi + m_yi * nx + c_zi * nx * ny;
   idx_k1   =  c_xi + c_yi * nx + m_zi * nx * ny;

   in_p_tf[idx0] = in_p_tp[idx0] 
                 - u * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
                 - v * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
                 - w * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);

}

This already looks weird. Why is your clSetKernelArg parameter arg_index always 0? It should be incremented because it’s the index of the parameter in your kernel.

And where do you set the parameters index 0 to 2? I only see the float* parameters index 3 to 5.

kernel void leapfrog3d(
                        const  int x_siz, // index 0
                        const  int y_siz, // index 1
                        const  int z_siz, // index 2
                        global float *in_p_tf, // index 3
                        global float *in_p_tn, // index 4
                        global float *in_p_tp // index 5
                      )
{

Ah ok, forget my comment above. Just looked at your code and you have incremented the index. You just didn’t posted your code there but something else to confuse me.

status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);

This is not a good idea. You set the parameter blocking_write to false. This makes clEnqueueWriteBuffer an asynchronous call. But you don’t have any callbacks set when the function is finished writing data. Use the blocking mode:

status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_TRUE, 0, p_siz, &p_tf,0, NULL, NULL);

Also with clEnqueueReadBuffer use the blocking mode. My guess is that you start the kernel before clEnqueueWriteBuffer actually finished writing the data. Use the blocking mode so that the write/read blocks until it’s done.

Thank you for your reply!

I remember setting it as non-block mode for clEnqueueReadBuffer() because I was getting OUT_OF_RESOURCE error, while back, not knowing what it actually does. I tried changing it to blocking mode, and I am getting the same OUT_OF_RESOURCE error. Does this have to do with executing multiple kernel in a row, but each kernels are not completing? If so, how do I let the kernel2 to wait for the kernel1 to finish?

Thanks.

Where do you get this error? From clEnqueueNDRangeKernel? clEnqueueNDRangeKernel is blocking anyway, so if it’s finish it’s finish.

CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the command-queue because of insufficient resources needed to execute the kernel. For example, the explicitly specified local_work_size causes a failure to execute the kernel because of insufficient resources such as registers or local memory. Another example would be the number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS value for device or the number of write-only and read-write image args used in kernel exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device. The OpenCL™ Specification

How much RAM do you have on your device? What is the max local size?

I’m getting CL_OUT_OF_RESOURCE from clEnqueueReadBuffer().

I think I have enough RAM on my device. It’s got 16GB, and the max work item size is 1024x1024x64

Can you calculate how much memory you are using?

3 float buffers * 3 kernels * how many datasets you have = x MB

This also weird:

status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tf", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tn  , CL_FALSE, 0, p_siz, &p_tn,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tn", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tp  , CL_FALSE, 0, p_siz, &p_tp,0, NULL, NULL);

p_tf is already a pointer. Why do you pass a pointer to p_tf with &p_tf?

You just have to pass p_tf without the &.

status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, p_tf,0, NULL, NULL);

Here you have to use & because nx, ny, nz are just int and not pointers.

status   =  clEnqueueWriteBuffer(cmdQueue, buf_nx    , CL_FALSE, 0, n_siz, &nx  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nx", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_ny    , CL_FALSE, 0, n_siz, &ny  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_ny", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_nz    , CL_FALSE, 0, n_siz, &nz  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nz", status);

Same here, p_cc is already a pointer.

status = clEnqueueReadBuffer(cmdQueue, buf_p_tn, CL_FALSE, 0, p_siz, p_cc, 0, NULL, NULL);
clerrchk("enqueue read buffer", status);

Try it out if it works.

Also you are not using the heap. Those are all static arrays that are on the stack. I don’t even know if the stack is big enough for those. To use the heap use either malloc or new[].

float*    p_tf = new float[nz*ny*nx],
         p_tn = new float[nz*ny*nx],
         p_tp = new float[nz*ny*nx];

And then of course free or delete[]:

delete [] p_tf;
delete [] p_tn;
delete [] p_tp;

I would really recommend you to read those examples

Thank you for your detailed reply.

Rough calculation of the data size that I am putting into the device is…

256 x 256 x 16 x 4 bytes x 3 = 12 GBytes. So, it should be okay. Also, I’ve tried much less: 16 x 16 x 2 x 4 bytes x 3 = 6 MBytes, and I am still getting the same error.

Also… I did try using malloc() and I’ve got the same CL_OUT_OF_RESOURCES error message from clEnqueueReadBuffer().

Do you think the array size parameter p_siz which is declared as

size_t p_siz =  sizeof(float) * nx * ny * nz;

that passes into clCreateBuffer(), clEnqueueWriteBuffer(), and clEnqueueReadBuffer() is correct?

Yeah. In the example link (first example sharpen.c), I’ve noticed that the code declares the datasize as

int dataSize = imageHeight * imageWidth * sizeof(float);

But, I thought we should use size_t datatype for the data size to pass it into clCreateBuffer(), clEnqueueWriteBuffer()' and 'clEnqueueReadBuffer(). Is this not correct?

size_t is more correct. But then I have no idea what’s wrong. Maybe your index in the kernel is wrong and you write over data outside the in_p_tf, etc. parameters?

I think I found the problem. In addition to your suggestions, size parameter x_siz, y_siz, and z_siz inside the kernel is not being passed properly into the kernel. When I just hard code the array size inside the kernel the code runs just fine. So, how do I fix this…?

I found it out… I have been passing nx, ny, and nz as a buffer object into the kernel. I figured this is how the things are done for integer objects, but apparently not. So when I passed them just as is (not as OpenCL buffer object but just by integers) it worked.

Anyways, I really appreciate your help devent!

1 Like

This topic was automatically closed 183 days after the last reply. New replies are no longer allowed.