Swapping OpenCL memory buffer does not work

ep-98d · September 16, 2021, 3:43pm

I need to swap OpenCL buffer objects. Basically what I am doing is implementing leapfrog scheme

but in 3D.

My kernel looks something like

p_tf[i] = p_tp[i] - u*C*(p_tn[i+1] - p_tn[i-1]);

Note that p_tf is some property ‘u’ at time step t+1, p_tn is at time step t, and p_tp is at time step t-1.

I’ve created three 3D arrays to consider the array of time step t+1, t and t-1. And for every time step I need to swap them in order to update them like the following

p_tp = p_tn;
p_tn = p_tf;

So I’ve created three kernels and used clSetKernelArg() to set the arguments in different order, and enqueued kernel calls like the following

kernel1 = clCreateKernel(program, "leapfrog3d", ... );
kernel2 = clCreateKernel(program, "leapfrog3d", ... );
kernel3 = clCreateKernel(program, "leapfrog3d", ... );


clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tp);

clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tn);

clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tf);

for (t=0;t<1000;t++)
{
   clEnqueueNDRangeKernel(cmdQueue, kernel1, 3, ...);
   clEnqueueNDRangeKernel(cmdQueue, kernel2, 3, ...);
   clEnqueueNDRangeKernel(cmdQueue, kernel3, 3, ...);
}

So, when I copy the buffer buf_p_tf and look at it, it should at least show some noise due to numerical instabilities, but I am getting initial values. I can’t find what I am doing wrong.

Here is the full code

#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>

#define CL_TARGET_OPENCL_VERSION 120 
#include <CL/cl.h>
#include "cl_err.h"

// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define clerrchk(arg, e) {printf("  %-40s : %s\n",arg, geterrstr(e));}
#define debug(line) {printf("Here? Line : %d\n",line);}

#define fname "leap3d.nc"

// Variable sizes and dimensions (constants)
#define ndims 4


void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);


int main()
{
int      i,j,k,t;

int      p_xi, p_yi, p_zi,
         c_xi, c_yi, c_zi,
         m_xi, m_yi, m_zi;

int      nx       =  256,
         ny       =  256,
         nz       =  16,
         nt       =  1000;

float    u        =  0.0,
         v        =  1.0,
         w        =  0.0,
         c        =  0.01;

//p_tf : p at future
//p_tn : p at now
//p_tp : p at past

float    p_tf[nz*ny*nx],
         p_tn[nz*ny*nx],
         p_tp[nz*ny*nx];

size_t   n_siz =  sizeof(int),
         p_siz =  sizeof(float) * nx * ny * nz;

int      ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int      dimids[ndims];
size_t   start[ndims], count[ndims];


// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
   err(retval);

// Define dimensions
if ((retval = nc_def_dim(ncid, "z", nz, &z_dimid)))
   err(retval);
if ((retval = nc_def_dim(ncid, "y", ny, &y_dimid)))
   err(retval);
if ((retval = nc_def_dim(ncid, "x", nx, &x_dimid)))
   err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
   err(retval);


// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;

// Variable for writing netCDF data one timestep at a time
count[0] = 1;       // For time dimension : 1 timestep
count[1] = nz;      // For z              : write everything
count[2] = ny;      // For y              : write everything
count[3] = nx;      // For x              : write everything

start[1] = 0;       // For z              : don't do anything
start[2] = 0;       // For y              : don't do anything
start[3] = 0;       // For x              : don't do anything


if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
   err(retval);

if ((retval = nc_enddef(ncid)))
   err(retval);


data_init(nx,ny,nz,(float*)p_tf); 
data_init(nx,ny,nz,(float*)p_tn); 
data_init(nx,ny,nz,(float*)p_tp); 

start[0] = 0;

if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0]))
   err(retval);

// Euler scheme for the first time step
for(k=0;k<nz;k++)
   for(j=0;j<ny;j++)
      for(i=0;i<nx;i++)
      {
         c_xi     = i % nx;
         c_yi     = j % ny;
         c_zi     = k % nz;
         m_xi     = (i+nx-1)%nx;
         m_yi     = (j+ny-1)%ny;
         m_zi     = (k+nz-1)%nz;

         p_tf[c_xi + c_yi * nx + c_zi * nx * ny] = 
         p_tn[c_xi + c_yi * nx + c_zi * nx * ny]
        - u * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[m_xi + c_yi * nx + c_zi * nx * ny])
        - v * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[c_xi + m_yi * nx + c_zi * nx * ny])
        - w * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[c_xi + c_yi * nx + m_zi * nx * ny]);
      }

start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0]))
   err(retval);



// OpenCL part //

// Use this to check the output of each API call
cl_int status;

// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);

// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);

// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
clerrchk("get number of devices", status);

// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));

// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
clerrchk("get device ids", status);

// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
clerrchk("create context", status);

// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
clerrchk("create cmd queue", status);

cl_mem buf_p_tf    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tf", status);
cl_mem buf_p_tn    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tn", status);
cl_mem buf_p_tp    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tp", status);
cl_mem buf_nx      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nx", status);
cl_mem buf_ny      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_ny", status);
cl_mem buf_nz      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nz", status);


status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tf", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tn  , CL_FALSE, 0, p_siz, &p_tn,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tn", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tp  , CL_FALSE, 0, p_siz, &p_tp,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tp", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_nx    , CL_FALSE, 0, n_siz, &nx  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nx", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_ny    , CL_FALSE, 0, n_siz, &ny  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_ny", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_nz    , CL_FALSE, 0, n_siz, &nz  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nz", status);



// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog2.cl","r");

fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '\0';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);

program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
clerrchk("create program", status);


// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

if(status != CL_SUCCESS)
{
   size_t log_size;
   clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

   char *log = (char *) malloc(log_size);

   clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

   printf("%s\n", log);
}

// Create a kernel
cl_kernel kernel1, kernel2, kernel3;
kernel1 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel1", status);
kernel2 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel2", status);
kernel3 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel3", status);


// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel1, 0, sizeof(cl_int), &buf_nx  );
clerrchk("set kerne1l buf_nx", status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_int), &buf_ny  );
clerrchk("set kernel1 buf_ny", status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_int), &buf_nz  );
clerrchk("set kernel1 buf_nz", status);
status = clSetKernelArg(kernel1, 3, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel1 buf_p_tf", status);
status = clSetKernelArg(kernel1, 4, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel1 buf_p_tn", status);
status = clSetKernelArg(kernel1, 5, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel1 buf_p_tp", status);


// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel2, 0, sizeof(cl_int), &buf_nx  );
clerrchk("set kernel2 buf_nx", status);
status = clSetKernelArg(kernel2, 1, sizeof(cl_int), &buf_ny  );
clerrchk("set kernel2 buf_ny", status);
status = clSetKernelArg(kernel2, 2, sizeof(cl_int), &buf_nz  );
clerrchk("set kernel2 buf_nz", status);
status = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel2 buf_p_tp", status);
status = clSetKernelArg(kernel2, 4, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel2 buf_p_tf", status);
status = clSetKernelArg(kernel2, 5, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel2 buf_p_tn", status);


// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel3, 0, sizeof(cl_int), &buf_nx  );
clerrchk("set kernel3 buf_nx", status);
status = clSetKernelArg(kernel3, 1, sizeof(cl_int), &buf_ny  );
clerrchk("set kernel3 buf_ny", status);
status = clSetKernelArg(kernel3, 2, sizeof(cl_int), &buf_nz  );
clerrchk("set kernel3 buf_nz", status);
status = clSetKernelArg(kernel3, 3, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel3 buf_p_tn", status);
status = clSetKernelArg(kernel3, 4, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel3 buf_p_tp", status);
status = clSetKernelArg(kernel3, 5, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel3 buf_p_tf", status);


// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t lclwksiz[3] = {16,16,4}; 
size_t glbwksiz[3] = {nx,ny,nz};


for(t=0;t<200;t++)
{
// Execute the kernel1 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel1", status);

// Execute the kernel2 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel2, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel2", status);

// Execute the kernel3 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel3, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel3", status);
}


// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tn, CL_FALSE, 0, p_siz, &p_cc, 0, NULL, NULL);
clerrchk("enqueue read buffer", status);

// Writing the data into netcdf file
start[0] = 2;

if (retval = nc_put_vara_float(ncid, varid, start, count, &p_cc[0]))
   err(retval);

// Close netcdf data
if ((retval = nc_close(ncid)))
   err(retval);


clFinish(cmdQueue);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseContext(context); 
clReleaseKernel(kernel1);
clReleaseKernel(kernel2);
clReleaseKernel(kernel3);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);

printf("\nDone. . .\n");

return 0;                     
}                             


// Data initialization
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;

int i_min   =  199,
    i_max   =  229,
    j_min   =  127,
    j_max   =  159;

for(k=0;k<in_z_siz;k++)
   for(j=0;j<in_y_siz;j++)
      for(i=0;i<in_x_siz;i++)
         in_arr[k * in_y_siz * in_x_siz + j * in_x_siz +i] = 0.0;

 for(k=0;k<in_z_siz;k++)
   for(j=j_min;j<j_max;j++)
      for(i=i_min;i<i_max;i++)
         in_arr[k * in_y_siz * in_x_siz + j * in_x_siz +i] = 3.0;
}

here is the kernel code

kernel void leapfrog3d(
                        const  int x_siz,
                        const  int y_siz,
                        const  int z_siz,
                        global float *in_p_tf,
                        global float *in_p_tn,
                        global float *in_p_tp
                      )
{
   int nx   =  x_siz;
   int ny   =  y_siz;
   int nz   =  z_siz;
   float u  =  0.0;
   float v  =  1.0;
   float w  =  0.0;
   float C  =  0.01;
   int i    =  get_global_id(0);
   int j    =  get_global_id(1);
   int k    =  get_global_id(2);

   int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;

   int p_xi = (i+1)%nx,
       p_yi = (j+1)%ny, 
       p_zi = (k+1)%nz,
       c_xi = i % nx, 
       c_yi = j % ny, 
       c_zi = k % nz,
       m_xi = (i+nx-1)%nx, 
       m_yi = (j+ny-1)%ny, 
       m_zi = (k+nz-1)%nz;


   idx0     =  c_xi + c_yi * nx + c_zi * nx * ny;

   idx_i0   =  p_xi + c_yi * nx + c_zi * nx * ny;
   idx_j0   =  c_xi + p_yi * nx + c_zi * nx * ny;
   idx_k0   =  c_xi + c_yi * nx + p_zi * nx * ny;
   
   idx_i1   =  m_xi + c_yi * nx + c_zi * nx * ny;
   idx_j1   =  c_xi + m_yi * nx + c_zi * nx * ny;
   idx_k1   =  c_xi + c_yi * nx + m_zi * nx * ny;

   in_p_tf[idx0] = in_p_tp[idx0] 
                 - u * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
                 - v * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
                 - w * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);

}

devent · September 17, 2021, 11:29am

ep-98d:

clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tp);

clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tn);

clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tf);

This already looks weird. Why is your clSetKernelArg parameter arg_index always 0? It should be incremented because it’s the index of the parameter in your kernel.

And where do you set the parameters index 0 to 2? I only see the float* parameters index 3 to 5.

kernel void leapfrog3d(
                        const  int x_siz, // index 0
                        const  int y_siz, // index 1
                        const  int z_siz, // index 2
                        global float *in_p_tf, // index 3
                        global float *in_p_tn, // index 4
                        global float *in_p_tp // index 5
                      )
{

devent · September 17, 2021, 11:40am

Ah ok, forget my comment above. Just looked at your code and you have incremented the index. You just didn’t posted your code there but something else to confuse me.

status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);

This is not a good idea. You set the parameter blocking_write to false. This makes clEnqueueWriteBuffer an asynchronous call. But you don’t have any callbacks set when the function is finished writing data. Use the blocking mode:

status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_TRUE, 0, p_siz, &p_tf,0, NULL, NULL);

Also with clEnqueueReadBuffer use the blocking mode. My guess is that you start the kernel before clEnqueueWriteBuffer actually finished writing the data. Use the blocking mode so that the write/read blocks until it’s done.

ep-98d · September 17, 2021, 2:44pm

Thank you for your reply!

I remember setting it as non-block mode for clEnqueueReadBuffer() because I was getting OUT_OF_RESOURCE error, while back, not knowing what it actually does. I tried changing it to blocking mode, and I am getting the same OUT_OF_RESOURCE error. Does this have to do with executing multiple kernel in a row, but each kernels are not completing? If so, how do I let the kernel2 to wait for the kernel1 to finish?

Thanks.

devent · September 17, 2021, 4:27pm

Where do you get this error? From clEnqueueNDRangeKernel? clEnqueueNDRangeKernel is blocking anyway, so if it’s finish it’s finish.

CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the command-queue because of insufficient resources needed to execute the kernel. For example, the explicitly specified local_work_size causes a failure to execute the kernel because of insufficient resources such as registers or local memory. Another example would be the number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS value for device or the number of write-only and read-write image args used in kernel exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device. The OpenCL™ Specification

How much RAM do you have on your device? What is the max local size?

ep-98d · September 17, 2021, 4:53pm

I’m getting CL_OUT_OF_RESOURCE from clEnqueueReadBuffer().

I think I have enough RAM on my device. It’s got 16GB, and the max work item size is 1024x1024x64

devent · September 18, 2021, 4:12pm

Can you calculate how much memory you are using?

3 float buffers * 3 kernels * how many datasets you have = x MB

This also weird:

status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tf", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tn  , CL_FALSE, 0, p_siz, &p_tn,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tn", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tp  , CL_FALSE, 0, p_siz, &p_tp,0, NULL, NULL);

p_tf is already a pointer. Why do you pass a pointer to p_tf with &p_tf?

You just have to pass p_tf without the &.

status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, p_tf,0, NULL, NULL);

Here you have to use & because nx, ny, nz are just int and not pointers.

status   =  clEnqueueWriteBuffer(cmdQueue, buf_nx    , CL_FALSE, 0, n_siz, &nx  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nx", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_ny    , CL_FALSE, 0, n_siz, &ny  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_ny", status);
status   =  clEnqueueWriteBuffer(cmdQueue, buf_nz    , CL_FALSE, 0, n_siz, &nz  ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nz", status);

Same here, p_cc is already a pointer.

status = clEnqueueReadBuffer(cmdQueue, buf_p_tn, CL_FALSE, 0, p_siz, p_cc, 0, NULL, NULL);
clerrchk("enqueue read buffer", status);

Try it out if it works.

Also you are not using the heap. Those are all static arrays that are on the stack. I don’t even know if the stack is big enough for those. To use the heap use either malloc or new[].

float*    p_tf = new float[nz*ny*nx],
         p_tn = new float[nz*ny*nx],
         p_tp = new float[nz*ny*nx];

And then of course free or delete[]:

delete [] p_tf;
delete [] p_tn;
delete [] p_tp;

I would really recommend you to read those examples

ep-98d · September 18, 2021, 5:42pm

Thank you for your detailed reply.

Rough calculation of the data size that I am putting into the device is…

256 x 256 x 16 x 4 bytes x 3 = 12 GBytes. So, it should be okay. Also, I’ve tried much less: 16 x 16 x 2 x 4 bytes x 3 = 6 MBytes, and I am still getting the same error.

Also… I did try using malloc() and I’ve got the same CL_OUT_OF_RESOURCES error message from clEnqueueReadBuffer().

Do you think the array size parameter p_siz which is declared as

size_t p_siz =  sizeof(float) * nx * ny * nz;

that passes into clCreateBuffer(), clEnqueueWriteBuffer(), and clEnqueueReadBuffer() is correct?

ep-98d · September 18, 2021, 5:49pm

Yeah. In the example link (first example sharpen.c), I’ve noticed that the code declares the datasize as

int dataSize = imageHeight * imageWidth * sizeof(float);

But, I thought we should use size_t datatype for the data size to pass it into clCreateBuffer(), clEnqueueWriteBuffer()' and 'clEnqueueReadBuffer(). Is this not correct?

devent · September 19, 2021, 9:40am

size_t is more correct. But then I have no idea what’s wrong. Maybe your index in the kernel is wrong and you write over data outside the in_p_tf, etc. parameters?

ep-98d · September 19, 2021, 5:27pm

I think I found the problem. In addition to your suggestions, size parameter x_siz, y_siz, and z_siz inside the kernel is not being passed properly into the kernel. When I just hard code the array size inside the kernel the code runs just fine. So, how do I fix this…?

ep-98d · September 19, 2021, 5:50pm

I found it out… I have been passing nx, ny, and nz as a buffer object into the kernel. I figured this is how the things are done for integer objects, but apparently not. So when I passed them just as is (not as OpenCL buffer object but just by integers) it worked.

Anyways, I really appreciate your help devent!