I need to swap OpenCL buffer objects. Basically what I am doing is implementing leapfrog scheme
but in 3D.
My kernel looks something like
p_tf[i] = p_tp[i] - u*C*(p_tn[i+1] - p_tn[i-1]);
Note that p_tf
is some property ‘u’ at time step t+1, p_tn
is at time step t, and p_tp
is at time step t-1.
I’ve created three 3D arrays to consider the array of time step t+1, t and t-1. And for every time step I need to swap them in order to update them like the following
p_tp = p_tn;
p_tn = p_tf;
So I’ve created three kernels and used clSetKernelArg()
to set the arguments in different order, and enqueued kernel calls like the following
kernel1 = clCreateKernel(program, "leapfrog3d", ... );
kernel2 = clCreateKernel(program, "leapfrog3d", ... );
kernel3 = clCreateKernel(program, "leapfrog3d", ... );
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tf);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tn);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tp);
clSetKernelArg(kernel3, 0, sizeof(cl_mem), &buf_p_tf);
for (t=0;t<1000;t++)
{
clEnqueueNDRangeKernel(cmdQueue, kernel1, 3, ...);
clEnqueueNDRangeKernel(cmdQueue, kernel2, 3, ...);
clEnqueueNDRangeKernel(cmdQueue, kernel3, 3, ...);
}
So, when I copy the buffer buf_p_tf
and look at it, it should at least show some noise due to numerical instabilities, but I am getting initial values. I can’t find what I am doing wrong.
Here is the full code
#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include "cl_err.h"
// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define clerrchk(arg, e) {printf(" %-40s : %s\n",arg, geterrstr(e));}
#define debug(line) {printf("Here? Line : %d\n",line);}
#define fname "leap3d.nc"
// Variable sizes and dimensions (constants)
#define ndims 4
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
int main()
{
int i,j,k,t;
int p_xi, p_yi, p_zi,
c_xi, c_yi, c_zi,
m_xi, m_yi, m_zi;
int nx = 256,
ny = 256,
nz = 16,
nt = 1000;
float u = 0.0,
v = 1.0,
w = 0.0,
c = 0.01;
//p_tf : p at future
//p_tn : p at now
//p_tp : p at past
float p_tf[nz*ny*nx],
p_tn[nz*ny*nx],
p_tp[nz*ny*nx];
size_t n_siz = sizeof(int),
p_siz = sizeof(float) * nx * ny * nz;
int ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int dimids[ndims];
size_t start[ndims], count[ndims];
// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
err(retval);
// Define dimensions
if ((retval = nc_def_dim(ncid, "z", nz, &z_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "y", ny, &y_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "x", nx, &x_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
err(retval);
// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;
// Variable for writing netCDF data one timestep at a time
count[0] = 1; // For time dimension : 1 timestep
count[1] = nz; // For z : write everything
count[2] = ny; // For y : write everything
count[3] = nx; // For x : write everything
start[1] = 0; // For z : don't do anything
start[2] = 0; // For y : don't do anything
start[3] = 0; // For x : don't do anything
if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
err(retval);
if ((retval = nc_enddef(ncid)))
err(retval);
data_init(nx,ny,nz,(float*)p_tf);
data_init(nx,ny,nz,(float*)p_tn);
data_init(nx,ny,nz,(float*)p_tp);
start[0] = 0;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0]))
err(retval);
// Euler scheme for the first time step
for(k=0;k<nz;k++)
for(j=0;j<ny;j++)
for(i=0;i<nx;i++)
{
c_xi = i % nx;
c_yi = j % ny;
c_zi = k % nz;
m_xi = (i+nx-1)%nx;
m_yi = (j+ny-1)%ny;
m_zi = (k+nz-1)%nz;
p_tf[c_xi + c_yi * nx + c_zi * nx * ny] =
p_tn[c_xi + c_yi * nx + c_zi * nx * ny]
- u * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[m_xi + c_yi * nx + c_zi * nx * ny])
- v * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[c_xi + m_yi * nx + c_zi * nx * ny])
- w * c * (p_tn[c_xi + c_yi * nx + c_zi * nx * ny] - p_tn[c_xi + c_yi * nx + m_zi * nx * ny]);
}
start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0]))
err(retval);
// OpenCL part //
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
clerrchk("get number of devices", status);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
clerrchk("get device ids", status);
// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
clerrchk("create context", status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
clerrchk("create cmd queue", status);
cl_mem buf_p_tf = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tf", status);
cl_mem buf_p_tn = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tn", status);
cl_mem buf_p_tp = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tp", status);
cl_mem buf_nx = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nx", status);
cl_mem buf_ny = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_ny", status);
cl_mem buf_nz = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nz", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, &p_tf,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tf", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tn , CL_FALSE, 0, p_siz, &p_tn,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tn", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tp , CL_FALSE, 0, p_siz, &p_tp,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tp", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_nx , CL_FALSE, 0, n_siz, &nx ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nx", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_ny , CL_FALSE, 0, n_siz, &ny ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_ny", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_nz , CL_FALSE, 0, n_siz, &nz ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nz", status);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog2.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '\0';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
clerrchk("create program", status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if(status != CL_SUCCESS)
{
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
}
// Create a kernel
cl_kernel kernel1, kernel2, kernel3;
kernel1 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel1", status);
kernel2 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel2", status);
kernel3 = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel3", status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel1, 0, sizeof(cl_int), &buf_nx );
clerrchk("set kerne1l buf_nx", status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_int), &buf_ny );
clerrchk("set kernel1 buf_ny", status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_int), &buf_nz );
clerrchk("set kernel1 buf_nz", status);
status = clSetKernelArg(kernel1, 3, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel1 buf_p_tf", status);
status = clSetKernelArg(kernel1, 4, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel1 buf_p_tn", status);
status = clSetKernelArg(kernel1, 5, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel1 buf_p_tp", status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel2, 0, sizeof(cl_int), &buf_nx );
clerrchk("set kernel2 buf_nx", status);
status = clSetKernelArg(kernel2, 1, sizeof(cl_int), &buf_ny );
clerrchk("set kernel2 buf_ny", status);
status = clSetKernelArg(kernel2, 2, sizeof(cl_int), &buf_nz );
clerrchk("set kernel2 buf_nz", status);
status = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel2 buf_p_tp", status);
status = clSetKernelArg(kernel2, 4, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel2 buf_p_tf", status);
status = clSetKernelArg(kernel2, 5, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel2 buf_p_tn", status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel3, 0, sizeof(cl_int), &buf_nx );
clerrchk("set kernel3 buf_nx", status);
status = clSetKernelArg(kernel3, 1, sizeof(cl_int), &buf_ny );
clerrchk("set kernel3 buf_ny", status);
status = clSetKernelArg(kernel3, 2, sizeof(cl_int), &buf_nz );
clerrchk("set kernel3 buf_nz", status);
status = clSetKernelArg(kernel3, 3, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel3 buf_p_tn", status);
status = clSetKernelArg(kernel3, 4, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel3 buf_p_tp", status);
status = clSetKernelArg(kernel3, 5, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel3 buf_p_tf", status);
// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t lclwksiz[3] = {16,16,4};
size_t glbwksiz[3] = {nx,ny,nz};
for(t=0;t<200;t++)
{
// Execute the kernel1 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel1", status);
// Execute the kernel2 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel2, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel2", status);
// Execute the kernel3 for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel3, 3, NULL, glbwksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue kernel3", status);
}
// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tn, CL_FALSE, 0, p_siz, &p_cc, 0, NULL, NULL);
clerrchk("enqueue read buffer", status);
// Writing the data into netcdf file
start[0] = 2;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_cc[0]))
err(retval);
// Close netcdf data
if ((retval = nc_close(ncid)))
err(retval);
clFinish(cmdQueue);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseContext(context);
clReleaseKernel(kernel1);
clReleaseKernel(kernel2);
clReleaseKernel(kernel3);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
printf("\nDone. . .\n");
return 0;
}
// Data initialization
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
int i_min = 199,
i_max = 229,
j_min = 127,
j_max = 159;
for(k=0;k<in_z_siz;k++)
for(j=0;j<in_y_siz;j++)
for(i=0;i<in_x_siz;i++)
in_arr[k * in_y_siz * in_x_siz + j * in_x_siz +i] = 0.0;
for(k=0;k<in_z_siz;k++)
for(j=j_min;j<j_max;j++)
for(i=i_min;i<i_max;i++)
in_arr[k * in_y_siz * in_x_siz + j * in_x_siz +i] = 3.0;
}
here is the kernel code
kernel void leapfrog3d(
const int x_siz,
const int y_siz,
const int z_siz,
global float *in_p_tf,
global float *in_p_tn,
global float *in_p_tp
)
{
int nx = x_siz;
int ny = y_siz;
int nz = z_siz;
float u = 0.0;
float v = 1.0;
float w = 0.0;
float C = 0.01;
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;
int p_xi = (i+1)%nx,
p_yi = (j+1)%ny,
p_zi = (k+1)%nz,
c_xi = i % nx,
c_yi = j % ny,
c_zi = k % nz,
m_xi = (i+nx-1)%nx,
m_yi = (j+ny-1)%ny,
m_zi = (k+nz-1)%nz;
idx0 = c_xi + c_yi * nx + c_zi * nx * ny;
idx_i0 = p_xi + c_yi * nx + c_zi * nx * ny;
idx_j0 = c_xi + p_yi * nx + c_zi * nx * ny;
idx_k0 = c_xi + c_yi * nx + p_zi * nx * ny;
idx_i1 = m_xi + c_yi * nx + c_zi * nx * ny;
idx_j1 = c_xi + m_yi * nx + c_zi * nx * ny;
idx_k1 = c_xi + c_yi * nx + m_zi * nx * ny;
in_p_tf[idx0] = in_p_tp[idx0]
- u * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
- v * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
- w * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
}