Issue with Mapping Buffers

I have a question regarding Mapping Buffers in OpenCL. I posted a simple example version of my code from a stripped down version of Intel’s base example. You should be able to create a solution, place these two files into it and be able to execute. (make sure that cl.h is in the correct folder area)

My question is, what am I doing wrong with regards to Mapping Buffers. If you look within the main function, you can call the EnqueueMapBuffers and the program will work. (At this moment they are commented out.) If you try to place those two calls inside of a separate function instead, the program fails, and it seems that the buffers are never made.

Am I just making a rookie mistake of sorts?

As a note, I get it that in this example it is really only 2 lines (7 if you count error checking) but in my main code of I have roughly 15 of these calls. Which I would prefer to have in a separate function.

#include <vector>
#include "CL\cl.h"

#define OPENCL_VERSION_1_2  1.2f

struct ocl_args_d_t
{
    ocl_args_d_t();
    ~ocl_args_d_t();

    // Regular OpenCL objects:
    cl_context       context;          
    cl_device_id     device;           
    cl_command_queue commandQueue;    
    cl_program       program;          
    cl_kernel        kernel;        
    
    // Memory Buffers
    // Host-to-Device-Buffers
    cl_mem           srcA;             
    cl_mem           srcB;              

    // Device-to-Host Buffers
    cl_mem           dstMem;  

    // Input/Data Buffers
    cl_mem           devA;
    cl_mem           devB;
};

ocl_args_d_t::ocl_args_d_t():
        context(NULL),
        device(NULL),
        commandQueue(NULL),
        program(NULL),
        kernel(NULL),
        srcA(NULL),
        srcB(NULL),
        dstMem(NULL),
        devA(NULL),
        devB(NULL)
{
}

ocl_args_d_t::~ocl_args_d_t()
{
    if (kernel)
        clReleaseKernel(kernel);
    if (program)
        clReleaseProgram(program);
    if (srcA)
        clReleaseMemObject(srcA);
    if (srcB)
        clReleaseMemObject(srcB);
    if (dstMem)
        clReleaseMemObject(dstMem);
    if (devA)
       clReleaseMemObject(devA);
    if (devB)
       clReleaseMemObject(devB);
    if (commandQueue)
        clReleaseCommandQueue(commandQueue);
    if (device)
        clReleaseDevice(device);
    if (context)
        clReleaseContext(context);
}

int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize)
{
   int errorCode = CL_SUCCESS;

   FILE* fp = NULL;
   fopen_s(&fp, fileName, "rb");
   if (fp == NULL)
   {
      printf("Error: Couldn't find program source file '%s'.
", fileName);
      errorCode = CL_INVALID_VALUE;
   }
   else {
      fseek(fp, 0, SEEK_END);
      *sourceSize = ftell(fp);
      fseek(fp, 0, SEEK_SET);

      *source = new char[*sourceSize];
      if (*source == NULL)
      {
         printf("Error: Couldn't allocate %d bytes for program source from file '%s'.
", *sourceSize, fileName);
         errorCode = CL_OUT_OF_HOST_MEMORY;
      }
      else {
         fread(*source, 1, *sourceSize, fp);
      }
   }
   return errorCode;
}

void generateInput(cl_float* inputArray, cl_uint arrayWidth, cl_uint arrayHeight, float seed)
{
    cl_uint array_size = arrayWidth * arrayHeight;
    for (cl_uint i = 0; i < array_size; ++i)
    {
        inputArray[i] = seed;
    }
}

int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
{
   cl_uint numPlatforms = 0;
   cl_uint numDevices = 0;
   cl_platform_id platformId = NULL;
   size_t stringLength = 1024;

   clGetPlatformIDs(NULL, NULL, &numPlatforms);

   if (numPlatforms > 1)
      printf("Number of available platforms: %u
", numPlatforms);
   else
   {
      printf("Error: No platforms found!
");
      return CL_INVALID_VALUE;
   }

   std::vector<cl_platform_id> platforms(numPlatforms);

   clGetPlatformIDs(numPlatforms, &platforms[0], NULL);

   std::vector<char> platformName(stringLength);
   for (cl_uint i = 0; i < numPlatforms; ++i)
   {
      clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, stringLength, &platformName[0], NULL);

      if (strstr(&platformName[0], "NVIDIA CUDA") != 0)
      {
         clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices);
         if (numDevices != 0)
            platformId = platforms[i];
      }
   }


   cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 };
   ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, NULL);

   clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL);


   cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
   ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, NULL);

   return CL_SUCCESS;
}

int CreateAndBuildProgram(ocl_args_d_t *ocl)
{
    char* source = NULL;
    size_t src_size = 0;
    ReadSourceFromFile("Template.cl", &source, &src_size);

    ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&source, &src_size, NULL);

    clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL);

    return CL_SUCCESS;
}

int CreateBufferArguments(ocl_args_d_t *ocl, size_t dataSize)
{
    ocl->srcA = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, dataSize, NULL, NULL);
    ocl->srcB = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, dataSize, NULL, NULL);
    ocl->dstMem = clCreateBuffer(ocl->context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, dataSize, NULL, NULL);
    ocl->devA = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, dataSize, NULL, NULL);
    ocl->devB = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, dataSize, NULL, NULL);

    return CL_SUCCESS;
}

int SetKernelArguments(ocl_args_d_t *ocl)
{
    clSetKernelArg(ocl->kernel, 0, sizeof(cl_mem), (void *)&ocl->srcA);
    clSetKernelArg(ocl->kernel, 1, sizeof(cl_mem), (void *)&ocl->srcB);
    clSetKernelArg(ocl->kernel, 2, sizeof(cl_mem), (void *)&ocl->dstMem);

    return CL_SUCCESS;
}

int ExecuteAddKernel(ocl_args_d_t *ocl, cl_float *inputA, cl_float *inputB,
   cl_float *output, cl_uint width, cl_uint height, int runs)
{
    size_t globalWorkSize[2] = {width, height};
    size_t dataSize = sizeof(cl_float) * width * height;

    clFinish(ocl->commandQueue);
    printf("
Add
");

    for (int i = 0; i < runs; ++i)
    {
       clEnqueueWriteBuffer(ocl->commandQueue, ocl->srcA, CL_FALSE, 0, dataSize, (void*)inputA, 0, NULL, NULL);
       clEnqueueWriteBuffer(ocl->commandQueue, ocl->srcB, CL_FALSE, 0, dataSize, (void*)inputB, 0, NULL, NULL);
       clEnqueueNDRangeKernel(ocl->commandQueue, ocl->kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
       output = (float*)clEnqueueMapBuffer(ocl->commandQueue, ocl->dstMem, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);

       clEnqueueUnmapMemObject(ocl->commandQueue, ocl->dstMem, output, 0, NULL, NULL);
       for (unsigned int a = 0; a < width * height; ++a)
       {
          if (output[a] != inputA[a] + inputB[a])
             printf("Kernel 0 Failure @ %d: %0.2f != %0.2f + %0.2f
", a, output[a], inputA[a], inputB[a]);
       }
       clFinish(ocl->commandQueue);
    }
    printf("Finished");
    return CL_SUCCESS;
}

int MapBuffers(ocl_args_d_t *ocl, cl_float *inputA, cl_float *inputB, size_t dataSize)
{
   inputA = (cl_float*)clEnqueueMapBuffer(ocl->commandQueue, ocl->devA, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);
   inputB = (cl_float*)clEnqueueMapBuffer(ocl->commandQueue, ocl->devB, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);

   return CL_SUCCESS;
}

int main()
{
    ocl_args_d_t ocl;
    cl_device_type deviceType = CL_DEVICE_TYPE_GPU;

    cl_uint arrayWidth  = 1024;
    cl_uint arrayHeight = 1024;
    size_t dataSize = sizeof(cl_float) * arrayHeight * arrayWidth;

    SetupOpenCL(&ocl, deviceType);
    
    float* inputA = NULL;
    float* inputB = NULL;
    float* output = NULL;

    CreateBufferArguments(&ocl, dataSize);

    MapBuffers(&ocl, inputA, inputB, dataSize);

    //inputA = (cl_float*)clEnqueueMapBuffer(ocl.commandQueue, ocl.devA, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);
    //inputB = (cl_float*)clEnqueueMapBuffer(ocl.commandQueue, ocl.devB, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);

    CreateAndBuildProgram(&ocl);

    ocl.kernel = clCreateKernel(ocl.program, "Add", NULL);

    SetKernelArguments(&ocl);

    generateInput(inputA, arrayWidth, arrayHeight, 5.0f);
    generateInput(inputB, arrayWidth, arrayHeight, 6.0f);

    ExecuteAddKernel(&ocl, inputA, inputB, output, arrayWidth, arrayHeight, 5);

    clEnqueueUnmapMemObject(ocl.commandQueue, ocl.devA, inputA, 0, NULL, NULL);
    clEnqueueUnmapMemObject(ocl.commandQueue, ocl.devB, inputB, 0, NULL, NULL);

    return 0;
}
__kernel void Add(__global float* pA, __global float* pB, __global float* pC)
{
    const int x     = get_global_id(0);
    const int y     = get_global_id(1);
    const int width = get_global_size(0);

    const int id = y * width + x;

    float inA = pA[id];
    float inB = pB[id];
    float out = 0.0f;

    out = inA + inB;
    pC[id] = out;
}

inputA and inputB are out parameters:


int MapBuffers(ocl_args_d_t *ocl, cl_float **inputA, cl_float **inputB, size_t dataSize)
{
   *inputA = (cl_float*)clEnqueueMapBuffer(ocl->commandQueue, ocl->devA, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);
   *inputB = (cl_float*)clEnqueueMapBuffer(ocl->commandQueue, ocl->devB, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, NULL);
 
   return CL_SUCCESS;
}

Thank you!