Synchronisation between tasks in a queue in openCL

I created a class called relaxation . The relaxation.hpp is as follows:

```
#ifndef RELAXATION_HPP
#define RELAXATION_HPP

#define __CL_ENABLE_EXCEPTIONS

#include <iostream>
#include <CL/cl.hpp>
#include <cmath>
#include "CL_ERROR.hpp"
#include "defines.hpp"


template<class T>
class relaxation
{
  protected:
    int stencilDataWidth = 3;
    int relaxationStencilSize = pow(stencilDataWidth,3);
    T* stencil   = NULL;
    T omega = 0.8;  

    cl::Buffer       relaxationStencilBuffer;
    cl::Kernel       kernel;

  public:
     relaxation(cl::Program& _programInput, cl::CommandQueue& _queue, 
                const cl::Context& _context);
    ~relaxation();
     T getOmega();
     void createKernel(cl::Program& program);
     void setStencil(const cl::Context& context, cl::CommandQueue& queue);
     void relaxing(cl::CommandQueue& queue, 
                   cl::Buffer& inoutBuffer,  cl::Buffer& intermediateBuffer, 
                   cl::Buffer& RHSBuffer, 
                   const cl::NDRange& globalRange, const cl::NDRange& localRange,
                   const int numberOfRelaxationSweep,
                   const int& domainHeight, const int& domainWidth,
                   const int& domainDepth, cl::size_t<3> bufferOrigin,
                   cl::size_t<3> hostOrigin, cl::size_t<3> region,
                   const int& deviceWidth, const int& spatialStepSize, cl::Event& event);
};

//#include "../src/relaxation.cc"

#endif

and the source file (relaxation.cc) is

#include "../includes/relaxation.hpp"

template<class T>
relaxation<T>::relaxation(cl::Program& _program, cl::CommandQueue& _queue,
                          const cl::Context& _context)
{
  createKernel(_program);
  setStencil(_context, _queue);
}


template<class T>
relaxation<T>::~relaxation()
{
   delete[] stencil;
}

template<class T>
inline T relaxation<T>::getOmega()
{
   return omega;
}


template<class T>
void relaxation<T>::setStencil(const cl::Context& context, cl::CommandQueue& queue)
{
   stencil = new T[relaxationStencilSize];

  T coeff = omega;

  stencil[0] = 0.0;
  stencil[1] = 0.0;
  stencil[2] = 0.0;
  stencil[3] = 0.0;
  stencil[4] = 1.0 / 6.0 * coeff;
  stencil[5] = 0.0;
  stencil[6] = 0.0;
  stencil[7] = 0.0;
  stencil[8] = 0.0;

  stencil[9] = 0.0;
  stencil[10] = 1.0 / 6.0 * coeff;
  stencil[11] = 0.0;
  stencil[12] = 1.0 / 6.0 * coeff;
  stencil[13] = (1.0 - omega);
  stencil[14] = 1.0 / 6.0 * coeff;
  stencil[15] = 0.0;
  stencil[16] = 1.0 / 6.0 * coeff;
  stencil[17] = 0.0;

  stencil[18] = 0.0;
  stencil[19] = 0.0;
  stencil[20] = 0.0;
  stencil[21] = 0.0;
  stencil[22] = 1.0 / 6.0 * coeff;
  stencil[23] = 0.0;
  stencil[24] = 0.0;
  stencil[25] = 0.0;
  stencil[26] = 0.0;

  try
  {
    relaxationStencilBuffer = cl::Buffer(context, CL_MEM_READ_ONLY,  
                                         relaxationStencilSize * sizeof(T));
    queue.enqueueWriteBuffer(relaxationStencilBuffer,  CL_TRUE, 0, 
                             relaxationStencilSize * sizeof(T), stencil);

  }catch (const cl::Error& error)
  {
    std::cout << "  -> Relaxaation class, Problem in buffer creation/writing "
                   "data to device " << std::endl;
    std::cout << "  -> " << getErrorString(error) << std::endl;
    exit(0);
  }
}



template<class T>
void relaxation<T>::createKernel(cl::Program& program)
{
  std::cout << "==> Relaxation class, Creating kernels";
  try
  {
    kernel  = cl::Kernel(program, "relaxation");
    std::cout << "\t-> Done!" << std::endl;
  }catch (const cl::Error& error)
  {
    std::cout << "  -> Relaxation class, Problem in kernel  " << std::endl;
    std::cout << "  -> " << getErrorString(error) << std::endl;
    exit(0);
  }

}


/*
 * x_n = b - Ax_{n-1}
 * inoutBuffer -> x
 * RHSBuffer   -> b
 * relaxationStencilBuffer -> A
 */
template<class T>
void relaxation<T>::relaxing(cl::CommandQueue& queue, 
                         cl::Buffer& inoutBuffer,cl::Buffer& intermediateBuffer, 
                         cl::Buffer& RHSBuffer,
                         const cl::NDRange& globalRange, const cl::NDRange& localRange,
                         const int numberOfRelaxationSweep,
                         const int& domainHeight, const int& domainWidth,
                         const int& domainDepth, cl::size_t<3> bufferOrigin,
                         cl::size_t<3> hostOrigin, cl::size_t<3> region, 
                         const int& deviceWidth, const int& spatialStepSize, 
                         cl::Event& event)
{
  // this step is done to have same boundary vales in intermedaite buffer as x
  cl::Event copyEvent;
  cl::Event iterationEvent;
  try
  {
    queue.enqueueCopyBufferRect(inoutBuffer, intermediateBuffer, bufferOrigin, 
                hostOrigin, region, deviceWidth * sizeof(T), 0, 
                deviceWidth * sizeof(T), 0, NULL, &copyEvent);
  } catch (const cl::Error& error)
  {
    std::cout << "  -> Problem in copying buffer x to y" << std::endl;
    std::cout << "  -> " << getErrorString(error) << std::endl;
    exit(0);
  }
  copyEvent.wait();

  int argCount = 0;
  try
  {
    kernel.setArg(argCount++, inoutBuffer);
    kernel.setArg(argCount++, domainHeight);
    kernel.setArg(argCount++, domainWidth);
    kernel.setArg(argCount++, domainDepth);
    kernel.setArg(argCount++, relaxationStencilBuffer);
    kernel.setArg(argCount++, stencilDataWidth);
    kernel.setArg(argCount++, RHSBuffer);
    kernel.setArg(argCount++, intermediateBuffer);
    kernel.setArg(argCount++, spatialStepSize);
    kernel.setArg(argCount++, localMemSize * sizeof(T), NULL);
    kernel.setArg(argCount++, localHeight);
    kernel.setArg(argCount++, localWidth);
    kernel.setArg(argCount++, localDepth);
  } catch (const cl::Error& error)
  {
    std::cout << "  -> Relaxation class, Problem in setting the argument of kernel" << 
                std::endl;
    std::cout << "  -> " << getErrorString(error) << std::endl;
    exit(0);
  }

  for (int i = 0; i < numberOfRelaxationSweep; ++i)
  {
    try
    {
      queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange, 
                                 NULL, &iterationEvent);
    }catch (const cl::Error& error)
    {
      std::cout << "  -> Relaxation class, Problem in enqueue kernel" << 
                   std::endl;
      std::cout << "  -> " << getErrorString(error) << std::endl;
      exit(0);
    }

    try
    {
      queue.finish();
    }catch (const cl::Error& error)
    {
      std::cout << "  -> Relaxation class, Problem in finishing kernel" << std::endl;
      std::cout << "  -> " << getErrorString(error) << std::endl;
      exit(0);
    }

    iterationEvent.wait();
    if (i == numberOfRelaxationSweep -1)
    {
       try
       {
          queue.enqueueCopyBufferRect(intermediateBuffer, inoutBuffer, bufferOrigin, 
                hostOrigin, region, deviceWidth * sizeof(T), 0, 
                deviceWidth * sizeof(T), 0, NULL, &event);   \\ event release here
       } catch (const cl::Error& error)
       {
           std::cout << "  -> Problem in copying buffer x to y" << std::endl;
           std::cout << "  -> " << getErrorString(error) << std::endl;
           exit(0);
       }
    } else
    {
       try
       {
          queue.enqueueCopyBufferRect(intermediateBuffer, inoutBuffer, bufferOrigin, 
                hostOrigin, region, deviceWidth * sizeof(T), 0, 
                deviceWidth * sizeof(T), 0, NULL, &copyEvent);
       } catch (const cl::Error& error)
       {
           std::cout << "  -> Problem in copying buffer x to y" << std::endl;
           std::cout << "  -> " << getErrorString(error) << std::endl;
           exit(0);
       }
       copyEvent.wait();
    }
  }
}

to run the code in main.cc I do like this

cl::Event event;
relaxation<T> fmgRelaxation   = new relaxation<T>(program, queue, context);
fmgRelaxation->relaxing(queue, xBuffer, intermediateBuffer[0], bBuffer, 
                                globalRange[0], localRange, preSweepNumber, 
                                deviceHeight[0], deviceWidth[0], deviceDepth[0],
                                bufferOrigin, hostOrigin, region[0], deviceWidth[0], 
                                spatialStepSize[0], event);
        event.wait(); // if the code wait here to everything in relaxation class are done????

// Do sth else

Other parameters created beforehand (I did not add all the code in last part since it is too long). I passed a reference to event in main.cc to relaxing method of relaxation class and the last EnqueueCopyBuffer in the relaxing method would release it (as I make short comment). now my question is the code in main.cc if the code is blocked (i.e. in host) to everything in relaxation class is done and then continue in main.cc. If not, how should I do it.

The problem is not from sync by using event, the problem stems from using

 try
  {
    queue.enqueueFillBuffer(intermediateBuffer, &value, 0,
                        distDeviceWidth * distDeviceHeight * distDeviceDepth * sizeof(T),
                        NULL, &copyEvent);
  }catch (const cl::Error& error)
  {
    std::cout << "  -> Prolongation class, Problem in enqueue fill buffer" << std::endl;
    std::cout << "  -> " << getErrorString(error) << std::endl;
    exit(0);
  }

  try
  {
    queue.finish();
  }catch (const cl::Error& error)
  {
    std::cout << "  -> Prolongation class, Problem in finishing fill buffer" << std::endl;
    std::cout << "  -> " << getErrorString(error) << std::endl;
    exit(0);
  }
  copyEvent.wait();

which produce garbage in intermediateBuffer. I use opencl 2.0 to build, I do not know why it happens