Hi,
I’m trying to test an APU with different buffer creation/allocation strategies. The algorithm is the classic Saxpy (y = ax + y, where x and y are vectors).
I encounter a problem when I try to perform a test where buffers are created with the USE_HOST_PTR flag and mapping/unmapping is performed. This is probably because I still do not fully understand the data transfer mechanism between host and device that lies behind the mapping.
To perform the test, I follow these steps:
- Allocate on the host the vectors X and Y (and Z, to test the computation on the host without overwriting Y, which has to be used later for the GPU)
- Create a buffer for X, using USE_HOST_PTR and passing the pointer;
- Create a buffer for Y, using USE_HOST_PTR and passing the pointer;
- Execute the kernel
- Call clEnqueueMapBuffer and wait for it to complete, so as to get consistent values for Y.
When I try to run it, I get a memory violation exception. The nested function is HeapAlloc(_crtheap, 0, size ? size : 1), as reported by the Visual Studio debugger.
The point where the exception is raised changes from time to time, but it is always located in the part of the code where I get the result of the computation, i.e. where the host tries to read Y after clEnqueueMapBuffer.
I post the relevant part of the code, hoping you can help me to find the mistake.
Global declarations:
// Host-side SAXPY operands (y = a*x + y). The three vectors are allocated
// by initHost(); they stay NULL until then.
cl_float * pX = NULL; // input vector x
cl_float * pY = NULL; // vector y: input to the kernel, and the storage bufY is created over
cl_float * pZ = NULL; // host reference result (written by vectorAddHost so pY is not overwritten)
cl_float a = 2.f; // the scalar multiplier a
Host initialization and computation:
void initHost(unsigned int length)
{
size_t sizeInBytes = length * sizeof(cl_float);
pX = (cl_float *) malloc(sizeInBytes);
if (pX == NULL)
throw(string("Error: Failed to allocate input memory on host
"));
pY = (cl_float *) malloc(sizeInBytes);
if (pY == NULL)
throw(string("Error: Failed to allocate input memory on host
"));
pZ = (cl_float *) malloc(sizeInBytes);
if (pZ == NULL)
throw(string("Error: Failed to allocate input memory on host
"));
for(int i = 0; i < length; i++)
{
pX[i] = cl_float(i);
pY[i] = cl_float(length-1-i);
}
}
// Host reference implementation of SAXPY:
//   pfResult[i] = a * pfData1[i] + pfData2[i]   for i in [0, iNumElements)
// `a` is the file-level scalar. Elements are processed in ascending order;
// a non-positive iNumElements leaves pfResult untouched.
void vectorAddHost(
    const float* pfData1,
    const float* pfData2,
    float* pfResult,
    int iNumElements)
{
    const float* x = pfData1;
    const float* y = pfData2;
    float* out = pfResult;

    // Count down the remaining elements while the three cursors advance.
    for (int remaining = iNumElements; remaining > 0; --remaining)
    {
        *out = a * *x + *y;
        ++x;
        ++y;
        ++out;
    }
}
Code to initialize and run OpenCL computation and to compare results:
//128 is the local work size
// The element count is 1024 * 128, i.e. a whole multiple of that size.
currNumElements = 128 * 1024;
/////////////////////////////////////////////////////////////////
// Allocate and initialize memory on the host
/////////////////////////////////////////////////////////////////
// Allocates pX, pY and pZ and fills pX and pY.
initHost(currNumElements);
/////////////////////////////////////////////////////////////////
// Test host
/////////////////////////////////////////////////////////////////
// Time the host reference computation. The result goes to pZ so that pY
// keeps its initial values for the OpenCL run below.
LARGE_INTEGER frequency;
LARGE_INTEGER cpu_start = startTimer(&frequency);
vectorAddHost(pX, pY, pZ, currNumElements);
double cpu_time = getTimer(frequency, cpu_start);
cout << "CPU TIME (CPU timer) = " << cpu_time << " ms" << endl;
// Record "<element count> <time>" for this run.
// NOTE(review): cpu_data is presumably an output stream opened elsewhere - confirm.
cpu_data << currNumElements << " " << cpu_time << endl;
/////////////////////////////////////////////////////////////////
// Start timer
/////////////////////////////////////////////////////////////////
// The timer starts before the buffers are created, so the measured
// "GPU time" also covers buffer creation and argument setup.
LARGE_INTEGER gpu_start = startTimer(&frequency);
/////////////////////////////////////////////////////////////////
// Create OpenCL memory buffers
/////////////////////////////////////////////////////////////////
// Both buffers are created over the existing host allocations
// (CL_MEM_USE_HOST_PTR) instead of allocating new storage.
// NOTE(review): pX and pY must stay allocated for as long as these
// buffers exist; they come from plain malloc() - confirm that its
// alignment is sufficient for the target device.
// X is only read by the kernel.
bufX = cl::Buffer(
context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
sizeof(cl_float) * currNumElements,
pX);
// Y is both read and written by the kernel.
bufY = cl::Buffer(
context,
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
sizeof(cl_float) * currNumElements,
pY);
/////////////////////////////////////////////////////////////////
// Set the arguments that will be used for kernel execution
/////////////////////////////////////////////////////////////////
// Argument order: 0 = X buffer, 1 = Y buffer, 2 = scalar a.
// NOTE(review): the kernel source is not shown - confirm that this
// matches its parameter list.
kernel.setArg(
0,
bufX);
kernel.setArg(
1,
bufY);
kernel.setArg(
2,
a);
/////////////////////////////////////////////////////////////////
// Enqueue the kernel to the queue
// with appropriate global and local work sizes
/////////////////////////////////////////////////////////////////
// No global offset (default-constructed NDRange), one work-item per
// element, work-groups of localSize items (localSize is defined
// outside this excerpt).
queue.enqueueNDRangeKernel(
kernel,
cl::NDRange(),
cl::NDRange(currNumElements),
cl::NDRange(localSize));
/////////////////////////////////////////////////////////////////
// Map buffers (get capability?)
/////////////////////////////////////////////////////////////////
cl_int err;
cl_float* pEnd = (cl_float*)queue.enqueueMapBuffer(bufY, TRUE, CL_MAP_READ, 0,
currNumElements * sizeof(cl_float), NULL, NULL, &err);
//err == 0
printf("%d
", err);
/////////////////////////////////////////////////////////////////
// Test gpu
/////////////////////////////////////////////////////////////////
queue.finish();
double gpu_time = getTimer(frequency, gpu_start);
cout << "GPU TIME (CPU timer) = " << gpu_time << " ms" << std::endl;
gpu_data << currNumElements << " " << gpu_time << endl;
if(verify(pEnd, pZ, currNumElements))
cout << "Verification SUCCESS" << endl;
else
cout << "Verification FAIL" << endl;
/////////////////////////////////////////////////////////////////
// Release host resources
/////////////////////////////////////////////////////////////////
cleanupHost();
Thank you very much!