Using events for measuring time

gggh000 · July 12, 2020, 7:17pm

I tried using events to measure the clEnqueueWriteBuffer (or read) but it is returning -7 which is a CL_PROFILING_INFO_NOT_AVAILABLE according to error code:

gist.github.com

https://gist.github.com/bmount/4a7144ce801e5569a0b6

opencl-error-codes.txt

CL_SUCCESS                                  0
CL_DEVICE_NOT_FOUND                         -1
CL_DEVICE_NOT_AVAILABLE                     -2
CL_COMPILER_NOT_AVAILABLE                   -3
CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
CL_OUT_OF_RESOURCES                         -5
CL_OUT_OF_HOST_MEMORY                       -6
CL_PROFILING_INFO_NOT_AVAILABLE             -7
CL_MEM_COPY_OVERLAP                         -8
CL_IMAGE_FORMAT_MISMATCH                    -9

This file has been truncated. show original

Here is my code below. This one is a porting attempt from similar example in Cuda by example code in p189

I also looked up example code from: https://www.eecis.udel.edu/~cavazos/cisc879/Lecture-9B.pdf

//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//

// A minimalist OpenCL program.

#include <CL/cl.h>
#include <stdio.h>

#define printDeviceInfo(X) printf(“\n%s: %s”, (X));
#define declareDeviceInfo(X) char str(X)[] = “(X)”;

#define SIZE 2048
#define LOCAL_WORK_SIZE 256
#define DEBUG 0
//#define SIZE (1010241024)
#define SIZE (110241024)

// A simple kernelfcn kernel
const char *source =

“kernel void kernelfcn( global uint *dev_c, global uint *dev_a, global uint *dev_b) \n”
“{ \n”
" uint tid = get_global_id(0); \n"
" dev_c[tid] = dev_a[tid] + dev_b[tid]; \n"
“} \n”;

float opencl_malloc_test(int size, int up, int hostAlloc, cl_context * context, cl_command_queue * queue) {
int *a;
cl_mem dev_a;
float elapsedTime = 0;
int ret;
cl_event evtWrite;
cl_ulong start, end;

if (hostAlloc) {
    //a = (int*)cudaHostAlloc((void**)&a, size * sizeof(*a), cudaHostAllocDefault);
    a = (int*)malloc(size * sizeof(*a));
} else {
    a = (int*)malloc(size * sizeof(*a));

    if (a == NULL) {
        printf("malloc fail with size %u.\n", size); 
        return 1;
    }
}

dev_a = clCreateBuffer( *context, CL_MEM_READ_WRITE, SIZE * sizeof(cl_uint), NULL, &ret);

if (ret) {
    printf("clCreateBuffer fail code %d.\n", ret);
    return 1;
}

if (up)
    //cudaMemcpy(dev_a, a, size * sizeof(*dev_a), cudaMemcpyHostToDevice);
    ret = clEnqueueWriteBuffer(*queue, dev_a, CL_TRUE, 0, SIZE * sizeof(cl_uint), a, NULL, NULL, NULL);
else
    //cudaMemcpy(a, dev_a, size * sizeof(*dev_a), cudaMemcpyDeviceToHost);
    //ret = clEnqueueReadBuffer(*queue, dev_a, CL_TRUE, 0, SIZE * sizeof(cl_uint), a, NULL, NULL, &evtWrite);
    ret = clEnqueueReadBuffer(*queue, dev_a, CL_TRUE, 0, SIZE * sizeof(cl_uint), a, NULL, NULL, NULL);

if (ret) {
    printf("clEnqueueWrite/ReadBuffer fail code %d.\n", ret);
    return 1;
}


ret = clGetEventProfilingInfo(evtWrite,CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

if (ret != 0) {
    printf("clGetEventProfilingInfo (END) failed with code %d.\n", ret);
    return 1;
}

ret = clGetEventProfilingInfo(evtWrite,CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

if (ret != 0) {
    printf("clGetEventProfilingInfo (END) failed with code %d.\n", ret);
    return 1;
}

elapsedTime  = (end - start) * 1.0e-6f;

/*
if (hostAlloc) {
    cudaFreeHost(dev_a);
} else {
    cudaFree(dev_a);
}
*/
//cudaFree(a);
//cudaEventDestroy(start);
//cudaEventDestroy(stop);
return elapsedTime;

}

int main(int argc, char ** argv)
{
int stat;
char str1[100];
ushort ushort1;
uint uint1;
ulong ulong1;
size_t strLen;
cl_int ret;
uint a[SIZE];
int i;

// 1. Get a platform.

cl_uint CONFIG_MAX_PLATFORMS=20;
cl_platform_id platforms[CONFIG_MAX_PLATFORMS];
cl_uint platforms_available;

clGetPlatformIDs(CONFIG_MAX_PLATFORMS, platforms, &platforms_available );
printf("\nNo. of platforms available: %d.\n", platforms_available);

for (int i = 0 ; i < platforms_available; i ++ ) {
    printf("Platform %d: %d.\n", i, platforms[i]);
}

// 2. Find a gpu/cpu device.

cl_uint CONFIG_MAX_DEVICES = 20;
cl_uint devices_available;

enum enum_device_info_types {DEVINFO_STRING=1, DEVINFO_USHORT=2, DEVINFO_UINT=3, DEVINFO_ULONG=4, DEVINFO_SIZE_T=5};

enum enum_device_info_types device_info_types[] = {
    DEVINFO_STRING, \
    DEVINFO_STRING, \    
    DEVINFO_STRING, \    
    DEVINFO_STRING, \    
    DEVINFO_ULONG, \    
    DEVINFO_ULONG, \    
    DEVINFO_USHORT, \    
    DEVINFO_UINT, \    
    DEVINFO_UINT, \    
    DEVINFO_SIZE_T, \    
    DEVINFO_UINT, \    
    DEVINFO_SIZE_T, \    
    DEVINFO_USHORT, \    
    DEVINFO_STRING, \    
DEVINFO_SIZE_T \
};
char *str_device_info[]={\
    "CL_DEVICE_NAME", \
    "CL_DEVICE_VENDOR", \
    "CL_DEVICE_VERSION", \
    "CL_DRIVER_VERSION", \
    "CL_DEVICE_GLOBAL_MEM_SIZE", \
    "CL_DEVICE_LOCAL_MEM_SIZE", \
    "CL_DEVICE_LOCAL_MEM_TYPE", \
    "CL_DEVICE_MAX_CLOCK_FREQUENCY", \
    "CL_DEVICE_MAX_COMPUTE_UNITS", \
    "CL_DEVICE_MAX_WORK_GROUP_SIZE", \
    "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS", \
    "CL_DEVICE_MAX_WORK_ITEM_SIZES", \
    "CL_DEVICE_TYPE", \
    "CL_DEVICE_EXTENSIONS", \
"CL_DEVICE_MAX_PARAMETER_SIZE" \

};
cl_device_id device[CONFIG_MAX_DEVICES];
cl_device_info deviceInfos[]={\
    CL_DEVICE_NAME, \
    CL_DEVICE_VENDOR, \
    CL_DEVICE_VERSION, \
    CL_DRIVER_VERSION, \
    CL_DEVICE_GLOBAL_MEM_SIZE, \
    CL_DEVICE_LOCAL_MEM_SIZE, \
    CL_DEVICE_LOCAL_MEM_TYPE, \
    CL_DEVICE_MAX_CLOCK_FREQUENCY, \
    CL_DEVICE_MAX_COMPUTE_UNITS, \
    CL_DEVICE_MAX_WORK_GROUP_SIZE, \
    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, \
    CL_DEVICE_MAX_WORK_ITEM_SIZES, \
    CL_DEVICE_TYPE, \
    CL_DEVICE_EXTENSIONS, \
CL_DEVICE_MAX_PARAMETER_SIZE \

};
stat = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, CONFIG_MAX_DEVICES, device, &devices_available);

printf("No. of devices available: %d.\n", devices_available);

// 3. Create a context and command queue on that device.

cl_context context = clCreateContext( NULL, 1,  &device[0], NULL, NULL, NULL);
cl_command_queue queue = clCreateCommandQueue( context, device[0], 0, NULL );

// 4. Perform runtime source compilation, and obtain kernel entry point.

cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, NULL );

if (ret) {
    printf("Error: clCreateProgramWithSource returned non-zero: %d.\n", ret);
    return 1;
} else  {
    printf("clCreateProgramWithSource return OK.... %d.\n", ret);
}

float elapsedTime;

printf("clCreateBuffer test:\n");
float MB = (float)100 * SIZE * sizeof(int) / 1024 / 1024;
elapsedTime = opencl_malloc_test(SIZE, CL_TRUE, 0, &context, &queue);
printf("Time using clCreateBuffer(up): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));
elapsedTime = opencl_malloc_test(SIZE, CL_FALSE, 0, &context, &queue);
printf("Time using clCreateBuffer(down): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));

printf("Hostalloc test:\n");
elapsedTime = opencl_malloc_test(SIZE, CL_TRUE, 1, &context, &queue);
printf("Time using clCreateBuffer(up): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));
elapsedTime = opencl_malloc_test(SIZE, CL_FALSE, 1, &context, &queue);
printf("Time using clCreateBuffer(down): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));

printf("\n");
return 0;

}

bashbaug · July 13, 2020, 4:16pm

Hi gggh000!

To query event profiling time you need to create your command queue with profiling enabled.

See: CL_QUEUE_PROFILING_ENABLE

This basically means that instead of passing 0 as your queue properties:

You’ll want to pass CL_QUEUE_PROFILING_ENABLE instead:

cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
cl_command_queue queue = clCreateCommandQueue( context, device[0], properties, NULL );

Give this a try and see if it fixes the CL_PROFILING_INFO_NOT_AVAILABLE error.

gggh000 · July 13, 2020, 7:19pm

Hi bashbaug, after posting in OP, I actually found and already did albeit it is bit different, I put that property directly into the clCreateCommandQueue function argument, nevertheless I put yours and still causing the segfault.

I put also error check by passing the last argument and return value and clCreateCommandQueue itself returns OK only later calls to clGetEventProfilingInfo to check the profile is causing the segfault, because if I comment out, it runs fine, except, obviously timing information is missing.
1.
cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
cl_command_queue queue = clCreateCommandQueue( context, device[0], properties, &ret );
//cl_command_queue queue = clCreateCommandQueue( context, device[0], CL_QUEUE_PROFILING_ENABLE, NULL );

if (ret) {
    printf("Error: clCreateCommandQueue returned non-zero: %d.\n", ret);
    return 1;
} else  {
    printf("clCreateCommandQueue return OK.... %d.\n", ret);
}

…

// still segfault.
No. of platforms available: 1.
Platform 0: -113762544.
No. of devices available: 1.
clCreateCommandQueue return OK… 0.
clCreateProgramWithSource return OK… 0.
clCreateBuffer test:
Segmentation fault (core dumped)

commented out:
/*
ret = clGetEventProfilingInfo(evtWrite,CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

if (ret != 0) {
printf(“clGetEventProfilingInfo (END) failed with code %d.\n”, ret);
return 1;
}

ret = clGetEventProfilingInfo(evtWrite,CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

if (ret != 0) {
printf(“clGetEventProfilingInfo (END) failed with code %d.\n”, ret);
return 1;
}
*/

it runs fine:

++ -o p189 p189.o -lOpenCL -L/opt/rocm/opencl//lib/x86_64

No. of platforms available: 1.
Platform 0: 2076733200.
No. of devices available: 1.
clCreateCommandQueue return OK… 0.
clCreateProgramWithSource return OK… 0.
clCreateBuffer test:
Time using clCreateBuffer(up): 0.0 ms.
MB/s during copy up: inf.
Time using clCreateBuffer(down): 0.0 ms.
MB/s during copy up: inf.
Hostalloc test:
Time using clCreateBuffer(up): 0.0 ms.
MB/s during copy up: inf.
Time using clCreateBuffer(down): 0.0 ms.
MB/s during copy up: inf.

bashbaug · July 13, 2020, 9:04pm

Hmm, if you’re getting a segfault and not an OpenCL error then something else must be going on.

Just to confirm, are you actually creating the event you are querying? The reason I’m asking is because in the code snip above it is commented out:

gggh000:

    //ret = clEnqueueReadBuffer(*queue, dev_a, CL_TRUE, 0, SIZE * sizeof(cl_uint), a, NULL, NULL, &evtWrite);
    ret = clEnqueueReadBuffer(*queue, dev_a, CL_TRUE, 0, SIZE * sizeof(cl_uint), a, NULL, NULL, NULL);

My usual recommendation for debugging issues like these is to run your program with the OpenCL Intercept Layer, with the CallLogging control enabled:

With a call log it should be obvious whether the event is being properly created and whether the right parameters are being passed to clGetEventProfilingInfo.

gggh000 · July 14, 2020, 8:18am

ok, I will try that, i briefly tried intercept layer from ur respnse in my previous post, however has not studied well yet.

E.Peshkov · July 14, 2020, 1:53pm

Hello @gggh000.
Try CLtracer

gggh000 · July 14, 2020, 8:05pm

Peshkob, that looks like a nice tool. I have some experience using AMD RGP in the past, it is fairly complicated and comprehensive tool, it looks like similar to it Radeon™ GPU Profiler - AMD GPUOpen

gggh000 · July 15, 2020, 4:51am

Good news! Previously I ran on AMD engineering sample and was causing above mentioned error. Today at home computer, I tried on RX VEGA 56 and voila, it runs!

The only issue remaining is cuda equivalent average about 4-6GB/sec with GTX1080, and this one is reporting +200GB/sec with VEGA56, that can not be right. I need to look at the calculations, units of measurements etc.,

^~~~~~~~~~~~~~~~~~~~
g++ -o p189 p189.o -lOpenCL -L/opt/rocm/opencl//lib/x86_64

No. of platforms available: 1.
Platform 0: -1169859280.
No. of devices available: 1.
clCreateProgramWithSource return OK… 0.
clCreateBuffer test:
clWaitForEvent return code 0.
Time using clCreateBuffer(up): 2.0 ms.
MB/s during copy up: 197727.3.
clWaitForEvent return code 0.
Time using clCreateBuffer(down): 2.0 ms.
MB/s during copy up: 203530.5.
Hostalloc test:
clWaitForEvent return code 0.
Time using clCreateBuffer(up): 1.5 ms.
MB/s during copy up: 271500.8.
clWaitForEvent return code 0.
Time using clCreateBuffer(down): 1.4 ms

E.Peshkov · July 15, 2020, 11:15am

CLtracer is very easy to use. You need to select an executable and start an app. After profiling is finished you’ll get timeline of all OpenCL commands, their device and host timings, NDRanges, amount of transfered data, etc. There are useful performance metrics. Also CLtracer supports any OpenCL device (not only AMD Radeon) and available on both Linux and Windows.

system · January 14, 2021, 11:26am

This topic was automatically closed 183 days after the last reply. New replies are no longer allowed.