Atomic compare and swap

Hi everyone,

I’m trying to use the atom_cmpxchg (OpenCL version is 1.0, GPU is NVIDIA 9600M GT) function, but I cant manage to get the expected result: the swap dont happen.

I tried to find a example (my code is rather long, so I looked for something simpler), but I can’t make it work either (this code can be found on the internet, but I just dont remember the address right now):

The output result:

	Old A = 500
	New A 500

I may not have understood what this function is supposed to do…
Given the prototype of the function atom_cmpxchg (__global int *p, int cmp, int val): I want to swap the value at *p by val if and only if *p == cmp (store the old value of *p if *p!=cmp). Is that right? Or I am missing something?

Thanks for your help!

The kernel:

__kernel void atomiccmpxchg(__global int *old, __global int *new)
	__local int v,v1;
        v = 500;
	*old = atom_cmpxchg(new,v,v1);

The host code:

#include <iostream>
#include <cstdlib>
#include <fstream>
#include <string>
#if defined __APPLE__ || defined (MACOSX)
#include <OpenCL/cl.h>
#include <CL/cl.h>

using namespace std;

void err_check( int err, string err_code ) {
	if ( err != CL_SUCCESS ) {
		cout << "Error: " << err_code << "(" << err << ")" << endl;

int main()
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem mobj_a = NULL;
	cl_mem mobj_b = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int err;
	int a, b;
	a = 500;
	b = 500;
	// Get platform/device information 
	err = clGetPlatformIDs( 1, &platform_id, &ret_num_platforms );
	err_check( err, "clGetPlatformIDs" );
	// Get information about the device
	err = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices );
	err_check( err, "clGetDeviceIDs" );
	// Create OpenCL Context
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &err );
	err_check( err, "clCreateContext" );
	// Create Command Queue
	command_queue = clCreateCommandQueue( context, device_id, CL_QUEUE_PROFILING_ENABLE, &err );
	err_check( err, "clCreateCommandQueue" );
	// Create memory objects and tranfer the data to memory buffer
	mobj_a = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int), NULL, &err );
	err = clEnqueueWriteBuffer( command_queue, mobj_a, CL_TRUE, 0, sizeof(int), &a, 0, NULL, NULL );
	err_check( err, "clEnqueueWriteBuffer" );
	mobj_b = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int), NULL, &err );
	err = clEnqueueWriteBuffer( command_queue, mobj_b, CL_TRUE, 0, sizeof(int), &b, 0, NULL, NULL );
	err_check( err, "clEnqueueWriteBuffer" );	
	// Read kernel file
	ifstream file("");
	string prog( istreambuf_iterator<char>( file ), ( istreambuf_iterator<char>() ) );
	const char *source_str = prog.c_str();
	// Create Kernel program from the read in source
	program = clCreateProgramWithSource( context, 1, (const char **) &source_str, 0, &err );
	err_check( err, "clCreateProgramWithSource" );
	// Build Kernel Program
	err = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL );
  size_t len;
  char buffer[2048];
  clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                        sizeof(buffer), buffer, &len);
  printf("--- Build log ---
", buffer);
	err_check( err, "clBuildProgram" );
	// Create OpenCL Kernel
	kernel = clCreateKernel( program, "atomiccmpxchg", &err );
	err_check( err, "clCreateKernel" );
	//  Set OpenCL kernel argument
	err = clSetKernelArg( kernel, 0, sizeof( cl_mem ), (void *) &mobj_a );
	err_check( err, "clSetKernelArg" );
	err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *) &mobj_b );
	err_check( err, "clSetKernelArg" );
	//  Execute OpenCL kernel in task parallel
	clEnqueueTask( command_queue, kernel, 0, NULL, NULL );
	err_check( err, "clEnqueueTask" );	
	//  Read (Transfer result) from the memory buffer
	err = clEnqueueReadBuffer( command_queue, mobj_a, CL_TRUE, 0, sizeof(int), &a, 0, NULL, NULL );
	err = clEnqueueReadBuffer( command_queue, mobj_b, CL_TRUE, 0, sizeof(int), &b, 0, NULL, NULL );
	// Free objects
	err = clFlush( command_queue );
	err = clFinish( command_queue );
	err = clReleaseKernel( kernel );
	err = clReleaseProgram( program );
	err = clReleaseMemObject( mobj_a );
	err = clReleaseMemObject( mobj_b );
	err = clReleaseCommandQueue( command_queue );
	err = clReleaseContext( context );
	// Display result
	cout << "	Old A = " << a << endl;
	cout << "	New A " << b << endl;
	return 0;

You aren’t checking the return code from clEnqueueTask. Your code is

clEnqueueTask( command_queue, kernel, 0, NULL, NULL );

But it should be

err = clEnqueueTask( command_queue, kernel, 0, NULL, NULL );

Er… well yes, indeed it was missing. It does not help though (I’ve added the return code check, just in case…).
As I was saying, it is not my code, I was just looking for a minimal example to try this function. The swap wont happen, either in this example or in my own program.

Have you enabled atomic operations? Might be disabled?

NOTE: The atomic built-in functions that use the atom_ prefix and are described by the
following extensions
in sections 9.5 and 9.6 of the OpenCL 1.0 specification are also supported.

Since you first write to local memory when settings v and v1, then read this local memory in the atom_cmpxchg() function, you should insert a local memory fence before calling atom_cmpxchg.

Yep, atomics operations are enabled (I tripled checked)

I’ve added the barrier, no change though, still cant manage to swap the values.
I tried with privates variables instead, so that no barrier is needed, still nothing.

I can’t see any error in your program. In fact, it works fine on a 9600M GS (driver 306.23)

Have you tried with a simple assignment:

*old = *new;
if (*new == v) *new = v1;

Yes it could be done that way, but I need to prevent any workitem to interfer in an other workitem’s instructions (would cause data inconsistency).

That’s really weird…