uint size

Hello all,
So I’ve written a fair amount of OpenCL code to do various math operations on large integers. All those work fine with a single core of the CPU executing them (the functions implemented thus far are not meant to be parallelized themselves, but are used by the parallelized function). So today I was moving on to the part of the program that is to be parallel. First off I switched to using the GPU, and executed a hello-world program just to make certain my host program was functioning correctly on the GPU. I figured all the rest would work as well, but that’s when I hit problems…

I’ve narrowed down the problem to be a difference in sizes of integers (or so it seems). So I wrote a small kernel shown below, which accepts two inputs (an output buffer, and an input buffer). It is supposed to copy the input buffer to the output buffer, but that doesn’t happen correctly. Can anyone help out here?

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

// Copies one uint per work-item from `in` to `out`.
//   out: destination buffer, one element per global work-item
//   in:  source buffer, same length as the NDRange
__kernel void hello(__global uint * out, __global uint * in) {
    size_t tid = get_global_id(0);  // this work-item's global index
    out[tid] = in[tid];
}  // NOTE(review): closing brace was missing in the pasted listing

For both the input and output buffers, in the host program I have defined them as follows:

unsigned int * outH = new unsigned int[2];
unsigned int * input = new unsigned int[2];

When the output comes out, the zeroth element is 6619691 and the first element is 0. Looking at the bit pattern of the zeroth element I see that the 12 least significant bits match the bit pattern of 555. The 4 bits more significant than those are 0. So if I cast the zeroth element to a short it displays 555 correctly. With this knowledge I looked into the device’s CL_DEVICE_ADDRESS_BITS, which returns 32. My host system is a 64-bit system. I think this has something to do with my problem, but I can’t justify it, as I’m pretty certain that would not change the size of an unsigned integer. Can anyone offer some insight?


Perhaps more information would aid in figuring out the problem. So here is the host program, note this is an adaption of the “Hello World” program which I’m sure you all have seen before online.

Host Program:

#include <utility> // for pairs
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#include <CL/cl.hpp>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <iterator>

using namespace std;

void chkError(cl_int, string);

// Host program: sets up an OpenCL context/queue, builds the "hello" kernel,
// runs it, and reads two uints back.
// NOTE(review): many lines of this listing were truncated by the forum paste
// (calls cut off mid-argument-list); the truncations are flagged inline below.
int main (int argc, char * argv[]) {
	// Get a list of available OpenCL Platforms
	cl_int err;
	cl::vector <cl::Platform> platforms;
	// NOTE(review): the call that fills `platforms` (presumably
	// cl::Platform::get(&platforms)) was lost in the paste — confirm.

	// See if there was an error getting the platforms
	// NOTE(review): line truncated in paste — message argument and ");" missing.
	chkError (platforms.size()!=0 ? CL_SUCCESS : -1, 
	cout << "Platforms returned from CL: " << platforms.size()
		<< endl;

	// Get the vendor of the first returned platform
	// NOTE(review): the getInfo call that fills `platformVendor` (presumably
	// platforms[0].getInfo(CL_PLATFORM_VENDOR, &platformVendor)) was lost in the paste.
	string platformVendor;
    cout << "Platform Vendor: " << platformVendor << endl;

	// Setup the context of our OpenCL program
	// NOTE(review): initializer truncated — the opening "{" and the
	// CL_CONTEXT_PLATFORM key are missing before the cast on the next line.
	cl_context_properties cprops[3] =
		(cl_context_properties)(platforms[0])(), 0};
	// NOTE(review): constructor call truncated — device type, cprops, and &err
	// arguments missing.
    cl::Context context(
	// Make sure there were no errors
    chkError(err, "Context::Context()");    
	// This creates some memory for our OpenCL program to use 
	//	directly, giving it a pointer to the memory and giving it 
	//	write access
	unsigned int * outH = new unsigned int[2];
	// NOTE(review): Buffer constructor truncated — flags, size (in BYTES, so
	// 2 * sizeof(unsigned int)), host pointer, and &err arguments missing.
    cl::Buffer outCL(
	// Make sure there were no errors
    chkError(err, "Buffer::Buffer()");

	unsigned int * input = new unsigned int[2];

	// NOTE(review): Buffer constructor truncated — same argument list as outCL.
	cl::Buffer inCL(

	// Gives us a list of devices to be used in the context
	cl::vector<cl::Device> devices;
    devices = context.getInfo<CL_CONTEXT_DEVICES>();
	// Make sure there were no errors
	// NOTE(review): line truncated in paste — message argument and ");" missing.
    chkError(devices.size() > 0 ? CL_SUCCESS : -1, 

	// Open our OpenCL program for reading
	// NOTE(review): no filename is ever passed to `file` in this listing —
	// the open call was presumably lost in the paste.
	ifstream file; 
	// Make sure there were no errors
	// NOTE(review): line truncated in paste — message argument and ");" missing.
    chkError(file.is_open() ? CL_SUCCESS : -1, 

	// Create a string of our source
	// NOTE(review): truncated — second iterator argument and ");" missing; the
	// usual form needs extra parentheses to avoid the most-vexing parse.
    string prog(istreambuf_iterator<char>(file),
	// Create a source from our source code string
	// NOTE(review): constructor call truncated before the make_pair argument.
  	cl::Program::Sources source(
        make_pair(prog.c_str(), prog.length()+1));
	// Compile and build the program for source for our device
    cl::Program program(context, source);
    err = program.build(devices,""); 
    chkError(err, "Program::build()");

	// Build a kernel and tell it to start in the hello() method
	cl::Kernel kernel(program, "hello", &err);
    chkError(err, "Kernel::Kernel()");
	// Set the argument at index 0 for our kernel to be the buffer
	//	we made for the program
    err = kernel.setArg(0, outCL);
    chkError(err, "Kernel::setArg(0)");
	err = kernel.setArg(1, inCL);
    chkError(err, "Kernel::setArg(1)");

	// Create a queue of work to be done on device[0]
	cl::CommandQueue queue(context, devices[0], 0, &err);
    chkError(err, "CommandQueue::CommandQueue()");
    cl::Event event;
	// Enqueue the workitems on the device
	// NOTE(review): call truncated — offset, local NDRange, and event
	// arguments missing after the global NDRange on the next line.
    err = queue.enqueueNDRangeKernel(
        cl::NDRange(1, 1), 
    chkError(err, "CommandQueue::enqueueNDRangeKernel()");

	// Blocks until all work items have been completed
	// Reads the buffer outCL which was updated by the kernel into
	//	the local variable outH
	// NOTE(review): call truncated — THIS is where the reported bug lives:
	// the author's follow-up says the size argument was 2 (bytes) instead of
	// 2 * sizeof(unsigned int), so only 2 of the 8 bytes were read back.
    err = queue.enqueueReadBuffer(
    chkError(err, "CommandQueue::enqueueReadBuffer()");
    cout << "0: " << outH[0] << endl;
	cout << "1: " << outH[1] << endl;

	// NOTE(review): outH/input are never delete[]d and main's closing "}" was
	// lost in the paste.
	return 0;

// Prints an error tag to stderr and aborts the process when `status` is not
// CL_SUCCESS; otherwise does nothing.
//   status: an OpenCL status code returned by the runtime (or a caller-made -1)
//   name:   label identifying the call site, echoed in the error message
void chkError (cl_int status, string name) {
	if (status != CL_SUCCESS) {
		cerr << "::ERROR:: " << name << endl;
		exit (EXIT_FAILURE);
	}
}  // NOTE(review): both closing braces were missing in the pasted listing
And the full kernel code, once again:

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
//__constant char hw[] = "Hello World";  // leftover from the Hello World sample

// Copies one uint per work-item from the input buffer `hw` to `out`.
//   out: destination buffer, one element per global work-item
//   hw:  source buffer, same length as the NDRange
__kernel void hello(__global uint * out, __global uint * hw)
{  // NOTE(review): opening and closing braces were missing in the pasted listing
    size_t tid = get_global_id(0);
    out[tid] = hw[tid];
}

Seems the devil is in the details once again… The size argument to enqueueReadBuffer is in bytes, and I passed 2 instead of 2 * sizeof(unsigned int) — so I was only reading 2 bytes back, hence why my answer only stored two bytes.

    err = queue.enqueueReadBuffer(