OpenCL on MacBook Pro with NVidia 320m

Hey all

I'm trying to set up an application to run some calculations on my video card. The problem is that my CPU is much faster than the GPU.

When i start the program, i get the following msg:

Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3

Does it work on any of your systems? If so, where is my mistake?

The Kernel:

__kernel void
add(__global float a,
__global float b,
__global float answer)
{
int gid = get_global_id(0);
answer[gid] = a[gid] + b[gid];
answer[gid] = 0.460.48
6.54
4.21 * (10.56
sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.544.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56
sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.544.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56
sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.544.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56
sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.544.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56
sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.544.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56
sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.544.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] = 0.460.48
6.54
4.21 * (10.56sin(a[gid]) + 3.47 * cos(b[gid])b[gid]a[gid]);
answer[gid] /= 0.46
0.48
6.54
4.21 * (10.56*sin(a[gid]) + 3.47 * cos(b[gid])*b[gid]*a[gid]);
}

My main.c:

#ifdef __APPLE__
#include <OpenGL/OpenGL.h>
#include <GLUT/glut.h>
//#include <OpenGL/glu.h>
#else
#include <GL/glut.h>
//#include <GL/glu.h>
#endif

#include <OpenCL/OpenCL.h>
#include <iostream>
#include <assert.h>
#include <sys/sysctl.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>

#pragma mark -
#pragma mark Utilities
char * load_program_source(const char *filename)
{
struct stat statbuf;
FILE *fh;

char *source;
fh = fopen(filename, “r”);
if (fh == 0)
return 0;

stat(filename, &statbuf);
source = (char *) malloc(statbuf.st_size + 1);
fread(source, statbuf.st_size, 1, fh);
source[statbuf.st_size] = ‘\0’;
return source;
}

#pragma mark -
#pragma mark Main OpenCL Routine
int runCL(float * a, float * b, float * results, int n)
{
cl_program program[1];
cl_kernel kernel[1];

cl_command_queue cmd_queue;
cl_context context;

cl_device_id cpu = NULL, device = NULL;

cl_int err = 0;
size_t returned_size = 0;
size_t buffer_size;

cl_mem a_mem, b_mem, ans_mem;

#pragma mark Device Information
{
// Find the CPU CL device, as a fallback
//26:00
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
assert(err == CL_SUCCESS);

// Find the GPU CL device, this is what we really want
// If there is no GPU device is CL capable, fall back to CPU
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
//if (err != CL_SUCCESS)
  device = cpu;
assert(device);
    
// Get some information about the returned device
cl_char vendor_name[1024] = {0};
cl_char device_name[1024] = {0};
cl_uint max_compute_units = 0;
size_t max_work_groub_size = 0;
cl_uint max_work_item_dimensions = 0;


//27:00
err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 
  	  vendor_name, &returned_size);

err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 
  	   device_name, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), 
  	   &max_compute_units, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_groub_size), 
  	   &max_work_groub_size, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions), 
  	   &max_work_item_dimensions, &returned_size);
assert(err == CL_SUCCESS);
printf("Connecting to %s %s, 

max_compute_units: %d
max_work_groub_size: %zu
max_work_item_dimensions: %d…
", vendor_name, device_name, max_compute_units, max_work_groub_size, max_work_item_dimensions);

}

#pragma mark Context and Command Queue
{
// Now create a context to perform our calculation with the
// specified device
context = clCreateContext(0, 1, &device, NULL, NULL, &err);
assert(err == CL_SUCCESS);

// And also a command queue for the context
cmd_queue = clCreateCommandQueue(context, device, 0, NULL);

}

#pragma mark Program and Kernel Creation
{
// Load the program source from disk
// The kernel/program is the project directory and in Xcode the executable
// is set to launch from that directory hence we use a relative path
const char * filename = “example.cl”;
char program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char
*)&program_source,
NULL, &err);

assert(err == CL_SUCCESS);
    
// 28:40
err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
assert(err == CL_SUCCESS);

// Now create the kernel "objects" that we want to use in the example file 
kernel[0] = clCreateKernel(program[0], "add", &err);

}

#pragma mark Memory Allocation
{

// Allocate memory on the device to hold our data and store the results into
buffer_size = sizeof(float) * n;

// Input array a
//30:10
a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);

//32:20
err = clEnqueueWriteBuffer(cmd_queue, a_mem, CL_TRUE, 0, buffer_size,
  	       (void*)a, 0, NULL, NULL);

// Input array b
b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);

err |= clEnqueueWriteBuffer(cmd_queue, b_mem, CL_TRUE, 0, buffer_size,
  		(void*)b, 0, NULL, NULL);

assert(err == CL_SUCCESS);

// Results array
ans_mem= clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); 

// Get all of the stuff written and allocated 
clFinish(cmd_queue);

}

#pragma mark Kernel Arguments
{

// Now setup the arguments to our kernel
//33:48
err  = clSetKernelArg(kernel[0],  0, sizeof(cl_mem), &a_mem);
err |= clSetKernelArg(kernel[0],  1, sizeof(cl_mem), &b_mem);
err |= clSetKernelArg(kernel[0],  2, sizeof(cl_mem), &ans_mem);

assert(err == CL_SUCCESS);

}

#pragma mark Execution and Read
{

// Run the calculation by enqueuing it and forcing the 
// command queue to complete the task
size_t global_work_size = n;
//33:59
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, 
  		 &global_work_size, NULL, 0, NULL, NULL);

   
assert(err == CL_SUCCESS);
clFinish(cmd_queue);

// Once finished read back the results from the answer 
// array into the results array
//35:35
err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, buffer_size, 
  	      results, 0, NULL, NULL);

assert(err == CL_SUCCESS);
clFinish(cmd_queue);

}

#pragma mark Teardown
{
clReleaseMemObject(a_mem);
clReleaseMemObject(b_mem);
clReleaseMemObject(ans_mem);

clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);

}
return CL_SUCCESS;
}

int main(int argc, char **argv) {
// Problem size
// int n = 20481616164;
int n = 40;

// Allocate some memory and a place for the results
float * a = (float )malloc(nsizeof(float));
float * b = (float )malloc(nsizeof(float));
float * results = (float )malloc(nsizeof(float));

// Fill in the values
for(int i=0;i<n;i++) {
a[i] = (float)i;
b[i] = (float)n-i;
results[i] = 0.f;
}

// Do the OpenCL calculation
runCL(a, b, results, n);

// Print out some results.
// for(int i=0;i<n;i++)
//if (i+1 != results[i])
// printf("%f
",results[i]);

printf("%f
",results[n-1]);

// Free up memory
free(a);
free(b);
free(results);

return 0;
}

thank you

What information do you have saying the CPU is faster than your GPU?

Also, that message is a good message - it means that you have initialized your hardware correctly.

The main thing I see that might be problematic is the number of memory accesses you have in the kernel. IIRC, the global memory can be fickle — depending on the implementation, it varies where it is actually being stored. Your implementation might be storing the global memory in main RAM instead of on the GPU, thus causing a huge amount of communication to occur. <Note, I could be wrong on this — David.Garcia should correct me on this>.

You could try and improve this by something along the lines of:


__kernel void add(__global float *a, __global float *b, __global float *answer)
{
int gid = get_global_id(0);
int iValA = a[gid];
int iValB = b[gid];

... Rest of your code...

This more or less would store a value of a[] and b[] for that gid point on the card. Thus, it would minimize the number of accesses you are doing. If the communication overhead is really your issue, this should take care of it.

I don’t see anything else glaring at me for this issue. I’ll take a look at it tonight when I am at home and can run it on my non-Mac desktop to see if I can reproduce your slowness.

A good compiler will optimize for those memory accesses and use a local variable as HolyGeneralK suggested, but i wouldn’t try to rely on that. If, however, the low-level vm code is making those accesses on-the-fly and executing all these computations sequentially, it wouldn’t surprise me that the CPU version is faster, as memory accesses are cheaper and sequential functions more efficient on CPU than GPU.

To make it more GPU-friendly, try something like


int gid = get_global_id(0);
float valA = a[gid];
float valB = b[gid];
float resultValA;
float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations

resultValA = valA + valB;
resultValA *= multiplier;
resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
// ... repeat lines above as needed

answer[gid] = resultValA;

only three memory accesses and far less sequential computations (again, this may be done by the compiler, but not necessarily)

Hey all

The short version is: chai’s tip is working very well.

i did a little bit of modification on my code above and also tried the following 3 kernels:

kernel 1:

__kernel void
add(__global float *a,
             __global float *b,
             __global float *answer)
{
        int gid = get_global_id(0);
        answer[gid] = a[gid] + b[gid];
	for (int i = 1; i < 100000; i++) {
            answer[gid] *= i*(10.56*sin(a[gid]) + 3.47 * cos(b[gid])*b[gid]*a[gid]);
            answer[gid] /= i*(10.56*sin(a[gid]) + 3.47 * cos(b[gid])*b[gid]*a[gid]);
        }
}

kernel 2 (inspired by HolyGeneralK):

__kernel void
add(__global float *a,
             __global float *b,
             __global float *answer)
{
        int gid = get_global_id(0);
        float iValA = a[gid];
	float iValB = b[gid];
        answer[gid] = iValA + iValB;
        for (int i = 1; i < 100000; i++) {
            answer[gid] *= i*(10.56*sin(iValA) + 3.47 * cos(iValB)*iValB*iValA);
            answer[gid] /= i*(10.56*sin(iValA) + 3.47 * cos(iValB)*iValB*iValA);
	}
}

kernel 3 (inspired by chai):

__kernel void
add( __global float *a,
     __global float *b,
     __global float *answer)
{
        int gid = get_global_id(0);
        float valA = a[gid];
        float valB = b[gid];
        float resultValA;
        float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
        float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations

	resultValA = valA + valB;
	for (int i = 1; i < 100000; i++) {
            resultValA *= multiplier;
            resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
	}

        answer[gid] = resultValA;
}

Well, the main function now calls runCL(…) with both, GPU and CPU version. It also messures the time in seconds. Here are the outputs:

with kernel 1:

n = 200

GPU…
Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3…

seconds = 9

CPU…
Connecting to Intel Intel® Core™2 Duo CPU P8600 @ 2.40GHz,
max_compute_units: 2
max_work_groub_size: 1
max_work_item_dimensions: 3…

seconds = 1

with n = 2000 i get the following

Assertion failed: (err == CL_SUCCESS), function runCL, file main.cc, line 199.
Abort trap

with kernel 2:

n = 2000

GPU…
Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3…

seconds = 13

CPU…
Connecting to Intel Intel® Core™2 Duo CPU P8600 @ 2.40GHz,
max_compute_units: 2
max_work_groub_size: 1
max_work_item_dimensions: 3…

seconds = 4

for n = 20000 i get the following:

Assertion failed: (err == CL_SUCCESS), function runCL, file main.cc, line 199.
Abort trap

with kernel 3:

n = 200000

GPU…
Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3…

seconds = 13

CPU…
Connecting to Intel Intel® Core™2 Duo CPU P8600 @ 2.40GHz,
max_compute_units: 2
max_work_groub_size: 1
max_work_item_dimensions: 3…

seconds = 61

ok, now the main.cc:

#ifdef __APPLE__
#include <OpenCL/OpenCL.h>
#else
#include <CL/cl.h>
#endif


#include <iostream>
#include <assert.h>
#include <sys/sysctl.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>



#pragma mark -
#pragma mark Utilities
char * load_program_source(const char *filename)
{
  struct stat statbuf;
  FILE *fh;
   
  char *source;
  fh = fopen(filename, "r");
  if (fh == 0)
    return 0;
    
  stat(filename, &statbuf);
  source = (char *) malloc(statbuf.st_size + 1);
  fread(source, statbuf.st_size, 1, fh);
  source[statbuf.st_size] = '\0';
  return source;   
}

 
#pragma mark -
#pragma mark Main OpenCL Routine
int runCL(float * a, float * b, float * results, int n, int dev)
{
  cl_program program[1];
  cl_kernel kernel[1];

  cl_command_queue cmd_queue;
  cl_context   context;

  cl_device_id cpu = NULL, device = NULL;

  cl_int err = 0;
  size_t returned_size = 0;
  size_t buffer_size;

  cl_mem a_mem, b_mem, ans_mem;
  
#pragma mark Device Information
  {    
    // Find the CPU CL device, as a fallback
    //26:00
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
    assert(err == CL_SUCCESS);
     
    // Find the GPU CL device, this is what we really want
    // If there is no GPU device is CL capable, fall back to CPU
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS || dev > 0)
      device = cpu;
    assert(device);
        
    // Get some information about the returned device
    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    cl_uint max_compute_units = 0;
    size_t max_work_groub_size = 0;
    cl_uint max_work_item_dimensions = 0;
    
    
    //27:00
    err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 
			  vendor_name, &returned_size);
    
    err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 
			   device_name, &returned_size);

    err |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), 
			   &max_compute_units, &returned_size);

    err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_groub_size), 
			   &max_work_groub_size, &returned_size);

    err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions), 
			   &max_work_item_dimensions, &returned_size);
    
    assert(err == CL_SUCCESS);
    printf("Connecting to %s %s, 
max_compute_units: %d
max_work_groub_size: %zu 
max_work_item_dimensions: %d...
", vendor_name, device_name, max_compute_units, max_work_groub_size, max_work_item_dimensions);
  }
  
  
#pragma mark Context and Command Queue
  {
    // Now create a context to perform our calculation with the 
    // specified device 
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    assert(err == CL_SUCCESS);
        
    // And also a command queue for the context
    cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
  }
  
  
#pragma mark Program and Kernel Creation
  {
    // Load the program source from disk
    // The kernel/program is the project directory and in Xcode the executable
    // is set to launch from that directory hence we use a relative path
    const char * filename = "example3.cl";
    char *program_source = load_program_source(filename);
    program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
					   NULL, &err);
    
    assert(err == CL_SUCCESS);
        
    // 28:40
    err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
    assert(err == CL_SUCCESS);
    
    // Now create the kernel "objects" that we want to use in the example file 
    kernel[0] = clCreateKernel(program[0], "add", &err);
  }
  
  
#pragma mark Memory Allocation
  {
    
    // Allocate memory on the device to hold our data and store the results into
    buffer_size = sizeof(float) * n;
    
    // Input array a
    //30:10
    a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    
    //32:20
    err = clEnqueueWriteBuffer(cmd_queue, a_mem, CL_TRUE, 0, buffer_size,
			       (void*)a, 0, NULL, NULL);
    
    // Input array b
    b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    
    err |= clEnqueueWriteBuffer(cmd_queue, b_mem, CL_TRUE, 0, buffer_size,
				(void*)b, 0, NULL, NULL);
    
    assert(err == CL_SUCCESS);
    
    // Results array
    ans_mem= clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); 
    
    // Get all of the stuff written and allocated 
    clFinish(cmd_queue);
  }
  
  
#pragma mark Kernel Arguments
  {
    
    // Now setup the arguments to our kernel
    //33:48
    err  = clSetKernelArg(kernel[0],  0, sizeof(cl_mem), &a_mem);
    err |= clSetKernelArg(kernel[0],  1, sizeof(cl_mem), &b_mem);
    err |= clSetKernelArg(kernel[0],  2, sizeof(cl_mem), &ans_mem);
    
    assert(err == CL_SUCCESS);
    
  }
  
  
#pragma mark Execution and Read
  {
    
    // Run the calculation by enqueuing it and forcing the 
    // command queue to complete the task
    size_t global_work_size = n;
    //33:59
    err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, 
				 &global_work_size, NULL, 0, NULL, NULL);
    
       
    assert(err == CL_SUCCESS);
    clFinish(cmd_queue);
    
    // Once finished read back the results from the answer 
    // array into the results array
    //35:35
    err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, buffer_size, 
			      results, 0, NULL, NULL);
    
    assert(err == CL_SUCCESS);
    clFinish(cmd_queue);
    
  }
  
  
#pragma mark Teardown
  {
    clReleaseMemObject(a_mem);
    clReleaseMemObject(b_mem);
    clReleaseMemObject(ans_mem);
    
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
  }
  return CL_SUCCESS;
}



int main(int argc, char **argv) { 
  int n;
  if (argc < 2)
    n = 8;
  else
    n = atoi(argv[1]);

  printf( "
 n = %d
", n);

  struct timeval tp1;
  struct timeval tp2;
  
  // Allocate some memory and a place for the results
  float * a = (float *)malloc(n*sizeof(float));
  float * b = (float *)malloc(n*sizeof(float));
  float * results = (float *)malloc(n*sizeof(float));
  
  // Fill in the values
  for(int i=0;i<n;i++)  {
    a[i] = (float)i;
    b[i] = (float)n-i;
    results[i] = 0.f;
  }
  
  // Do the OpenCL calculation
  printf("==========================================================
");
  printf("   GPU...
");
  gettimeofday(&tp1, NULL);
  runCL(a, b, results, n, 0); // GPU
  gettimeofday(&tp2, NULL);
  printf( "
");
  printf( "
   seconds = %ld
", tp2.tv_sec-tp1.tv_sec );

  printf("---------------------------------------------------------
");
  printf("   CPU...
");
  gettimeofday(&tp1, NULL);
  runCL(a, b, results, n, 1); // CPU
  gettimeofday(&tp2, NULL);
  printf( "
");
  printf( "
   seconds = %ld
", tp2.tv_sec-tp1.tv_sec );
  printf("---------------------------------------------------------
");
  
  // Free up memory
  free(a);
  free(b);
  free(results);
    
  return 0; 
}

im compiling with

g++-4.2 -Wall -O3 -funroll-loops  -MMD -MF release/main.d -c main.cc -o release/main.o
g++-4.2 -o release/main ./release/main.o -framework OpenCL

thank you all a lot. Now i can play with this code.

bye

thanks for the very thorough follow-up! glad the suggestions helped.

also, since you’re on NVIDIA, you might want to try the loop unrolling extension. It would take way too long to fully unroll the 100000 iterations, but the extension can divide it into N unrolled “chunks”. The tradeoff is compiled kernel size and compilation time vs a potential speedup of avoiding conditional statements. GPUs like unrolled kernels, CPUs are optimized for conditionals and sequential loops.

cl_nv_pragma_unroll documentation


#pragma OPENCL EXTENSION cl_nv_pragma_unroll : enable // not sure if this line is necessary or not, I'm using ATI :P

__kernel void
add( __global float *a,
     __global float *b,
     __global float *answer)
{
        int gid = get_global_id(0);
        float valA = a[gid];
        float valB = b[gid];
        float resultValA;
        float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
        float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations

   resultValA = valA + valB;
#pragma unroll 100 //smaller values = shorter compile time and less kernel memory, larger values = longer compile time, more memory, but often much faster
   for (int i = 1; i < 100000; i++) {
            resultValA *= multiplier;
            resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
   }

        answer[gid] = resultValA;
}

Hey

thanks for the tip.

Kernel 4:

__kernel void
add( __global float *a,
     __global float *b,
     __global float *answer)
{
        int gid = get_global_id(0);
        float valA = a[gid];
        float valB = b[gid];
        float resultValA;
        float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
        float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations

        resultValA = valA + valB;
        #pragma unroll 100 //smaller values = shorter compile time and less kernel memory, larger values = longer compile time, more memory, but often much\
 faster
        for (int i = 1; i < 100000; i++) {
            resultValA *= multiplier;
            resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
        }

        answer[gid] = resultValA;
}

It’s output:

n = 200000

GPU…
Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3…

seconds = 7

CPU…
Connecting to Intel Intel® Core™2 Duo CPU P8600 @ 2.40GHz,
max_compute_units: 2
max_work_groub_size: 1
max_work_item_dimensions: 3…

seconds = 60