Poor bandwidth on matrix multiplication with local memory?

Well, I’m trying my best to increase the bandwidth of my matrix multiplication, but my efforts aren’t getting me where I want. On a Tesla C1060, multiplying two 2048x2048 matrices takes approximately 0.136 s. Today I was looking at the OpenCL Profiler and the Occupancy box shows a 0. The (shortened) code is below; the last thing I was playing with was clEnqueueMapBuffer and clEnqueueUnmapMemObject. Any ideas to improve performance? Thanks!


#include "CL/cl.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define m 2048
#define n m

void chkError(cl_int errCode,const char* file,cl_int line){
	if(errCode != CL_SUCCESS){
		printf("Error %i in file %s near line %i.
",errCode,file,line-1);
		exit(0);
	}
}

int main(int argc,char*argv[]){
	unsigned int szMem = m*n*sizeof(float);

	cl_platform_id clPlatform;
	cl_uint numPlatforms;
	cl_int errCode;
	errCode = clGetPlatformIDs(0,NULL,&numPlatforms);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clGetPlatformIDs(numPlatforms,&clPlatform,NULL);
	chkError(errCode,__FILE__,__LINE__);
	
	size_t szParam;
	errCode = clGetPlatformInfo(clPlatform,CL_PLATFORM_PROFILE,0,NULL,&szParam);
	chkError(errCode,__FILE__,__LINE__);
	char* param = (char*) malloc (szParam);
	errCode = clGetPlatformInfo(clPlatform,CL_PLATFORM_PROFILE,szParam,param,NULL);
	chkError(errCode,__FILE__,__LINE__);

	cl_device_id clDevices;
	cl_uint numDevices;
	errCode = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,0,NULL,&numDevices);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,numDevices,&clDevices,NULL);
	chkError(errCode,__FILE__,__LINE__);

	//---------------------------------------------------------------------------------
	cl_uint maxComputeUnits;
	errCode = clGetDeviceInfo(clDevices,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&maxComputeUnits,NULL);
	chkError(errCode,__FILE__,__LINE__);
	//---------------------------------------------------------------------------------

	cl_context clContext;
	clContext = clCreateContext(NULL,numDevices,&clDevices,NULL,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);

	cl_command_queue clCommandQueue;
	clCommandQueue = clCreateCommandQueue(clContext,clDevices,CL_QUEUE_PROFILING_ENABLE,&errCode);
	chkError(errCode,__FILE__,__LINE__);

  FILE *fp;
  fp = fopen("clNew.cl", "r");
  if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
  }
  fseek(fp,0,SEEK_END);
  const size_t kernelLength = ftell(fp);
  rewind(fp);
  char *clNew = (char *) malloc (kernelLength);
  fread(clNew,1,kernelLength,fp);
  fclose(fp);

	cl_program clProgram;
	clProgram = clCreateProgramWithSource(clContext,1,(const char**)&clNew,&kernelLength,&errCode);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clBuildProgram(clProgram,0,NULL,NULL,NULL,NULL);
	chkError(errCode,__FILE__,__LINE__);
	
	cl_kernel clKernel; 
	const char* kernelName = "matMult";
	clKernel = clCreateKernel(clProgram,kernelName,&errCode);
	chkError(errCode,__FILE__,__LINE__);

	cl_mem clDevA,clDevB,clDevC;
	clDevA = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);
	clDevB = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);
	clDevC = clCreateBuffer(clContext,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);

	float *A = (float*) malloc (szMem);
	float *B = (float*) malloc (szMem);
	float *C = (float*) malloc (szMem);

	// Map the full buffers (szMem bytes, not sizeof(cl_mem)) to get host pointers.
	A = (float *)clEnqueueMapBuffer(clCommandQueue,clDevA,CL_TRUE,CL_MAP_WRITE,0,szMem,0,NULL,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);
	B = (float *)clEnqueueMapBuffer(clCommandQueue,clDevB,CL_TRUE,CL_MAP_WRITE,0,szMem,0,NULL,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);
	C = (float *)clEnqueueMapBuffer(clCommandQueue,clDevC,CL_TRUE,CL_MAP_READ,0,szMem,0,NULL,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);

	errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevA,A,0,NULL,NULL);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevB,B,0,NULL,NULL);
	chkError(errCode,__FILE__,__LINE__);

	int row = n, col = m;
	errCode = clSetKernelArg(clKernel,0,sizeof(clDevC),(const void*)&clDevC);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clSetKernelArg(clKernel,1,sizeof(clDevA),(const void*)&clDevA);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clSetKernelArg(clKernel,2,sizeof(clDevB),(const void*)&clDevB);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clSetKernelArg(clKernel,3,sizeof(cl_int),(const void*)&col);
	chkError(errCode,__FILE__,__LINE__);
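	// Arguments 4 and 5 are the __local tile buffers: passing NULL with a size
	// allocates 256 floats (a 16x16 tile) of local memory per work-group.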
	errCode = clSetKernelArg(clKernel,4,256*sizeof(cl_float),NULL);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clSetKernelArg(clKernel,5,256*sizeof(cl_float),NULL);
	chkError(errCode,__FILE__,__LINE__);

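	// 2048x2048 work-items in 16x16 work-groups: one work-item per element of C.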
	const size_t clLocalWorkSize[2] = {16,16}, clGlobalWorkSize[2] = {m,n};
	cl_event clEvent;
	errCode = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,0,clGlobalWorkSize,clLocalWorkSize,0,NULL,&clEvent);
	chkError(errCode,__FILE__,__LINE__);
	
	cl_ulong start,end;
	errCode = clWaitForEvents(1,&clEvent);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clGetEventProfilingInfo(clEvent,CL_PROFILING_COMMAND_START,
	                                  sizeof(cl_ulong),&start,NULL);
	chkError(errCode,__FILE__,__LINE__);
	errCode = clGetEventProfilingInfo(clEvent,CL_PROFILING_COMMAND_END,
	                                  sizeof(cl_ulong),&end,NULL);
	chkError(errCode,__FILE__,__LINE__);
	cl_double total = (cl_double)(end - start) / 1e9;
	printf("Profiling: Total kernel time was %f secs.\n", total);

	C = (float*)clEnqueueMapBuffer(clCommandQueue,clDevC,CL_TRUE,CL_MAP_READ,0,szMem,0,NULL,NULL,&errCode);
	chkError(errCode,__FILE__,__LINE__);
	
	errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevC,C,0,NULL,NULL);
	chkError(errCode,__FILE__,__LINE__);
	
	clReleaseMemObject(clDevA);
	clReleaseMemObject(clDevB);
	clReleaseMemObject(clDevC);
	clReleaseEvent(clEvent);
	clReleaseKernel(clKernel);
	clReleaseProgram(clProgram);
	clReleaseCommandQueue(clCommandQueue);
	clReleaseContext(clContext);
	return 0;
}


#define bSize 16

__kernel void
matMult(__global float* C, 
				__global float* A, 
				__global float* B,
				int N, 
				__local float Asub [bSize][bSize],
				__local float Bsub [bSize][bSize])
{
	int gidx = get_group_id(0);
	int gidy = get_group_id(1);
	float Csub = 0;
	int lidx = get_local_id(0);
	int lidy = get_local_id(1);
	int aBegin = gidy*N*bSize;
	int aStep = bSize;
	int aEnd = aBegin+N-1;
	int bBegin = gidx*bSize;
	int bStep = N*bSize;
	int base = lidy*N+lidx;
	for (int i = aBegin,j = bBegin; i < aEnd; i+=2*aStep,j+=2*bStep) {
		Asub[lidy][lidx] = A[i+base];
		Bsub[lidy][lidx] = B[j+base];
		barrier(CLK_LOCAL_MEM_FENCE);
		//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
		Csub += Asub[lidy][0]*Bsub[0][lidx];
		Csub += Asub[lidy][1]*Bsub[1][lidx];
		Csub += Asub[lidy][2]*Bsub[2][lidx];
		Csub += Asub[lidy][3]*Bsub[3][lidx];
		Csub += Asub[lidy][4]*Bsub[4][lidx];
		Csub += Asub[lidy][5]*Bsub[5][lidx];
		Csub += Asub[lidy][6]*Bsub[6][lidx];
		Csub += Asub[lidy][7]*Bsub[7][lidx];
		Csub += Asub[lidy][8]*Bsub[8][lidx];
		Csub += Asub[lidy][9]*Bsub[9][lidx];
		Csub += Asub[lidy][10]*Bsub[10][lidx];
		Csub += Asub[lidy][11]*Bsub[11][lidx];
		Csub += Asub[lidy][12]*Bsub[12][lidx];
		Csub += Asub[lidy][13]*Bsub[13][lidx];
		Csub += Asub[lidy][14]*Bsub[14][lidx];
		Csub += Asub[lidy][15]*Bsub[15][lidx];
		barrier(CLK_LOCAL_MEM_FENCE);
		Asub[lidy][lidx] = A[i+aStep+base];
		Bsub[lidy][lidx] = B[j+bStep+base];
		barrier(CLK_LOCAL_MEM_FENCE);
		//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
		Csub += Asub[lidy][0]*Bsub[0][lidx];
		Csub += Asub[lidy][1]*Bsub[1][lidx];
		Csub += Asub[lidy][2]*Bsub[2][lidx];
		Csub += Asub[lidy][3]*Bsub[3][lidx];
		Csub += Asub[lidy][4]*Bsub[4][lidx];
		Csub += Asub[lidy][5]*Bsub[5][lidx];
		Csub += Asub[lidy][6]*Bsub[6][lidx];
		Csub += Asub[lidy][7]*Bsub[7][lidx];
		Csub += Asub[lidy][8]*Bsub[8][lidx];
		Csub += Asub[lidy][9]*Bsub[9][lidx];
		Csub += Asub[lidy][10]*Bsub[10][lidx];
		Csub += Asub[lidy][11]*Bsub[11][lidx];
		Csub += Asub[lidy][12]*Bsub[12][lidx];
		Csub += Asub[lidy][13]*Bsub[13][lidx];
		Csub += Asub[lidy][14]*Bsub[14][lidx];
		Csub += Asub[lidy][15]*Bsub[15][lidx];
		barrier(CLK_LOCAL_MEM_FENCE);
	}
	C[aBegin+bBegin+base] = Csub;
}

Have you taken a look at some of the papers published on getting maximum matrix multiplication performance on Nvidia hardware? They use very specific tricks to get the best results. I’ve heard that Nvidia’s OpenCL currently has some serious performance bugs (like 2x worse than CUDA), so you may have a hard time reaching their level of performance, though.

I’m reading the papers, but I still have no idea why the performance is so poor… I’m trying to implement another method, but for the moment I’m confused…

Two comments after looking at your code:

  1. I’m not sure why you’re using map instead of just writing to the buffer with your data or creating the buffer with COPY_HOST_PTR; see the sketch after this list. (It looks like you’re mapping and unmapping without actually changing the data.)
  2. The first time you execute a kernel you may incur the delayed cost of compiling the kernel, plus the cost of transferring the data. You should execute the kernel once and then average your time over at least 10-100 runs of the kernel to avoid skewed results due to system/device allocation and transfer overhead.
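For the first point, here is a minimal sketch of what I mean, assuming the same clContext, clCommandQueue, szMem, and host arrays A, B, C from your code (with A and B already filled with data); the devA/devB/devC names are just for illustration:


cl_int err;
// Copy the host data into the device buffers at creation time.
cl_mem devA = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, szMem, A, &err);
cl_mem devB = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, szMem, B, &err);
cl_mem devC = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, szMem, NULL, &err);

// ... set the kernel arguments and enqueue the kernel as before ...

// Blocking read of the result once the kernel has finished.
err = clEnqueueReadBuffer(clCommandQueue, devC, CL_TRUE, 0, szMem, C, 0, NULL, NULL);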

Thanks for your reply. The maps were only there to play with and learn how they work; in my original code I use CL_MEM_COPY_HOST_PTR with a valid pointer that holds the data to pass to the device. Regarding your second point, you suggest that I should run the kernel multiple times and then average the results, right? That seems logical; so far I have been running the program manually several times and taking the final execution time from that. As for the code, the writes to global memory are expensive, and the inner for loop has been unrolled, which improves performance, but I’m still far from what I would consider GOOD performance. When I run the OpenCL Profiler, Occupancy shows a 0, and I have no idea why! There are no uncoalesced accesses or anything strange…
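To try to rule out a resource limit I suppose I could also check what the compiled kernel actually needs per work-group; a rough sketch using the clKernel and clDevices handles from the code above:


size_t kernelWGSize;
cl_ulong kernelLocalMem;
// Maximum work-group size the device can launch this kernel with.
clGetKernelWorkGroupInfo(clKernel, clDevices, CL_KERNEL_WORK_GROUP_SIZE,
                         sizeof(size_t), &kernelWGSize, NULL);
// Local memory the kernel uses (including the two 16x16 tiles).
clGetKernelWorkGroupInfo(clKernel, clDevices, CL_KERNEL_LOCAL_MEM_SIZE,
                         sizeof(cl_ulong), &kernelLocalMem, NULL);
printf("kernel max work-group size: %lu, local mem: %lu bytes\n",
       (unsigned long)kernelWGSize, (unsigned long)kernelLocalMem);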

Running the program manually many times is not going to have the same effect, since the data will still have to be transferred and all the setup and initialization will still happen on every run.

If you want to time the performance of a kernel you should do something like:


buildProgram
writeData
enqueueKernel
finish

start = get_time
for (i=0; i<20; i++
  enqueueKernel
finish
stop = get_time

total_time = (stop-start)/20

If you don’t do this you’ll be measuring a huge amount of overhead on top of the kernel execution, which can skew the analysis.

If you want to include the data transfer time then you should explicitly include it in the loop as well.
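As a rough sketch of that pattern, reusing the handles and work sizes from your code above (the 20 iterations are arbitrary):


// Warm-up run: absorbs lazy compilation and first-use allocation overhead.
clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL,
                       clGlobalWorkSize, clLocalWorkSize, 0, NULL, NULL);
clFinish(clCommandQueue);

cl_ulong t0, t1, sum = 0;
for (int i = 0; i < 20; i++) {
    cl_event evt;
    clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL,
                           clGlobalWorkSize, clLocalWorkSize, 0, NULL, &evt);
    clWaitForEvents(1, &evt);
    // Event timestamps cover only the kernel execution, in nanoseconds.
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);
    sum += t1 - t0;
    clReleaseEvent(evt);
}
printf("average kernel time: %f s\n", (double)sum / 20 / 1e9);


If you put the clEnqueueWriteBuffer/clEnqueueReadBuffer calls inside the loop to count the transfers, time the whole loop with a host-side clock instead, since the event timestamps only cover the kernel.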