Well, i’m trying to do the best i can to increase BW on matrix multiplication but my efforts aren’t coming to where i want… in a Tesla C1060 the operation of multiplying two matrix of 2048x2048 is done in approx 0.136…seg. Today i was looking the OpenCLProfiler and in the box of Occupancy figures a 0… The code (shortest) is under, i was playing with clEuqneueMap and clEnqueueUnmap last time i saw de code. Any idea to improve performace? Thanks!
#include "CL/cl.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define m 2048
#define n m
void chkError(cl_int errCode,const char* file,cl_int line){
if(errCode != CL_SUCCESS){
printf("Error %i in file %s near line %i.
",errCode,file,line-1);
exit(0);
}
}
int main(int argc,char*argv[]){
unsigned int szMem = m*n*sizeof(float);
cl_platform_id clPlatform;
cl_uint numPlatforms;
cl_int errCode;
errCode = clGetPlatformIDs(0,NULL,&numPlatforms);
chkError(errCode,__FILE__,__LINE__);
errCode = clGetPlatformIDs(numPlatforms,&clPlatform,NULL);
chkError(errCode,__FILE__,__LINE__);
size_t szParam;
errCode = clGetPlatformInfo(clPlatform,CL_PLATFORM_PROFILE,0,NULL,&szParam);
chkError(errCode,__FILE__,__LINE__);
char* param = (char*) malloc (szParam);
errCode = clGetPlatformInfo(clPlatform,CL_PLATFORM_PROFILE,szParam,param,NULL);
chkError(errCode,__FILE__,__LINE__);
cl_device_id clDevices;
cl_uint numDevices;
errCode = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,0,NULL,&numDevices);
chkError(errCode,__FILE__,__LINE__);
errCode = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,numDevices,&clDevices,NULL);
chkError(errCode,__FILE__,__LINE__);
//---------------------------------------------------------------------------------
cl_uint maxComputeUnits;
errCode = clGetDeviceInfo(clDevices,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&maxComputeUnits,NULL);
chkError(errCode,__FILE__,__LINE__);
//---------------------------------------------------------------------------------
cl_context clContext;
clContext = clCreateContext(NULL,numDevices,&clDevices,NULL,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
cl_command_queue clCommandQueue;
clCommandQueue = clCreateCommandQueue(clContext,clDevices,CL_QUEUE_PROFILING_ENABLE,&errCode);
chkError(errCode,__FILE__,__LINE__);
FILE *fp;
fp = fopen("clNew.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.
");
exit(1);
}
fseek(fp,0,SEEK_END);
const size_t kernelLength = ftell(fp);
rewind(fp);
char *clNew = (char *) malloc (kernelLength);
fread(clNew,1,kernelLength,fp);
fclose(fp);
cl_program clProgram;
clProgram = clCreateProgramWithSource(clContext,1,(const char**)&clNew,&kernelLength,&errCode);
chkError(errCode,__FILE__,__LINE__);
errCode = clBuildProgram(clProgram,0,NULL,NULL,NULL,NULL);
chkError(errCode,__FILE__,__LINE__);
cl_kernel clKernel;
const char* kernelName = "matMult";
clKernel = clCreateKernel(clProgram,kernelName,&errCode);
chkError(errCode,__FILE__,__LINE__);
cl_mem clDevA,clDevB,clDevC;
clDevA = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
clDevB = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
clDevC = clCreateBuffer(clContext,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
float *A = (float*) malloc (szMem);
float *B = (float*) malloc (szMem);
float *C = (float*) malloc (szMem);
A = (float *)clEnqueueMapBuffer(clCommandQueue,clDevA,CL_TRUE,CL_MAP_WRITE,0,sizeof(clDevA),0,NULL,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
B = (float *)clEnqueueMapBuffer(clCommandQueue,clDevB,CL_TRUE,CL_MAP_WRITE,0,sizeof(clDevB),0,NULL,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
C = (float *)clEnqueueMapBuffer(clCommandQueue,clDevC,CL_TRUE,CL_MAP_READ,0,sizeof(clDevC),0,NULL,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevA,A,0,NULL,NULL);
chkError(errCode,__FILE__,__LINE__);
errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevB,B,0,NULL,NULL);
chkError(errCode,__FILE__,__LINE__);
int row = n, col = m;
errCode = clSetKernelArg(clKernel,0,sizeof(clDevC),(const void*)&clDevC);
chkError(errCode,__FILE__,__LINE__);
errCode = clSetKernelArg(clKernel,1,sizeof(clDevA),(const void*)&clDevA);
chkError(errCode,__FILE__,__LINE__);
errCode = clSetKernelArg(clKernel,2,sizeof(clDevB),(const void*)&clDevB);
chkError(errCode,__FILE__,__LINE__);
errCode = clSetKernelArg(clKernel,3,sizeof(cl_int),(const void*)&col);
chkError(errCode,__FILE__,__LINE__);
errCode = clSetKernelArg(clKernel,4,256*sizeof(cl_float),NULL);
chkError(errCode,__FILE__,__LINE__);
errCode = clSetKernelArg(clKernel,5,256*sizeof(cl_float),NULL);
chkError(errCode,__FILE__,__LINE__);
const size_t clLocalWorkSize[2] = {16,16}, clGlobalWorkSize[2] = {m,n};
cl_event clEvent;
errCode = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,0,clGlobalWorkSize,clLocalWorkSize,0,NULL,&clEvent);
chkError(errCode,__FILE__,__LINE__);
long long start,end;
cl_int status;
status = clWaitForEvents(1,&clEvent);
chkError(errCode,__FILE__,__LINE__);
status = clGetEventProfilingInfo(clEvent,CL_PROFILING_COMMAND_START,
sizeof(long long),&start,NULL);
chkError(errCode,__FILE__,__LINE__);
status = clGetEventProfilingInfo(clEvent,CL_PROFILING_COMMAND_END,
sizeof(long long),&end,NULL);
chkError(errCode,__FILE__,__LINE__);
cl_double total = (cl_double)(end - start) / 1e9;
printf("Profiling: Total kernel time was %f secs.
", total);
C = (float*)clEnqueueMapBuffer(clCommandQueue,clDevC,CL_FALSE,CL_MAP_READ,0,sizeof(clDevC),0,NULL,NULL,&errCode);
chkError(errCode,__FILE__,__LINE__);
errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevC,C,0,NULL,NULL);
chkError(errCode,__FILE__,__LINE__);
clReleaseMemObject(clDevA);
clReleaseMemObject(clDevB);
clReleaseMemObject(clDevC);
clReleaseEvent(clEvent);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQueue);
clReleaseContext(clContext);
return 0;
}
#define bSize 16
__kernel void
matMult(__global float* C,
__global float* A,
__global float* B,
int N,
__local float Asub [bSize][bSize],
__local float Bsub [bSize][bSize])
{
int gidx = get_group_id(0);
int gidy = get_group_id(1);
float Csub = 0;
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int aBegin = gidy*N*bSize;
int aStep = bSize;
int aEnd = aBegin+N-1;
int bBegin = gidx*bSize;
int bStep = N*bSize;
int base = lidy*N+lidx;;
for (int i = aBegin,j = bBegin; i < aEnd; i+=2*aStep,j+=2*bStep) {
/*
Asub[lidy][lidx] = A[i+base];
Bsub[lidy][lidx] = B[j+base];
barrier(CLK_LOCAL_MEM_FENCE);
//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
Csub += Asub[lidy][0]*Bsub[0][lidx];
Csub += Asub[lidy][1]*Bsub[1][lidx];
Csub += Asub[lidy][2]*Bsub[2][lidx];
Csub += Asub[lidy][3]*Bsub[3][lidx];
Csub += Asub[lidy][4]*Bsub[4][lidx];
Csub += Asub[lidy][5]*Bsub[5][lidx];
Csub += Asub[lidy][6]*Bsub[6][lidx];
Csub += Asub[lidy][7]*Bsub[7][lidx];
Csub += Asub[lidy][8]*Bsub[8][lidx];
Csub += Asub[lidy][9]*Bsub[9][lidx];
Csub += Asub[lidy][10]*Bsub[10][lidx];
Csub += Asub[lidy][11]*Bsub[11][lidx];
Csub += Asub[lidy][12]*Bsub[12][lidx];
Csub += Asub[lidy][13]*Bsub[13][lidx];
Csub += Asub[lidy][14]*Bsub[14][lidx];
Csub += Asub[lidy][15]*Bsub[15][lidx];
barrier(CLK_LOCAL_MEM_FENCE);*/
Asub[lidy][lidx] = A[i+base];
Bsub[lidy][lidx] = B[j+base];
barrier(CLK_LOCAL_MEM_FENCE);
//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
Csub += Asub[lidy][0]*Bsub[0][lidx];
Csub += Asub[lidy][1]*Bsub[1][lidx];
Csub += Asub[lidy][2]*Bsub[2][lidx];
Csub += Asub[lidy][3]*Bsub[3][lidx];
Csub += Asub[lidy][4]*Bsub[4][lidx];
Csub += Asub[lidy][5]*Bsub[5][lidx];
Csub += Asub[lidy][6]*Bsub[6][lidx];
Csub += Asub[lidy][7]*Bsub[7][lidx];
Csub += Asub[lidy][8]*Bsub[8][lidx];
Csub += Asub[lidy][9]*Bsub[9][lidx];
Csub += Asub[lidy][10]*Bsub[10][lidx];
Csub += Asub[lidy][11]*Bsub[11][lidx];
Csub += Asub[lidy][12]*Bsub[12][lidx];
Csub += Asub[lidy][13]*Bsub[13][lidx];
Csub += Asub[lidy][14]*Bsub[14][lidx];
Csub += Asub[lidy][15]*Bsub[15][lidx];
barrier(CLK_LOCAL_MEM_FENCE);
Asub[lidy][lidx] = A[i+aStep+base];
Bsub[lidy][lidx] = B[j+bStep+base];
barrier(CLK_LOCAL_MEM_FENCE);
//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
Csub += Asub[lidy][0]*Bsub[0][lidx];
Csub += Asub[lidy][1]*Bsub[1][lidx];
Csub += Asub[lidy][2]*Bsub[2][lidx];
Csub += Asub[lidy][3]*Bsub[3][lidx];
Csub += Asub[lidy][4]*Bsub[4][lidx];
Csub += Asub[lidy][5]*Bsub[5][lidx];
Csub += Asub[lidy][6]*Bsub[6][lidx];
Csub += Asub[lidy][7]*Bsub[7][lidx];
Csub += Asub[lidy][8]*Bsub[8][lidx];
Csub += Asub[lidy][9]*Bsub[9][lidx];
Csub += Asub[lidy][10]*Bsub[10][lidx];
Csub += Asub[lidy][11]*Bsub[11][lidx];
Csub += Asub[lidy][12]*Bsub[12][lidx];
Csub += Asub[lidy][13]*Bsub[13][lidx];
Csub += Asub[lidy][14]*Bsub[14][lidx];
Csub += Asub[lidy][15]*Bsub[15][lidx];
barrier(CLK_LOCAL_MEM_FENCE);
}
C[aBegin+bBegin+base] = Csub;
}