# Fast Fourier transform with OpenCL (GPU)

I decided to develop a parallel FFT program for the GPU using OpenCL, but the resulting parallel program is slower than the CPU program. Can you help me find out why this happens?

``````#include <CL\opencl.h>
#include <iostream>
#include <windows.h>
#include <time.h>

//***Serial FFT ****

#define M_PIx2      6.283185307179586476925286766559   // 2*pi, used for the FFT twiddle factors

// Minimal single-precision complex number used by the serial FFT below.
struct Complex {

    Complex(float r, float i) : re(r), im(i) {}
    Complex() : re(0.0f), im(0.0f) {}

    // The arithmetic operators take const references and are const member
    // functions so they also work with const operands and temporaries
    // (the original non-const `Complex&` versions did not).
    Complex operator+(const Complex &other) const {
        return Complex(re + other.re, im + other.im);
    }
    Complex operator-(const Complex &other) const {
        return Complex(re - other.re, im - other.im);
    }
    // (a+bi)(c+di) = (ac - bd) + (ad + bc)i
    Complex operator*(const Complex &other) const {
        return Complex(re * other.re - im * other.im,
                       re * other.im + other.re * im);
    }

    float getRe() const { return re; }
    float getIm() const { return im; }

private:
    float re, im;   // real and imaginary parts
};

void FFT(const double *cpfData, Complex *pSpectra, DWORD dwDataSize,
DWORD n, DWORD dwBits)
{
// Form initial array
memset(pSpectra, 0, n * sizeof(Complex));
for(DWORD k = 0; k < dwDataSize; k++)
{
pSpectra[k] = Complex(1.0f, 0.0f); //pSpectra[BitRewers32(k, (BYTE)dwBits)] = cpfData[k];
}

// Now run!
DWORD s, m, j, m2;
Complex W = Complex(0, 0), Wm = Complex(0, 0), T = Complex(0, 0), U = Complex(0, 0);	//Complex W = Complex(0, 0 ), Wm, T, U;
for(s = 1; s <= dwBits; s++)
{
m = 1 << s;
m2 = m >> 1;
Wm = Complex((float)cos(M_PIx2 / m), (float)sin(M_PIx2 / m));  //Wm = std::exp(Complex(0.0, M_PIx2 / m));
W = Complex(1.0, 0); //W = 1.0;
for(j = 0; j < m2; j++)
{
for(DWORD k = j; k < n; k += m)
{
T = W * pSpectra[k + m2];
U = pSpectra[k];
pSpectra[k] = U + T;
pSpectra[k + m2] = U - T;
}
W = W * Wm; //W *= Wm;
}
}
}
//*****************

const int N = 16384;             // FFT size (number of complex samples); power of two
int dwBits = 14;                 // log2(N): number of FFT butterfly stages
const int COUNT_REPEAT_MAX = 10; // timing repetitions for GPU and CPU runs

// Round Up Division function
// Rounds global_size up to the nearest multiple of group_size, as required
// when choosing an OpenCL global work size for a fixed local work size.
size_t roundUpSize(int group_size, int global_size)
{
    const int remainder = global_size % group_size;
    if (remainder != 0)
    {
        global_size += group_size - remainder;
    }
    return global_size;
}

// Host program: times a naive OpenCL DFT kernel on the GPU against the serial
// CPU FFT above. NOTE(review): the GPU timing below also includes the blocking
// device-to-host reads, so it is not pure compute time.
int main()
{
cl_int error;

cl_float2 hostSpectr[N]; // input signal (all ones)
cl_float2 hostResult[N]; // kernel output read back from the device
int ids[N];              // debug buffer: global id written by each work-item

// Fill the input with the constant signal 1 + 0i (matches the hard-coded
// input of the serial FFT).
for (int i = 0; i <= N-1; i++)
{
hostSpectr[i].s[0] = 1;
hostSpectr[i].s[1] = 0;
}

//---Platform (return codes of the platform/device/context calls are unchecked)
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);

//---Device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

//---Context
cl_context context;
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

//---Source of GPU program
// NOTE(review): this kernel is an O(n^2) naive DFT (every work-item loops over
// all n inputs), not an FFT. It also computes mul_result but never accumulates
// or stores it into outResult, so outResult keeps the zeros written before the
// loop -- the transform result is discarded.
const char *source =
"#define ADD_COMPLEX_RE(CNUM1, CNUM2) ((CNUM1.x) + (CNUM2.x))
"
"#define ADD_COMPLEX_IMG(CNUM1, CNUM2) ((CNUM1.y) + (CNUM2.y))
"
"#define MUL_COMPLEX_RE(CNUM1, CNUM2) ((CNUM1.x)*(CNUM2.x) - (CNUM1.y)*(CNUM2.y))
"
"#define MUL_COMPLEX_IMG(CNUM1, CNUM2) ((CNUM1.x)*(CNUM2.y) + (CNUM1.y)*(CNUM2.x))
"
"__kernel void fft_gpu(	__global const float2 *inSpectr,
"
"								__global float2 *outResult,
"
"								int n,
"
"								__global int *ids)
"
"{
"
"	int id = get_global_id(0);
"
"	int i;
"
"	float2 w;
"
"	if (id < n)
"
"	{
"
"		ids[id] = id;
"
"		outResult[id].x = 0;
"
"		outResult[id].y = 0;
"
"		for (i = 0; i<= n-1; i++)
"
"		{
"
"			w.x = cos(2* M_PI * ((i * id) % n) / n);
"
"			w.y = sin(2* M_PI * ((i * id) % n) / n);
"
"			float2 mul_result;
"
"			mul_result.x = MUL_COMPLEX_RE(w,inSpectr[i]);
"
"			mul_result.y = MUL_COMPLEX_IMG(w,inSpectr[i]);
"
"		}
"
"	}
"
"}
";

//---Program
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, &error);
if (error != CL_SUCCESS)
{
std::cout << "Creating program error!
";
}

//---Building program
// NOTE(review): on failure the build log (clGetProgramBuildInfo with
// CL_PROGRAM_BUILD_LOG) would show the compiler diagnostics.
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Building error!
";
}

//---Kernel
// errcode_ret is NULL here, so a kernel-creation failure would go unnoticed.
cl_kernel kernel;
kernel = clCreateKernel(program, "fft_gpu", NULL);

//---Buffers (errcode_ret is NULL for all three creations as well)
cl_mem deviceBufSpectr;
cl_mem deviceBufResult;
cl_mem deviceBufIds;
deviceBufSpectr = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float2) * N, NULL, NULL);
deviceBufResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float2) * N, NULL, NULL);
deviceBufIds = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * N, NULL, NULL);

//---Command queue
// Third argument is cl_command_queue_properties (NULL means 0 / no profiling);
// fourth is errcode_ret, also NULL, so any error is dropped.
cl_command_queue cmd_queue;
cmd_queue = clCreateCommandQueue(context, device, NULL, NULL);

//---Data transfer from Host to Device
// CL_FALSE makes this a non-blocking write: hostSpectr must stay valid until
// the transfer completes (it does -- it is a local that outlives the queue use).
error = clEnqueueWriteBuffer(cmd_queue, deviceBufSpectr, CL_FALSE, 0, sizeof(cl_float2) * N, hostSpectr, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Host to Device error!
";
}

//---Global and local work size
// NOTE(review): a fixed local size of 512 can exceed the device's maximum
// work-group size on some GPUs -- worth checking CL_DEVICE_MAX_WORK_GROUP_SIZE.
size_t localWorkSize = 512;
size_t globalWorkSize = roundUpSize(localWorkSize, N);

//---FFT start (GPU)

// clock_t is the natural type for clock(); the measured interval below
// includes the two blocking clEnqueueReadBuffer calls.
time_t timeWorkGPU = clock();

//---Setting kernel arguments
// Argument 2 passes &N (a const int); sizeof(unsigned int) matches sizeof(int).
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&deviceBufSpectr);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&deviceBufResult);
clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&N);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&deviceBufIds);

for (int countRepeat = 0; countRepeat < COUNT_REPEAT_MAX; countRepeat++)
{

//---Kernel invocation (enqueue only; no wait between iterations)
error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Kernel invocation error!
";
}

}

//---Data transfer from Device to Host
// CL_TRUE = blocking read: this also waits for all queued kernels to finish,
// so PCI-E transfer time lands inside the GPU measurement.
error = clEnqueueReadBuffer(cmd_queue, deviceBufResult, CL_TRUE, 0, sizeof(cl_float2) * N, hostResult, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Device to Host error!
";
}

error = clEnqueueReadBuffer(cmd_queue, deviceBufIds, CL_TRUE, 0, sizeof(int) * N, ids, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Device to Host error!
";
}

timeWorkGPU = clock() - timeWorkGPU;
std::cout << "GPU time: " << (double)timeWorkGPU / CLOCKS_PER_SEC<<"
";

/*for (int i = 0; i <= N-1; i++)
{
std::cout << hostResult[i].s[0] << "	" << hostResult[i].s[1] << "
";
}*/

//*** DSP.cpp start
// cpfData holds only 2 elements while N samples are requested; FFT() ignores
// the data pointer (it hard-codes ones), so no overrun actually occurs.
double cpfData[2] = {1.0, 1.0};
Complex pSpectra[N];

time_t timeWorkCPU = clock();

for (int countRepeat = 0; countRepeat < COUNT_REPEAT_MAX; countRepeat++)
{

FFT(cpfData, pSpectra, N, N, dwBits); //Serial FFT

}

timeWorkCPU = clock() - timeWorkCPU;
std::cout << "CPU time: " << (double)timeWorkCPU / CLOCKS_PER_SEC<<"
";

/*std::cout<< "--- Result (CPU): ---" << std::endl;
for (int i = 0; i <= N - 1; i++)
{
std::cout << "[" << i <<"]: " <<  pSpectra[i].getRe() << ", " << pSpectra[i].getIm() << "
";
}*/

//*** DSP.cpp end

system("PAUSE");
return 0;

}

``````

You are including the PCI-E transfer in the compute time, so this is pretty much expected. Add clFinish after clEnqueueWriteBuffer and replace the clEnqueueReadBuffer calls with clFinish. This will give you the actual computation time (though using the actual OpenCL profiling capabilities would be more accurate).

Thank you.
According to your advice I changed my code and got the following results:
GPU time: 5.365
CPU time: 0.615
The GPU is still slower than the CPU.

``````
#include <CL\opencl.h>
#include <iostream>
#include <windows.h>
#include <time.h>

//***Serial FFT ****

#define M_PIx2      6.283185307179586476925286766559   // 2*pi, used for the FFT twiddle factors

// Minimal single-precision complex number used by the serial FFT below.
struct Complex {

    Complex(float r, float i) : re(r), im(i) {}
    Complex() : re(0.0f), im(0.0f) {}

    // The arithmetic operators take const references and are const member
    // functions so they also work with const operands and temporaries
    // (the original non-const `Complex&` versions did not).
    Complex operator+(const Complex &other) const {
        return Complex(re + other.re, im + other.im);
    }
    Complex operator-(const Complex &other) const {
        return Complex(re - other.re, im - other.im);
    }
    // (a+bi)(c+di) = (ac - bd) + (ad + bc)i
    Complex operator*(const Complex &other) const {
        return Complex(re * other.re - im * other.im,
                       re * other.im + other.re * im);
    }

    float getRe() const { return re; }
    float getIm() const { return im; }

private:
    float re, im;   // real and imaginary parts
};

void FFT(const double *cpfData, Complex *pSpectra, DWORD dwDataSize,
DWORD n, DWORD dwBits)
{
// Form initial array
memset(pSpectra, 0, n * sizeof(Complex));
for(DWORD k = 0; k < dwDataSize; k++)
{
pSpectra[k] = Complex(1.0f, 0.0f); //pSpectra[BitRewers32(k, (BYTE)dwBits)] = cpfData[k];
}

// Now run!
DWORD s, m, j, m2;
Complex W = Complex(0, 0), Wm = Complex(0, 0), T = Complex(0, 0), U = Complex(0, 0);	//Complex W = Complex(0, 0 ), Wm, T, U;
for(s = 1; s <= dwBits; s++)
{
m = 1 << s;
m2 = m >> 1;
Wm = Complex((float)cos(M_PIx2 / m), (float)sin(M_PIx2 / m));  //Wm = std::exp(Complex(0.0, M_PIx2 / m));
W = Complex(1.0, 0); //W = 1.0;
for(j = 0; j < m2; j++)
{
for(DWORD k = j; k < n; k += m)
{
T = W * pSpectra[k + m2];
U = pSpectra[k];
pSpectra[k] = U + T;
pSpectra[k + m2] = U - T;
}
W = W * Wm; //W *= Wm;
}
}
}
//*****************

const int N = 16384;             // FFT size (number of complex samples); power of two
int dwBits = 14;                 // log2(N): number of FFT butterfly stages
const int COUNT_REPEAT_MAX = 10; // timing repetitions for GPU and CPU runs

// Round Up Division function
// Rounds global_size up to the nearest multiple of group_size, as required
// when choosing an OpenCL global work size for a fixed local work size.
size_t roundUpSize(int group_size, int global_size)
{
    const int remainder = global_size % group_size;
    if (remainder != 0)
    {
        global_size += group_size - remainder;
    }
    return global_size;
}

// Host program (revised per the answer): times a naive OpenCL DFT kernel on
// the GPU against the serial CPU FFT above. Transfers are now excluded from
// the GPU timing via clFinish; the remaining gap comes from the kernel itself
// doing O(n^2) work (every work-item loops over all n inputs) while the CPU
// runs an O(n log n) FFT.
int main()
{
cl_int error;

cl_float2 hostSpectr[N]; // input signal (all ones)
cl_float2 hostResult[N]; // kernel output (reads are commented out below, so unused)
int ids[N];              // debug buffer (reads are commented out below, so unused)

// Fill the input with the constant signal 1 + 0i (matches the hard-coded
// input of the serial FFT).
for (int i = 0; i <= N-1; i++)
{
hostSpectr[i].s[0] = 1;
hostSpectr[i].s[1] = 0;
}

//---Platform (return codes of the platform/device/context calls are unchecked)
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);

//---Device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

//---Context
cl_context context;
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

//---Source of GPU program
// NOTE(review): this kernel is an O(n^2) naive DFT, not an FFT. It also
// computes mul_result but never accumulates or stores it into outResult, so
// outResult keeps the zeros written before the loop -- the result is discarded.
const char *source =
"#define ADD_COMPLEX_RE(CNUM1, CNUM2) ((CNUM1.x) + (CNUM2.x))
"
"#define ADD_COMPLEX_IMG(CNUM1, CNUM2) ((CNUM1.y) + (CNUM2.y))
"
"#define MUL_COMPLEX_RE(CNUM1, CNUM2) ((CNUM1.x)*(CNUM2.x) - (CNUM1.y)*(CNUM2.y))
"
"#define MUL_COMPLEX_IMG(CNUM1, CNUM2) ((CNUM1.x)*(CNUM2.y) + (CNUM1.y)*(CNUM2.x))
"
"__kernel void fft_gpu(	__global const float2 *inSpectr,
"
"								__global float2 *outResult,
"
"								int n,
"
"								__global int *ids)
"
"{
"
"	int id = get_global_id(0);
"
"	int i;
"
"	float2 w;
"
"	if (id < n)
"
"	{
"
"		ids[id] = id;
"
"		outResult[id].x = 0;
"
"		outResult[id].y = 0;
"
"		for (i = 0; i<= n-1; i++)
"
"		{
"
"			w.x = cos(2* M_PI * ((i * id) % n) / n);
"
"			w.y = sin(2* M_PI * ((i * id) % n) / n);
"
"			float2 mul_result;
"
"			mul_result.x = MUL_COMPLEX_RE(w,inSpectr[i]);
"
"			mul_result.y = MUL_COMPLEX_IMG(w,inSpectr[i]);
"
"		}
"
"	}
"
"}
";

//---Program
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, &error);
if (error != CL_SUCCESS)
{
std::cout << "Creating program error!
";
}

//---Building program
// NOTE(review): on failure the build log (clGetProgramBuildInfo with
// CL_PROGRAM_BUILD_LOG) would show the compiler diagnostics.
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Building error!
";
}

//---Kernel
// errcode_ret is NULL here, so a kernel-creation failure would go unnoticed.
cl_kernel kernel;
kernel = clCreateKernel(program, "fft_gpu", NULL);

//---Buffers (errcode_ret is NULL for all three creations as well)
cl_mem deviceBufSpectr;
cl_mem deviceBufResult;
cl_mem deviceBufIds;
deviceBufSpectr = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float2) * N, NULL, NULL);
deviceBufResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float2) * N, NULL, NULL);
deviceBufIds = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * N, NULL, NULL);

//---Command queue
// Third argument is cl_command_queue_properties (NULL means 0 / no profiling);
// fourth is errcode_ret, also NULL, so any error is dropped.
cl_command_queue cmd_queue;
cmd_queue = clCreateCommandQueue(context, device, NULL, NULL);

//---Data transfer from Host to Device
// CL_FALSE = non-blocking write; the clFinish below guarantees the transfer
// completes before the timed region starts.
error = clEnqueueWriteBuffer(cmd_queue, deviceBufSpectr, CL_FALSE, 0, sizeof(cl_float2) * N, hostSpectr, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Host to Device error!
";
}
clFinish(cmd_queue);

//---Global and local work size
// NOTE(review): a fixed local size of 512 can exceed the device's maximum
// work-group size on some GPUs -- worth checking CL_DEVICE_MAX_WORK_GROUP_SIZE.
size_t localWorkSize = 512;
size_t globalWorkSize = roundUpSize(localWorkSize, N);

//---FFT start (GPU)

// clock_t is the natural type for clock(); the timed region now covers only
// kernel setup, enqueues, and the clFinish wait (no PCI-E transfers).
time_t timeWorkGPU = clock();

//---Setting kernel arguments
// Argument 2 passes &N (a const int); sizeof(unsigned int) matches sizeof(int).
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&deviceBufSpectr);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&deviceBufResult);
clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&N);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&deviceBufIds);

for (int countRepeat = 0; countRepeat < COUNT_REPEAT_MAX; countRepeat++)
{

//---Kernel invocation (enqueue only; the clFinish after the loop waits once)
error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Kernel invocation error!
";
}

}

clFinish(cmd_queue);

// NOTE(review): with the reads commented out, nothing ever validates the
// kernel output against the CPU result.
/*//---Data transfer from Device to Host
error = clEnqueueReadBuffer(cmd_queue, deviceBufResult, CL_TRUE, 0, sizeof(cl_float2) * N, hostResult, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Device to Host error!
";
}
//clFinish(cmd_queue);

error = clEnqueueReadBuffer(cmd_queue, deviceBufIds, CL_TRUE, 0, sizeof(int) * N, ids, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Device to Host error!
";
}*/

timeWorkGPU = clock() - timeWorkGPU;
std::cout << "GPU time: " << (double)timeWorkGPU / CLOCKS_PER_SEC<<"
";

/*for (int i = 0; i <= N-1; i++)
{
std::cout << hostResult[i].s[0] << "	" << hostResult[i].s[1] << "
";
}*/

//*** DSP.cpp start
// cpfData holds only 2 elements while N samples are requested; FFT() ignores
// the data pointer (it hard-codes ones), so no overrun actually occurs.
double cpfData[2] = {1.0, 1.0};
Complex pSpectra[N];

time_t timeWorkCPU = clock();

for (int countRepeat = 0; countRepeat < COUNT_REPEAT_MAX; countRepeat++)
{

FFT(cpfData, pSpectra, N, N, dwBits); //Serial FFT

}

timeWorkCPU = clock() - timeWorkCPU;
std::cout << "CPU time: " << (double)timeWorkCPU / CLOCKS_PER_SEC<<"
";

/*std::cout<< "--- Result (CPU): ---" << std::endl;
for (int i = 0; i <= N - 1; i++)
{
std::cout << "[" << i <<"]: " <<  pSpectra[i].getRe() << ", " << pSpectra[i].getIm() << "
";
}*/

//*** DSP.cpp end

system("PAUSE");
return 0;

}

``````

You don’t need two clFinish calls in a row. Also, it might be worth measuring the time properly.

The problem is perhaps related to a poor FFT implementation. In fact, I’m fairly certain you perform a “naive Fourier transform” on the GPU while the CPU runs a legit FFT. If that is not the case, a simple optimization can be made. Global memory reads and writes are expensive on a GPU, so you’d better replace

``````
"		outResult[id].x = 0;
"
"		outResult[id].y = 0;
"
``````
``````
with

``````

__private float2 accumulator;//__private is still an optional keyword, but kinda I’m trying to outline the difference

``````
and

``````
``````	"			outResult[id].x = ADD_COMPLEX_RE(outResult[id], mul_result);"
``````
``````
with

``````
``````	"			accumulator.x = ADD_COMPLEX_RE(accumulator, mul_result);"
``````
``````
Then you add "outResult[id] = accumulator;" after the for loop.

There is also a common optimization for the cases when every thread in a workgroup wants to read the same __global variable:

``````

__local float a;
if (id == 0){
a = global_array[j];
}
__barrier(CLK_LOCAL_MEM);

Thank you very much!
You are right. My FFT implementation is poor. Really, I use a “naive Fourier transform” on the GPU. I must design a different parallel algorithm.