Right. My host code is as follows:
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#define __CL_ENABLE_EXCEPTIONS
#include <iostream>
#include <vector>
#include <fstream>
#include <CL/cl.hpp>
#define M 1024
#define N 1024
#define K 1024
int main() {
// Initialize input vectors with values of the real and imaginary parts of matrices A and B
std::vector<double> A_real(M * K, 1);
std::vector<double> A_imag(M * K, 1);
std::vector<double> B_real(K * N, 2);
std::vector<double> B_imag(K * N, 1);
// Initialize output vector to store multiplication results
std::vector<double> C_real(M * N);
std::vector<double> C_imag(M * N);
try {
// Set up OpenCL platform
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.empty()) {
std::cerr << "No platforms found!" << std::endl;
return 1;
}
// Select the first device of the first platform
cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
cl::Context context(CL_DEVICE_TYPE_GPU, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create command queue
cl::CommandQueue queue(context, devices[0]);
// Create memory buffers for input and output vectors
cl::Buffer buffer_A_real(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * A_real.size(), A_real.data());
cl::Buffer buffer_A_imag(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * A_imag.size(), A_imag.data());
cl::Buffer buffer_B_real(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * B_real.size(), B_real.data());
cl::Buffer buffer_B_imag(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * B_imag.size(), B_imag.data());
cl::Buffer buffer_C_real(context, CL_MEM_WRITE_ONLY, sizeof(double) * C_real.size());
cl::Buffer buffer_C_imag(context, CL_MEM_WRITE_ONLY, sizeof(double) * C_imag.size());
// Load OpenCL kernel from file
std::ifstream kernelFile("complex_matrix_mult_kernel.cl");
std::string src(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
cl::Program program(context, sources);
// Compile OpenCL program
program.build(devices);
// Create OpenCL kernel
cl::Kernel kernel(program, "complex_matrix_mult");
// Set kernel arguments
kernel.setArg(0, buffer_A_real);
kernel.setArg(1, buffer_A_imag);
kernel.setArg(2, buffer_B_real);
kernel.setArg(3, buffer_B_imag);
kernel.setArg(4, buffer_C_real);
kernel.setArg(5, buffer_C_imag);
kernel.setArg(6, M);
kernel.setArg(7, N);
kernel.setArg(8, K);
// Execute OpenCL kernel
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M, K), cl::NDRange(16, 16));
// Read results back from device
queue.enqueueReadBuffer(buffer_C_real, CL_TRUE, 0, sizeof(double) * C_real.size(), C_real.data());
queue.enqueueReadBuffer(buffer_C_imag, CL_TRUE, 0, sizeof(double) * C_imag.size(), C_imag.data());
// Print results
/*std::cout << "Resultado da multiplicação de matrizes complexas:" << std::endl;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < K; ++j) {
if (C_imag[i * K + j] >=0){
std::cout << C_real[i * K + j] << " + " << C_imag[i * K + j] << "i\t";
}
else {
std::cout << C_real[i * K + j] << " " << C_imag[i * K + j] << "i\t";
}
}
std::cout << std::endl;
}*/
std::cout << "Results: \n\tC_real[0] = " << C_real[0] << std::endl;
std::cout << "Results: \n\tC_imag[0] = " << C_imag[0] << std::endl;
}
catch (cl::Error err) {
std::cerr << "Erro: " << err.what() << "(" << err.err() << ")" << std::endl;
}
return 0;
}
And the kernel code is
/* double is an optional feature; OpenCL 1.0/1.1 compilers require the
 * extension to be enabled explicitly before the type may be used. */
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif

/*
 * Complex matrix product C = A * B with real and imaginary parts stored in
 * separate row-major arrays.
 *
 * As indexed below: A is M x N, B is N x K and C is M x K, i.e. N is the
 * shared (inner) dimension and K is the number of columns of B and C.
 *
 * Each work-item computes one element of C: global id 0 selects the row i,
 * global id 1 selects the column j.
 */
__kernel void complex_matrix_mult(__global const double* A_real,
                                  __global const double* A_imag,
                                  __global const double* B_real,
                                  __global const double* B_imag,
                                  __global double* C_real,
                                  __global double* C_imag,
                                  const int M,
                                  const int N,
                                  const int K) {
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    /* Ignore work-items outside the matrix so a launch range that was rounded
     * up to a multiple of the work-group size cannot write out of bounds. */
    if (i >= M || j >= K) {
        return;
    }

    /* Accumulate in private variables and write the result once, instead of
     * reading and writing global memory on every iteration of the loop. */
    double acc_real = 0.0;
    double acc_imag = 0.0;

    /* Multiply row i of A by column j of B. */
    for (int k = 0; k < N; k++) {
        const double a_re = A_real[i * N + k];
        const double a_im = A_imag[i * N + k];
        const double b_re = B_real[k * K + j];
        const double b_im = B_imag[k * K + j];

        /* (a_re + i a_im) * (b_re + i b_im) */
        acc_real += a_re * b_re - a_im * b_im;
        acc_imag += a_re * b_im + a_im * b_re;
    }

    C_real[i * K + j] = acc_real;
    C_imag[i * K + j] = acc_imag;
}
I used the code below to query CL_KERNEL_WORK_GROUP_SIZE for my kernel, and it returns 256.
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <vector>
int main() {
// Obter todas as plataformas disponíveis
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.empty()) {
std::cerr << "Nenhuma plataforma OpenCL encontrada." << std::endl;
return 1;
}
// Escolher a primeira plataforma
cl::Platform platform = platforms.front();
// Obter todos os dispositivos na plataforma escolhida
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.empty()) {
std::cerr << "Nenhum dispositivo OpenCL encontrado." << std::endl;
return 1;
}
// Escolher o primeiro dispositivo
cl::Device device = devices.front();
// Ler o código-fonte do kernel de um arquivo
std::ifstream sourceFile("complex_matrix_mult_kernel.cl");
std::string sourceCode(std::istreambuf_iterator<char>(sourceFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources sources(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));
// Criar o programa a partir do código-fonte
cl::Context context(device);
cl::Program program(context, sources);
program.build({ device });
// Obter o tamanho máximo do work-group para o kernel específico
cl::Kernel kernel(program, "complex_matrix_mult");
size_t maxWorkGroupSize;
kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &maxWorkGroupSize);
std::cout << "Tamanho máximo do work-group para o kernel específico: " << maxWorkGroupSize << std::endl;
return 0;
}