Work-group sizes

Hello everyone!

I’m working with a kernel to offload complex matrix multiplication to my GPU, and it works fine for matrices with small dimensions, but it stops working as the dimensions increase. I think the problem is the global and local work-group sizes, but I don’t know how to fix it. Can anyone help me?

I’ve tried to multiply two 1024x1024 matrices and it worked for global_work_size = [1024, 1024, 0] and local_work_size = [16, 16, 0], but when I tried to multiply 1024x5 by 7x1024, for example, it didn’t work. Is this related to my processor and GPU settings? I have an AMD Radeon 540X in my laptop, and the kernel is offloaded to it.

You probably should post your kernel code, so folks can see how you’re handling spare threads in a wavefront/warp and handling non-square matrices.

Also…

I assume you meant 5x1024 and 1024x7. Since if you don’t have the 1024 dim paired up that would be one reason why it didn’t work. :slight_smile:

Right. My host code is as follows:

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#define __CL_ENABLE_EXCEPTIONS

#include <iostream>
#include <vector>
#include <fstream>
#include <CL/cl.hpp>

#define M 1024
#define N 1024
#define K 1024

int main() {
    // Initialize input vectors with values of the real and imaginary parts of matrices A and B
    std::vector<double> A_real(M * K, 1);
    std::vector<double> A_imag(M * K, 1);
    std::vector<double> B_real(K * N, 2);
    std::vector<double> B_imag(K * N, 1);

    // Initialize output vector to store multiplication results
    std::vector<double> C_real(M * N);
    std::vector<double> C_imag(M * N);

    try {
        // Set up OpenCL platform
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        if (platforms.empty()) {
            std::cerr << "No platforms found!" << std::endl;
            return 1;
        }

        // Select the first device of the first platform
        cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
        cl::Context context(CL_DEVICE_TYPE_GPU, properties);

        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create command queue
        cl::CommandQueue queue(context, devices[0]);

        // Create memory buffers for input and output vectors
        cl::Buffer buffer_A_real(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * A_real.size(), A_real.data());
        cl::Buffer buffer_A_imag(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * A_imag.size(), A_imag.data());
        cl::Buffer buffer_B_real(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * B_real.size(), B_real.data());
        cl::Buffer buffer_B_imag(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * B_imag.size(), B_imag.data());
        cl::Buffer buffer_C_real(context, CL_MEM_WRITE_ONLY, sizeof(double) * C_real.size());
        cl::Buffer buffer_C_imag(context, CL_MEM_WRITE_ONLY, sizeof(double) * C_imag.size());

        // Load OpenCL kernel from file
        std::ifstream kernelFile("complex_matrix_mult_kernel.cl");
        std::string src(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
        cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
        cl::Program program(context, sources);

        // Compile OpenCL program
        program.build(devices);

        // Create OpenCL kernel
        cl::Kernel kernel(program, "complex_matrix_mult");

        // Set kernel arguments
        kernel.setArg(0, buffer_A_real);
        kernel.setArg(1, buffer_A_imag);
        kernel.setArg(2, buffer_B_real);
        kernel.setArg(3, buffer_B_imag);
        kernel.setArg(4, buffer_C_real);
        kernel.setArg(5, buffer_C_imag);
        kernel.setArg(6, M);
        kernel.setArg(7, N);
        kernel.setArg(8, K);

        // Execute OpenCL kernel
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M, K), cl::NDRange(16, 16));

        // Read results back from device
        queue.enqueueReadBuffer(buffer_C_real, CL_TRUE, 0, sizeof(double) * C_real.size(), C_real.data());
        queue.enqueueReadBuffer(buffer_C_imag, CL_TRUE, 0, sizeof(double) * C_imag.size(), C_imag.data());

        // Print results
        /*std::cout << "Resultado da multiplicação de matrizes complexas:" << std::endl;
        for (int i = 0; i < M; ++i) {
            for (int j = 0; j < K; ++j) {
                if (C_imag[i * K + j] >=0){
                    std::cout << C_real[i * K + j] << " + " << C_imag[i * K + j] << "i\t";
                }
                else {
                    std::cout << C_real[i * K + j] << "  " << C_imag[i * K + j] << "i\t";
                }           
            }
            std::cout << std::endl;
        }*/

        std::cout << "Results: \n\tC_real[0] = " << C_real[0] << std::endl;
        std::cout << "Results: \n\tC_imag[0] = " << C_imag[0] << std::endl;
    }
    catch (cl::Error err) {
        std::cerr << "Erro: " << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

And the kernel code is

// Complex matrix product C = A * B with real and imaginary parts stored in
// separate row-major arrays. One work-item computes one element of C.
//
// As indexed below:
//   A is M x N  (element (i, k) at A[i * N + k])
//   B is N x K  (element (k, j) at B[k * K + j])
//   C is M x K  (element (i, j) at C[i * K + j])
// i.e. N is the shared inner dimension and K is the column count of B and C.
//
// Work-items whose global id falls outside M x K do nothing, so the host may
// round the global size up to a multiple of the work-group size.
__kernel void complex_matrix_mult(__global double* A_real,
                                  __global double* A_imag,
                                  __global double* B_real,
                                  __global double* B_imag,
                                  __global double* C_real,
                                  __global double* C_imag,
                                  const int M, 
                                  const int N, 
                                  const int K) {
    int i = get_global_id(0);
    int j = get_global_id(1);

    // Skip padding work-items so they never touch memory outside the matrices
    if (i >= M || j >= K) {
        return;
    }

    // Accumulate in private variables instead of read-modify-writing global
    // memory on every iteration; the result is stored once at the end
    double sum_real = 0;
    double sum_imag = 0;

    // Multiply each element of row i of A by the matching element of column j of B
    for (int k = 0; k < N; k++) {
        const double a_re = A_real[i * N + k];
        const double a_im = A_imag[i * N + k];
        const double b_re = B_real[k * K + j];
        const double b_im = B_imag[k * K + j];

        // (a_re + i*a_im) * (b_re + i*b_im)
        sum_real += a_re * b_re - a_im * b_im;
        sum_imag += a_re * b_im + a_im * b_re;
    }

    C_real[i * K + j] = sum_real;
    C_imag[i * K + j] = sum_imag;
}

I’ve used the code below to query CL_KERNEL_WORK_GROUP_SIZE for my kernel, and it returns 256.

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <vector>

int main() {
        // Obter todas as plataformas disponíveis
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);

        if (platforms.empty()) {
            std::cerr << "Nenhuma plataforma OpenCL encontrada." << std::endl;
            return 1;
        }

        // Escolher a primeira plataforma
        cl::Platform platform = platforms.front();

        // Obter todos os dispositivos na plataforma escolhida
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);

        if (devices.empty()) {
            std::cerr << "Nenhum dispositivo OpenCL encontrado." << std::endl;
            return 1;
        }

        // Escolher o primeiro dispositivo
        cl::Device device = devices.front();

        // Ler o código-fonte do kernel de um arquivo
        std::ifstream sourceFile("complex_matrix_mult_kernel.cl");
        std::string sourceCode(std::istreambuf_iterator<char>(sourceFile), (std::istreambuf_iterator<char>()));
        cl::Program::Sources sources(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));

        // Criar o programa a partir do código-fonte
        cl::Context context(device);
        cl::Program program(context, sources);
        program.build({ device });

        // Obter o tamanho máximo do work-group para o kernel específico
        cl::Kernel kernel(program, "complex_matrix_mult");
        size_t maxWorkGroupSize;
        kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &maxWorkGroupSize);
        std::cout << "Tamanho máximo do work-group para o kernel específico: " << maxWorkGroupSize << std::endl;

    

    return 0;
}