Work-group sizes

bieelpaiao · April 23, 2024, 12:00am

Hello everyone!

I’m working with a kernel to offload complex matrix multiplication to my GPU. It works fine for matrices with small dimensions, but it stops working as the dimensions increase. I think the problem is the global and local work-group sizes, but I don’t know how to fix it. Can anyone help me?

I tried multiplying two 1024x1024 matrices and it worked with global_work_size = [1024, 1024, 0] and local_work_size = [16, 16, 0], but when I tried to multiply 1024x5 by 7x1024, for example, it didn’t work. Is this related to my processor and GPU settings? I have an AMD Radeon 540X in my laptop, and the kernel is offloaded to it.

Dark_Photon · April 23, 2024, 12:22pm

You probably should post your kernel code, so folks can see how you’re handling spare threads in a wavefront/warp and handling non-square matrices.

Also…

I assume you meant 5x1024 and 1024x7. Since if you don’t have the 1024 dim paired up that would be one reason why it didn’t work.

bieelpaiao · April 24, 2024, 4:30pm

Right. My host code is as follows:

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#define __CL_ENABLE_EXCEPTIONS

#include <iostream>
#include <vector>
#include <fstream>
#include <CL/cl.hpp>

#define M 1024
#define N 1024
#define K 1024

int main() {
    // Initialize input vectors with values of the real and imaginary parts of matrices A and B
    std::vector<double> A_real(M * K, 1);
    std::vector<double> A_imag(M * K, 1);
    std::vector<double> B_real(K * N, 2);
    std::vector<double> B_imag(K * N, 1);

    // Initialize output vector to store multiplication results
    std::vector<double> C_real(M * N);
    std::vector<double> C_imag(M * N);

    try {
        // Set up OpenCL platform
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        if (platforms.empty()) {
            std::cerr << "No platforms found!" << std::endl;
            return 1;
        }

        // Select the first device of the first platform
        cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
        cl::Context context(CL_DEVICE_TYPE_GPU, properties);

        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create command queue
        cl::CommandQueue queue(context, devices[0]);

        // Create memory buffers for input and output vectors
        cl::Buffer buffer_A_real(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * A_real.size(), A_real.data());
        cl::Buffer buffer_A_imag(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * A_imag.size(), A_imag.data());
        cl::Buffer buffer_B_real(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * B_real.size(), B_real.data());
        cl::Buffer buffer_B_imag(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * B_imag.size(), B_imag.data());
        cl::Buffer buffer_C_real(context, CL_MEM_WRITE_ONLY, sizeof(double) * C_real.size());
        cl::Buffer buffer_C_imag(context, CL_MEM_WRITE_ONLY, sizeof(double) * C_imag.size());

        // Load OpenCL kernel from file
        std::ifstream kernelFile("complex_matrix_mult_kernel.cl");
        std::string src(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
        cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
        cl::Program program(context, sources);

        // Compile OpenCL program
        program.build(devices);

        // Create OpenCL kernel
        cl::Kernel kernel(program, "complex_matrix_mult");

        // Set kernel arguments
        kernel.setArg(0, buffer_A_real);
        kernel.setArg(1, buffer_A_imag);
        kernel.setArg(2, buffer_B_real);
        kernel.setArg(3, buffer_B_imag);
        kernel.setArg(4, buffer_C_real);
        kernel.setArg(5, buffer_C_imag);
        kernel.setArg(6, M);
        kernel.setArg(7, N);
        kernel.setArg(8, K);

        // Execute OpenCL kernel
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M, K), cl::NDRange(16, 16));

        // Read results back from device
        queue.enqueueReadBuffer(buffer_C_real, CL_TRUE, 0, sizeof(double) * C_real.size(), C_real.data());
        queue.enqueueReadBuffer(buffer_C_imag, CL_TRUE, 0, sizeof(double) * C_imag.size(), C_imag.data());

        // Print results
        /*std::cout << "Resultado da multiplicação de matrizes complexas:" << std::endl;
        for (int i = 0; i < M; ++i) {
            for (int j = 0; j < K; ++j) {
                if (C_imag[i * K + j] >=0){
                    std::cout << C_real[i * K + j] << " + " << C_imag[i * K + j] << "i\t";
                }
                else {
                    std::cout << C_real[i * K + j] << "  " << C_imag[i * K + j] << "i\t";
                }           
            }
            std::cout << std::endl;
        }*/

        std::cout << "Results: \n\tC_real[0] = " << C_real[0] << std::endl;
        std::cout << "Results: \n\tC_imag[0] = " << C_imag[0] << std::endl;
    }
    catch (cl::Error err) {
        std::cerr << "Erro: " << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

And the kernel code is

// Complex matrix product C = A * B with real and imaginary parts stored in
// separate row-major arrays.
//
// Shapes implied by the indexing below:
//   A is M x N (row stride N), B is N x K (row stride K), C is M x K (row stride K).
//
// Each work-item computes one element of C: get_global_id(0) selects the row,
// get_global_id(1) selects the column. Work-items outside the M x K range do
// nothing, so the host may round the global size up to a multiple of the
// work-group size.
//
// NOTE: the kernel uses double precision, so the device must support it.
__kernel void complex_matrix_mult(__global const double* A_real,
                                  __global const double* A_imag,
                                  __global const double* B_real,
                                  __global const double* B_imag,
                                  __global double* C_real,
                                  __global double* C_imag,
                                  const int M,
                                  const int N,
                                  const int K) {
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    // Skip padding work-items so they never read or write out of bounds.
    if (i >= M || j >= K) {
        return;
    }

    // Accumulate in private variables instead of updating global memory on
    // every iteration; C is written exactly once at the end.
    double acc_real = 0.0;
    double acc_imag = 0.0;

    // Multiply row i of A by column j of B.
    for (int k = 0; k < N; k++) {
        const double a_re = A_real[i * N + k];
        const double a_im = A_imag[i * N + k];
        const double b_re = B_real[k * K + j];
        const double b_im = B_imag[k * K + j];

        // (a_re + i*a_im) * (b_re + i*b_im)
        acc_real += a_re * b_re - a_im * b_im;
        acc_imag += a_re * b_im + a_im * b_re;
    }

    C_real[i * K + j] = acc_real;
    C_imag[i * K + j] = acc_imag;
}

I used the code below to query CL_KERNEL_WORK_GROUP_SIZE for my kernel, and it returns 256.

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <vector>

int main() {
        // Obter todas as plataformas disponíveis
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);

        if (platforms.empty()) {
            std::cerr << "Nenhuma plataforma OpenCL encontrada." << std::endl;
            return 1;
        }

        // Escolher a primeira plataforma
        cl::Platform platform = platforms.front();

        // Obter todos os dispositivos na plataforma escolhida
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);

        if (devices.empty()) {
            std::cerr << "Nenhum dispositivo OpenCL encontrado." << std::endl;
            return 1;
        }

        // Escolher o primeiro dispositivo
        cl::Device device = devices.front();

        // Ler o código-fonte do kernel de um arquivo
        std::ifstream sourceFile("complex_matrix_mult_kernel.cl");
        std::string sourceCode(std::istreambuf_iterator<char>(sourceFile), (std::istreambuf_iterator<char>()));
        cl::Program::Sources sources(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));

        // Criar o programa a partir do código-fonte
        cl::Context context(device);
        cl::Program program(context, sources);
        program.build({ device });

        // Obter o tamanho máximo do work-group para o kernel específico
        cl::Kernel kernel(program, "complex_matrix_mult");
        size_t maxWorkGroupSize;
        kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &maxWorkGroupSize);
        std::cout << "Tamanho máximo do work-group para o kernel específico: " << maxWorkGroupSize << std::endl;

    

    return 0;
}