Initialization of local memory for hierarchical kernels

I have a sycl::buffer<int, 1> explicitly initialized on the target device with data std::vector<int>.

My work item operations are going to access the data in the buffer a lot, and so I want to make local copies for efficiency. However, this is for hierarchical kernels:

#include <CL/sycl.hpp>

#include <chrono>
#include <cstdint>
#include <iostream>

template <typename T>
sycl::buffer<T, 1> buffer_create_1D(sycl::queue &q, const std::vector<T> &data, sycl::event &res_event)
{
    sycl::buffer<T> result(sycl::range<1>(data.size()));

    res_event = q.submit([&](sycl::handler &h)
                         {
                auto res_acc = result.template get_access<sycl::access::mode::write>(h);
                h.copy(data.data(), res_acc); });
    return result;
}

int main()
{
    sycl::queue q(sycl::gpu_selector_v);
    auto device = q.get_device();
    auto max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();
    auto max_compute_units = device.get_info<sycl::info::device::max_compute_units>();
    uint32_t N = max_work_group_size * max_compute_units;

    std::vector<int> vec(N, 1);
    sycl::event res_event;
    auto buf = buffer_create_1D(q, vec, res_event);

    q.submit([&](sycl::handler &h)
             {
        h.depends_on(res_event);
        auto acc =  buf.template get_access<sycl::access::mode::read_write>(h);
        auto local_acc = sycl::accessor<int, 1, sycl::access::mode::read_write, sycl::access::target::local>(sycl::range<1>(max_work_group_size), h);
        h.parallel_for_work_group(sycl::range<1>(max_compute_units), [=](sycl::group<1> gr)
        {
            gr.parallel_for_work_item([&](sycl::h_item<1> it)
            {
                auto gid = it.get_global_id();
                auto lid = it.get_local_id();
                auto abs_id = it.get_global_id() + gr.get_id() * max_work_group_size;
                local_acc[lid] = acc[abs_id];
            });
            gr.parallel_for_work_item([&](sycl::h_item<1> it)
            {
                auto gid = it.get_global_id();
                auto lid = it.get_local_id();
                local_acc[lid] += 1;

            });

            gr.parallel_for_work_item([&](sycl::h_item<1> it)
            {
                auto gid = it.get_global_id();
                auto lid = it.get_local_id();
                auto abs_id = it.get_global_id() + gr.get_id() * max_work_group_size;
                acc[abs_id] = local_acc[lid];
            });
        }); });

    return 0;
}

Be it read_only or read_write, is it efficient to perform a copy like this, or is it better to have the access be implicitly handled using the global accessor acc?

You should try first without using local memory but taking care that each work-group is using its own coalesced part of the global buffer. As modern GPU have a lot of cache, it my just work out of the box with good performance. Beware to the false-sharing at the cache-line boundary between work-group. You might want to align the access to full cache lines.
If you are still far from the expected peak performance, you might dive into using local memory.

This topic was automatically closed 180 days after the last reply. New replies are no longer allowed.