I have a sycl::buffer<int, 1>
explicitly initialized on the target device with data std::vector<int>
.
My work item operations are going to access the data in the buffer a lot, and so I want to make local copies for efficiency. However, this is for hierarchical kernels:
#include <CL/sycl.hpp>
#include <chrono>
#include <cstdint>
#include <iostream>
template <typename T>
sycl::buffer<T, 1> buffer_create_1D(sycl::queue &q, const std::vector<T> &data, sycl::event &res_event)
{
sycl::buffer<T> result(sycl::range<1>(data.size()));
res_event = q.submit([&](sycl::handler &h)
{
auto res_acc = result.template get_access<sycl::access::mode::write>(h);
h.copy(data.data(), res_acc); });
return result;
}
int main()
{
sycl::queue q(sycl::gpu_selector_v);
auto device = q.get_device();
auto max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();
auto max_compute_units = device.get_info<sycl::info::device::max_compute_units>();
uint32_t N = max_work_group_size * max_compute_units;
std::vector<int> vec(N, 1);
sycl::event res_event;
auto buf = buffer_create_1D(q, vec, res_event);
q.submit([&](sycl::handler &h)
{
h.depends_on(res_event);
auto acc = buf.template get_access<sycl::access::mode::read_write>(h);
auto local_acc = sycl::accessor<int, 1, sycl::access::mode::read_write, sycl::access::target::local>(sycl::range<1>(max_work_group_size), h);
h.parallel_for_work_group(sycl::range<1>(max_compute_units), [=](sycl::group<1> gr)
{
gr.parallel_for_work_item([&](sycl::h_item<1> it)
{
auto gid = it.get_global_id();
auto lid = it.get_local_id();
auto abs_id = it.get_global_id() + gr.get_id() * max_work_group_size;
local_acc[lid] = acc[abs_id];
});
gr.parallel_for_work_item([&](sycl::h_item<1> it)
{
auto gid = it.get_global_id();
auto lid = it.get_local_id();
local_acc[lid] += 1;
});
gr.parallel_for_work_item([&](sycl::h_item<1> it)
{
auto gid = it.get_global_id();
auto lid = it.get_local_id();
auto abs_id = it.get_global_id() + gr.get_id() * max_work_group_size;
acc[abs_id] = local_acc[lid];
});
}); });
return 0;
}
Be it read_only
or read_write
, is it efficient to perform a copy like this, or is it better to have the access be implicitly handled using the global accessor acc
?