50namespace shamalgs::numeric::details {
55 sycl::buffer<T> exclusive_sum_gpugems39_1(sycl::queue &q, sycl::buffer<T> &buf1,
u32 len) {
57 sycl::buffer<T> out1(len);
58 sycl::buffer<T> out2(len);
60 auto get_in_buf_ref = [&](
u32 step) -> sycl::buffer<T> & {
68 auto get_out_buf_ref = [&](
u32 step) -> sycl::buffer<T> & {
78 q.submit([&](sycl::handler &cgh) {
79 sycl::accessor acc_in{buf1, cgh, sycl::read_only};
80 sycl::accessor acc_out{get_in_buf_ref(step), cgh, sycl::write_only, sycl::no_init};
82 cgh.parallel_for(sycl::range<1>{len}, [=](sycl::item<1> id) {
83 u32 thid =
id.get_linear_id();
84 acc_out[id] = (thid > 0) ? acc_in[thid - 1] : 0;
88 for (
int offset = 1; offset < len; offset *= 2) {
90 q.submit([&, offset](sycl::handler &cgh) {
91 sycl::accessor acc_in{get_in_buf_ref(step), cgh, sycl::read_only};
92 sycl::accessor acc_out{get_out_buf_ref(step), cgh, sycl::write_only};
94 cgh.parallel_for<KernelExclsum_1<T>>(sycl::range<1>{len}, [=](sycl::item<1> id) {
95 u32 thid =
id.get_linear_id();
97 const auto in_val = acc_in[thid];
99 acc_out[thid] = (thid >= offset) ? in_val + acc_in[thid - offset] : in_val;
106 return std::move(get_in_buf_ref(step));
113 sycl::buffer<T> exclusive_sum_gpugems39_2(sycl::queue &q, sycl::buffer<T> &buf1,
u32 len) {
117 sycl::buffer<T> out1(rounded_len);
118 sycl::buffer<T> out2(rounded_len);
120 auto get_in_buf_ref = [&](
u32 step) -> sycl::buffer<T> & {
128 auto get_out_buf_ref = [&](
u32 step) -> sycl::buffer<T> & {
138 q.submit([&](sycl::handler &cgh) {
139 u32 correct_len = len;
140 sycl::accessor acc_in{buf1, cgh, sycl::read_only};
141 sycl::accessor acc_out{get_in_buf_ref(step), cgh, sycl::write_only, sycl::no_init};
143 cgh.parallel_for(sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
144 u32 thid =
id.get_linear_id();
145 acc_out[id] = (thid > 0 && thid < correct_len) ? acc_in[thid - 1] : 0;
149 for (
int offset = 1; offset < rounded_len; offset *= 2) {
151 q.submit([&, offset](sycl::handler &cgh) {
152 sycl::accessor acc_in{get_in_buf_ref(step), cgh, sycl::read_only};
153 sycl::accessor acc_out{get_out_buf_ref(step), cgh, sycl::write_only};
155 cgh.parallel_for<KernelExclsum_2<T>>(
156 sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
157 u32 thid =
id.get_linear_id();
159 const auto in_val = acc_in[thid];
161 acc_out[thid] = (thid >= offset) ? in_val + acc_in[thid - offset] : in_val;
168 return std::move(get_in_buf_ref(step));
175 sycl::buffer<T> exclusive_sum_gpugems39_3(sycl::queue &q, sycl::buffer<T> &buf1,
u32 len) {
179 sycl::buffer<T> out1(rounded_len);
180 sycl::buffer<T> out2(rounded_len);
182 auto get_in_buf_ref = [&](
u32 step) -> sycl::buffer<T> & {
190 auto get_out_buf_ref = [&](
u32 step) -> sycl::buffer<T> & {
200 q.submit([&](sycl::handler &cgh) {
201 u32 correct_len = len;
202 sycl::accessor acc_in{buf1, cgh, sycl::read_only};
203 sycl::accessor acc_out{get_in_buf_ref(step), cgh, sycl::write_only, sycl::no_init};
205 cgh.parallel_for(sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
206 u32 thid =
id.get_linear_id();
207 acc_out[id] = (thid > 0 && thid < correct_len) ? acc_in[thid - 1] : 0;
211 for (
int offset = 1; offset < rounded_len; offset *= 2) {
213 q.submit([&, offset](sycl::handler &cgh) {
214 sycl::accessor acc_in{get_in_buf_ref(step), cgh, sycl::read_only};
215 sycl::accessor acc_out{get_out_buf_ref(step), cgh, sycl::write_only};
217 cgh.parallel_for<KernelExclsum_3<T>>(
218 sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
219 u32 thid =
id.get_linear_id();
221 const auto in_val = acc_in[thid];
223 acc_out[thid] = (thid >= offset) ? in_val + acc_in[thid - offset] : in_val;
230 return std::move(get_in_buf_ref(step));
233 template sycl::buffer<u32> exclusive_sum_gpugems39_1(
234 sycl::queue &q, sycl::buffer<u32> &buf1,
u32 len);
236 template sycl::buffer<u32> exclusive_sum_gpugems39_2(
237 sycl::queue &q, sycl::buffer<u32> &buf1,
u32 len);
239 template sycl::buffer<u32> exclusive_sum_gpugems39_3(
240 sycl::queue &q, sycl::buffer<u32> &buf1,
u32 len);
std::uint32_t u32
32 bit unsigned integer
constexpr T roundup_pow2_clz(T v) noexcept
round up to the next power of two 0 is rounded up to 1 as it is not a pow of 2 every input above the ...
main include file for memory algorithms