/// SYCL kernel name tags for the split-table generation kernels,
/// one per Morton code width (32-bit and 64-bit).
class Kernel_generate_split_table_morton32;
class Kernel_generate_split_table_morton64;
// Generates the split table for a Morton code set: the kernel compares each
// code m[i] against its predecessor m[i - 1] (presumably writing a 0/1 flag
// into split_out — the actual write is outside this extracted fragment).
// NOTE(review): extraction fragment — the original file's line numbers are
// fused into the tokens (e.g. "34void") and interior lines are missing
// (embedded numbers jump 34 -> 37 and 47 -> 50), so the parameter list and
// the kernel tail are incomplete here. Code kept byte-identical.
33template<
class u_morton,
class kername,
class split_
int>
34void sycl_generate_split_table(
37 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
38 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table) {
// One work-item per Morton code.
40 sycl::range<1> range_morton_count{morton_count};
42 queue.submit([&](sycl::handler &cgh) {
// Morton codes are read-only; split table is write-only, no init needed.
43 sycl::accessor m{*buf_morton, cgh, sycl::read_only};
44 sycl::accessor split_out{*buf_split_table, cgh, sycl::write_only, sycl::no_init};
46 cgh.parallel_for<kername>(range_morton_count, [=](sycl::item<1> item) {
47 u32 i = (
u32) item.get_id(0);
// A split candidate is where consecutive Morton codes differ.
50 if (m[i - 1] != m[i]) {
// Overload of the split-table generation taking raw (__restrict) pointers —
// presumably the USM/DeviceBuffer variant dispatched via a kernel_call-style
// helper; verify against the full source.
// NOTE(review): extraction fragment — embedded line numbers jump (63 -> 74),
// so the parameter list and most of the lambda body are missing.
62template<
class u_morton,
class split_
int>
63void sycl_generate_split_table(
74 [](
u32 i,
const u_morton *__restrict m, split_int *__restrict split_out) {
// Same predicate as the buffer variant: split where adjacent codes differ.
76 if (m[i - 1] != m[i]) {
/// SYCL kernel name tags for the split-table reduction-iteration kernels,
/// one per Morton code width (32-bit and 64-bit).
class Kernel_iterate_reduction_morton32;
class Kernel_iterate_reduction_morton64;
// One reduction iteration over the split table: reads split_in, writes
// split_out. Each work-item scans backwards (before1, before2) and forwards
// (next1) for the nearest set split flags, evaluates a DELTA operator on the
// three index pairs (per the file's docs this appears to wrap karras_delta,
// the delta operator of Karras 2012 — confirm against the full source), and
// decides whether index i keeps its split flag.
// NOTE(review): extraction fragment — fused line numbers and missing interior
// lines (163 -> 169, 187 -> end); "OFFSET" on lines below looks like an
// unexpanded/garbled macro remnant from the extraction.
143template<
class u_morton,
class kername,
class split_
int>
144void sycl_reduction_iteration(
147 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
148 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table_in,
149 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table_out) {
151 sycl::range<1> range_morton_count{morton_count};
153 queue.submit([&](sycl::handler &cgh) {
// Copy count by value so the device lambda captures a plain u32.
154 u32 _morton_cnt = morton_count;
156 sycl::accessor m{*buf_morton, cgh, sycl::read_only};
157 sycl::accessor split_in{*buf_split_table_in, cgh, sycl::read_only};
158 sycl::accessor split_out{*buf_split_table_out, cgh, sycl::write_only, sycl::no_init};
160 cgh.parallel_for<kername>(range_morton_count, [=](sycl::item<1> item) {
161 int i = item.get_id(0);
163 auto DELTA = [=](
i32 x,
i32 y) {
// Scan for the nearest preceding set split flags. The "<= _morton_cnt - 1"
// comparison on an unsigned index presumably also terminates via wraparound
// when the index underflows below 0 — confirm before1/before2 init (missing).
169 while (before1 <= _morton_cnt - 1 && !split_in[before1 OFFSET])
174 u32 before2 = before1 - 1;
175 while (before2 <= _morton_cnt - 1 && !split_in[before2 OFFSET])
// Scan forward for the next set split flag.
180 while (next1 <= _morton_cnt - 1 && !split_in[next1])
183 int delt_0 = DELTA(i, next1);
184 int delt_m = DELTA(i, before1);
185 int delt_mm = DELTA(before1, before2);
// Keep/propagate the split flag only when the delta criterion holds.
187 if (!(delt_0 < delt_m && delt_mm < delt_m) && split_in[i]) {
// Raw-pointer (__restrict) overload of the reduction iteration — presumably
// the USM/DeviceBuffer variant; mirrors the accessor-based kernel above.
// NOTE(review): extraction fragment — fused line numbers, missing interior
// lines (197 -> 209, 214 -> 220), and garbled "OFFSET" macro remnants.
196template<
class u_morton,
class split_
int>
197void sycl_reduction_iteration(
// Captures the element count by value for the device lambda.
209 [_morton_cnt = morton_count](
211 const u_morton *__restrict m,
212 const split_int *__restrict split_in,
213 split_int *__restrict split_out) {
214 auto DELTA = [=](
i32 x,
i32 y) {
// Backward scans for the nearest set split flags (init lines missing here).
220 while (before1 <= _morton_cnt - 1 && !split_in[before1 OFFSET])
225 u32 before2 = before1 - 1;
226 while (before2 <= _morton_cnt - 1 && !split_in[before2 OFFSET])
// Forward scan for the next set split flag.
231 while (next1 <= _morton_cnt - 1 && !split_in[next1])
234 int delt_0 = DELTA(i, next1);
235 int delt_m = DELTA(i, before1);
236 int delt_mm = DELTA(before1, before2);
// Same delta criterion as the accessor-based kernel.
238 if (!(delt_0 < delt_m && delt_mm < delt_m) && split_in[i]) {
// Fills buf_reduc_index_map (sized len + 2) from buf_src: the first _len
// entries are copied through; the two trailing slots (_len and _len + 1) get
// sentinel values whose assignments are missing from this extraction.
// NOTE(review): extraction fragment — fused line numbers, missing parameter
// lines (246 -> 250) and the bodies of the two trailing-slot branches.
246void update_morton_buf(
250 sycl::buffer<u32> &buf_src,
251 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map) {
// Range covers the copied entries plus the two sentinel slots.
253 sycl::range<1> range_morton_count{len + 2};
255 queue.submit([&](sycl::handler &cgh) {
259 sycl::accessor src{buf_src, cgh, sycl::read_only};
260 sycl::accessor dest{*buf_reduc_index_map, cgh, sycl::write_only, sycl::no_init};
262 cgh.parallel_for(range_morton_count, [=](sycl::item<1> item) {
263 if (item.get_linear_id() < _len) {
// Pass-through copy for the first _len entries.
264 dest[item] = src[item];
265 }
else if (item.get_linear_id() == _len) {
267 }
else if (item.get_linear_id() == _len + 1) {
// Raw-pointer (__restrict) overload of update_morton_buf — same copy-plus-
// two-sentinels shape as the accessor variant above; the copy body and the
// sentinel assignments are missing from this extraction.
// NOTE(review): extraction fragment — fused line numbers, missing interior
// lines (274 -> 286, 286 -> 289).
274void update_morton_buf(
// val_ins is captured as the sentinel value — confirm its use in the
// missing branch bodies.
286 [_len = len, val = val_ins](
u32 i,
const u32 *__restrict src,
u32 *__restrict dest) {
289 }
else if (i == _len) {
291 }
else if (i == _len + 1) {
// Builds the reduction index map from the final split table: allocates
// buf_reduc_index_map (leaf count + 2), fills it via update_morton_buf, and
// contains a compile-time-disabled (if constexpr (false)) host-side
// cross-check that rebuilds the map on the CPU and compares element-wise.
// NOTE(review): extraction fragment — fused line numbers and missing interior
// lines (303 -> 307, 312 -> 317, 347 -> 352); the function name/return line
// is not visible here.
297template<
class split_
int>
301 u32 &morton_leaf_count,
302 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table,
303 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map) {
307 morton_leaf_count = len;
// +2 for the two sentinel slots appended by update_morton_buf.
309 buf_reduc_index_map = std::make_unique<sycl::buffer<u32>>(morton_leaf_count + 2);
312 update_morton_buf(queue, len, morton_count, *buf, buf_reduc_index_map);
// Debug-only validation path, disabled at compile time.
317 if constexpr (
false) {
318 std::vector<u32> reduc_index_map;
323 sycl::host_accessor acc{*buf_split_table, sycl::read_only};
// Host-side rebuild: collect the index of every set split flag.
326 for (
unsigned int i = 0; i < morton_count; i++) {
328 reduc_index_map.push_back(i);
// Trailing sentinels mirroring update_morton_buf.
332 reduc_index_map.push_back(morton_count);
334 reduc_index_map.push_back(0);
338 if (leafs != morton_leaf_count) {
342 sycl::host_accessor dest{*buf_reduc_index_map, sycl::read_only};
// Element-wise comparison of device result vs. host rebuild.
344 for (
unsigned int i = 0; i < morton_leaf_count + 2; i++) {
345 if (dest[i] != reduc_index_map[i]) {
347 "difference i = {}, {} != {}", i, dest[i], reduc_index_map[i]));
// Replace the device buffer with the host-rebuilt map (debug path only).
352 buf_reduc_index_map = std::make_unique<sycl::buffer<u32>>(
353 shamalgs::memory::vector_to_buf(queue, reduc_index_map));
// DeviceScheduler-based overload of make_indexmap: derives the leaf count
// from the split buffer's size and returns the index map together with that
// count (per the file's docs, the return is the reduction-result tuple type).
// NOTE(review): extraction fragment — fused line numbers and missing interior
// lines (359 -> 365, 365 -> 370); the return-type line is not visible.
357template<
class split_
int>
359 const sham::DeviceScheduler_ptr &dev_sched,
// Leaf count is taken from the buffer size rather than passed in.
365 u32 morton_leaf_count = buf.get_size();
370 dev_sched->get_queue(), morton_leaf_count, morton_count, buf, buf_reduc_index_map);
372 return {std::move(buf_reduc_index_map), morton_leaf_count};
// Full reduction pipeline (buffer variant): generate the initial split table,
// run reduction_level iterations ping-ponging between two split-table
// buffers, pick the buffer holding the final result by the parity of
// reduction_level, then build the index map.
// NOTE(review): extraction fragment — fused line numbers and missing interior
// lines (376 -> 380, 392 -> 395, 399 -> 403); loop-interior control flow
// (how the two ping-pong calls alternate per iteration) is not fully visible.
375template<
class u_morton,
class kername_split,
class kername_reduc_it>
376void reduction_alg_impl(
380 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
383 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
384 u32 &morton_leaf_count) {
// Two split tables for ping-pong iteration.
386 auto buf_split_table1 = std::make_unique<sycl::buffer<u32>>(morton_count);
387 auto buf_split_table2 = std::make_unique<sycl::buffer<u32>>(morton_count);
389 sycl_generate_split_table<u_morton, kername_split>(
390 queue, morton_count, buf_morton, buf_split_table1);
392 for (
unsigned int iter = 1; iter <= reduction_level; iter++) {
395 sycl_reduction_iteration<u_morton, kername_reduc_it>(
396 queue, morton_count, buf_morton, buf_split_table2, buf_split_table1);
398 sycl_reduction_iteration<u_morton, kername_reduc_it>(
399 queue, morton_count, buf_morton, buf_split_table1, buf_split_table2);
// Parity of reduction_level decides which buffer holds the final table.
403 std::unique_ptr<sycl::buffer<u32>> buf_split_table;
404 if ((reduction_level) % 2 == 0) {
405 buf_split_table = std::move(buf_split_table1);
407 buf_split_table = std::move(buf_split_table2);
410 make_indexmap(queue, morton_count, morton_leaf_count, buf_split_table, buf_reduc_index_map);
// DeviceScheduler-based overload of the reduction pipeline; same structure as
// the buffer variant (generate split table, ping-pong iterations, parity
// selection, make_indexmap) but returns the result instead of using
// out-parameters.
// NOTE(review): extraction fragment — fused line numbers and missing interior
// lines (418 -> 423, 430 -> 436, 449 -> 455); split-table allocation and the
// call sites' trailing arguments are not visible.
413template<
class u_morton>
415 const sham::DeviceScheduler_ptr &dev_sched,
418 u32 reduction_level) {
423 sycl_generate_split_table<u_morton>(
424 dev_sched->get_queue(), morton_count, buf_morton, buf_split_table1);
426 for (
unsigned int iter = 1; iter <= reduction_level; iter++) {
429 sycl_reduction_iteration<u_morton>(
430 dev_sched->get_queue(),
436 sycl_reduction_iteration<u_morton>(
437 dev_sched->get_queue(),
// Same parity rule as the buffer variant, wrapped in a selector lambda.
445 auto get_correct_buf = [&]() {
446 if ((reduction_level) % 2 == 0) {
447 return std::move(buf_split_table1);
449 return std::move(buf_split_table2);
455 return make_indexmap(dev_sched, morton_count, buf_split_table);
// Explicit u32 specialization of reduction_alg: forwards to
// reduction_alg_impl with the morton32 kernel name tags.
// NOTE(review): extraction fragment — the "template<>" line, remaining
// parameters, and the reduction_alg_impl<...> opening line are missing
// (459 -> 463, 467 -> 470).
459void reduction_alg<u32>(
463 std::unique_ptr<sycl::buffer<u32>> &buf_morton,
466 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
467 u32 &morton_leaf_count) {
470 Kernel_generate_split_table_morton32,
471 Kernel_iterate_reduction_morton32>(
472 queue, morton_count, buf_morton, reduction_level, buf_reduc_index_map, morton_leaf_count);
// Explicit u64 specialization of reduction_alg: forwards to
// reduction_alg_impl with the morton64 kernel name tags.
// NOTE(review): extraction fragment — same missing lines as the u32
// specialization (476 -> 480, 484 -> 487).
476void reduction_alg<u64>(
480 std::unique_ptr<sycl::buffer<u64>> &buf_morton,
483 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
484 u32 &morton_leaf_count) {
487 Kernel_generate_split_table_morton64,
488 Kernel_iterate_reduction_morton64>(
489 queue, morton_count, buf_morton, reduction_level, buf_reduc_index_map, morton_leaf_count);
/// SYCL kernel name tags for the Morton-code remap kernels,
/// one per Morton code width (32-bit and 64-bit).
class Kernel_remap_morton_code_morton32;
class Kernel_remap_morton_code_morton64;
// u32 wrapper forwarding to the templated reduction_alg_impl.
// NOTE(review): extraction fragment — the function signature's opening line
// (original line 496) and intermediate parameter lines are missing.
497 const sham::DeviceScheduler_ptr &dev_sched,
500 u32 reduction_level) {
501 return reduction_alg_impl<u32>(dev_sched, morton_count, buf_morton, reduction_level);
// u64 wrapper forwarding to the templated reduction_alg_impl.
// NOTE(review): extraction fragment — the function signature's opening line
// (original line 505) and intermediate parameter lines are missing.
506 const sham::DeviceScheduler_ptr &dev_sched,
509 u32 reduction_level) {
510 return reduction_alg_impl<u64>(dev_sched, morton_count, buf_morton, reduction_level);
// Gathers the reduced (leaf) Morton codes: for each of the morton_leaf_count
// leaves, writes buf_leaf_morton[i] = buf_morton[reduc_index_map[i]].
// Uses the legacy get_access<mode> accessor API rather than the CTAD
// sycl::accessor{...} style used by the kernels earlier in this file.
// NOTE(review): extraction fragment — fused line numbers and missing interior
// lines (514 -> 517, 526 -> 528: the "auto m_remaped" declaration line is
// truncated); closing braces not visible.
513template<
class u_morton,
class kername>
514void __sycl_morton_remap_reduction(
517 u32 morton_leaf_count,
518 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
519 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
521 std::unique_ptr<sycl::buffer<u_morton>> &buf_leaf_morton) {
// One work-item per leaf.
522 sycl::range<1> range_remap_morton{morton_leaf_count};
524 queue.submit([&](sycl::handler &cgh) {
525 auto id_remaped = buf_reduc_index_map->get_access<sycl::access::mode::read>(cgh);
526 auto m = buf_morton->template get_access<sycl::access::mode::read>(cgh);
528 = buf_leaf_morton->template get_access<sycl::access::mode::discard_write>(cgh);
530 cgh.parallel_for<kername>(range_remap_morton, [=](sycl::item<1> item) {
531 int i = item.get_id(0);
// Indirect gather through the reduction index map.
533 m_remaped[i] = m[id_remaped[i]];
// Explicit u32 specialization: forwards to __sycl_morton_remap_reduction with
// the morton32 remap kernel tag.
// NOTE(review): extraction fragment — the "template<>" line and the queue
// parameter line are missing (539 -> 542).
539void sycl_morton_remap_reduction<u32>(
542 u32 morton_leaf_count,
543 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
544 std::unique_ptr<sycl::buffer<u32>> &buf_morton,
546 std::unique_ptr<sycl::buffer<u32>> &buf_leaf_morton) {
547 __sycl_morton_remap_reduction<u32, Kernel_remap_morton_code_morton32>(
548 queue, morton_leaf_count, buf_reduc_index_map, buf_morton, buf_leaf_morton);
// Explicit u64 specialization: forwards to __sycl_morton_remap_reduction with
// the morton64 remap kernel tag.
// NOTE(review): extraction fragment — the "template<>" line and the queue
// parameter line are missing (552 -> 555).
552void sycl_morton_remap_reduction<u64>(
555 u32 morton_leaf_count,
556 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
557 std::unique_ptr<sycl::buffer<u64>> &buf_morton,
559 std::unique_ptr<sycl::buffer<u64>> &buf_leaf_morton) {
560 __sycl_morton_remap_reduction<u64, Kernel_remap_morton_code_morton64>(
561 queue, morton_leaf_count, buf_reduc_index_map, buf_morton, buf_leaf_morton);
// Raw-pointer (__restrict) variant of the remap — per the file's docs this is
// the sham::DeviceBuffer overload; performs the same indirect gather
// m_remaped[i] = m[id_remaped[i]].
// NOTE(review): extraction fragment — the function signature, remaining
// parameters, and the kernel-dispatch scaffolding are missing
// (564 -> 568 -> 580).
564template<
class u_morton>
568 u32 morton_leaf_count,
580 const u32 *__restrict id_remaped,
581 const u_morton *__restrict m,
582 u_morton *__restrict m_remaped) {
// Indirect gather through the reduction index map.
583 m_remaped[i] = m[id_remaped[i]];
// Explicit template instantiations of the DeviceBuffer remap overload for
// u32 and u64 Morton codes.
// NOTE(review): extraction fragment — parameter lists are truncated
// (587 -> 590, 596 -> 599) and the closing ");" lines are not visible.
587template void sycl_morton_remap_reduction<u32>(
590 u32 morton_leaf_count,
596template void sycl_morton_remap_reduction<u64>(
599 u32 morton_leaf_count,
u32 — alias for std::uint32_t (32-bit unsigned integer)
u64 — alias for std::uint64_t (64-bit unsigned integer)
i32 — alias for std::int32_t (32-bit signed integer)
A buffer allocated in USM (Unified Shared Memory)
A SYCL queue associated with a device and a context.
This header file contains utility functions related to exception handling in the code.
i32 karras_delta(i32 x, i32 y, u32 morton_length, Acc m) noexcept
delta operator defined in Karras 2012
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
std::tuple< std::optional< sycl::buffer< u32 > >, u32 > stream_compact(sycl::queue &q, sycl::buffer< u32 > &buf_flags, u32 len)
Stream compaction algorithm.
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
void sycl_morton_remap_reduction(sham::DeviceQueue &queue, u32 morton_leaf_count, sham::DeviceBuffer< u32 > &buf_reduc_index_map, sham::DeviceBuffer< u_morton > &buf_morton, sham::DeviceBuffer< u_morton > &buf_leaf_morton)
Remaps a Morton tree on device using a reduction index map.
main include file for memory algorithms
Return type of reduction algorithms.
A class that references multiple buffers or similar objects.