Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
reduction_alg.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
17#include "shambase/string.hpp"
18#include "shamalgs/memory.hpp"
19#include "shamalgs/numeric.hpp"
23#include "shambackends/math.hpp"
24#include "shambackends/sycl.hpp"
26#include <algorithm>
27#include <memory>
28#include <vector>
29
30class Kernel_generate_split_table_morton32;
31class Kernel_generate_split_table_morton64;
32
33template<class u_morton, class kername, class split_int>
34void sycl_generate_split_table(
35 sycl::queue &queue,
36 u32 morton_count,
37 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
38 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table) {
39
40 sycl::range<1> range_morton_count{morton_count};
41
42 queue.submit([&](sycl::handler &cgh) {
43 sycl::accessor m{*buf_morton, cgh, sycl::read_only};
44 sycl::accessor split_out{*buf_split_table, cgh, sycl::write_only, sycl::no_init};
45
46 cgh.parallel_for<kername>(range_morton_count, [=](sycl::item<1> item) {
47 u32 i = (u32) item.get_id(0);
48
49 if (i > 0) {
50 if (m[i - 1] != m[i]) {
51 split_out[i] = 1;
52 } else {
53 split_out[i] = 0;
54 }
55 } else {
56 split_out[i] = 1;
57 }
58 });
59 });
60}
61
62template<class u_morton, class split_int>
63void sycl_generate_split_table(
64 sham::DeviceQueue &queue,
65 u32 morton_count,
67 sham::DeviceBuffer<split_int> &buf_split_table) {
68
70 queue,
71 sham::MultiRef{buf_morton},
72 sham::MultiRef{buf_split_table},
73 morton_count,
74 [](u32 i, const u_morton *__restrict m, split_int *__restrict split_out) {
75 if (i > 0) {
76 if (m[i - 1] != m[i]) {
77 split_out[i] = 1;
78 } else {
79 split_out[i] = 0;
80 }
81 } else {
82 split_out[i] = 1;
83 }
84 });
85}
86
87/*
88Godbolt snippet for testing
89
90#include <array>
91#include <cstdio>
92
93template<int Na, int Nb>
94void print(std::array<bool, Na> & foo, std::array<int, Nb> cursors){
95 for (int i = 0; i < Na; ++i)
96 printf("%d", foo[i]);
97 printf("\n");
98
99 for(int i : cursors){
100 for(int j = 0 ; j < i ; j++) printf(" ");
101 printf("|\n");
102 }
103}
104
105
106int main(){
107 auto a = std::array<bool,11>{1,0,1,1,1,0,0,1,0,1,0};
108
109 int i = 7;
110 int before1 = i - 1;
111 while (!a[before1])
112 before1--;
113
114 int before2 = before1 - 1;
115 while (!a[before2])
116 before2--;
117
118
119 int next1 = i +1;
120 while (!a[next1])
121 next1++;
122
123 print<11,4>(a, {next1,i,before1, before2});
124}
125
126
127*/
128
129class Kernel_iterate_reduction_morton32;
130class Kernel_iterate_reduction_morton64;
131
132// #define OLD_BEHAVIOR
133#define NEW_BEHAVIOR
134
135#ifdef NEW_BEHAVIOR
136 #define OFFSET
137#endif
138
139#ifdef OLD_BEHAVIOR
140 #define OFFSET +1
141#endif
142
143template<class u_morton, class kername, class split_int>
144void sycl_reduction_iteration(
145 sycl::queue &queue,
146 u32 morton_count,
147 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
148 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table_in,
149 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table_out) {
150
151 sycl::range<1> range_morton_count{morton_count};
152
153 queue.submit([&](sycl::handler &cgh) {
154 u32 _morton_cnt = morton_count;
155
156 sycl::accessor m{*buf_morton, cgh, sycl::read_only};
157 sycl::accessor split_in{*buf_split_table_in, cgh, sycl::read_only};
158 sycl::accessor split_out{*buf_split_table_out, cgh, sycl::write_only, sycl::no_init};
159
160 cgh.parallel_for<kername>(range_morton_count, [=](sycl::item<1> item) {
161 int i = item.get_id(0);
162
163 auto DELTA = [=](i32 x, i32 y) {
164 return sham::karras_delta(x, y, _morton_cnt, m);
165 };
166
167 // find index of preceding i-1 non duplicate morton code
168 u32 before1 = i - 1;
169 while (before1 <= _morton_cnt - 1 && !split_in[before1 OFFSET])
170 before1--;
171
172 // find index of preceding i-2 non duplicate morton code
173 // safe bc delta(before1,before2) return -1 if any of the 2 are -1 because of order
174 u32 before2 = before1 - 1;
175 while (before2 <= _morton_cnt - 1 && !split_in[before2 OFFSET])
176 before2--;
177
178 // find index of next i+1 non duplicate morton code
179 u32 next1 = i + 1;
180 while (next1 <= _morton_cnt - 1 && !split_in[next1])
181 next1++;
182
183 int delt_0 = DELTA(i, next1);
184 int delt_m = DELTA(i, before1);
185 int delt_mm = DELTA(before1, before2);
186
187 if (!(delt_0 < delt_m && delt_mm < delt_m) && split_in[i]) {
188 split_out[i] = 1;
189 } else {
190 split_out[i] = 0;
191 }
192 });
193 });
194}
195
196template<class u_morton, class split_int>
197void sycl_reduction_iteration(
198 sham::DeviceQueue &queue,
199 u32 morton_count,
201 sham::DeviceBuffer<split_int> &buf_split_table_in,
202 sham::DeviceBuffer<split_int> &buf_split_table_out) {
203
205 queue,
206 sham::MultiRef{buf_morton, buf_split_table_in},
207 sham::MultiRef{buf_split_table_out},
208 morton_count,
209 [_morton_cnt = morton_count](
210 u32 i,
211 const u_morton *__restrict m,
212 const split_int *__restrict split_in,
213 split_int *__restrict split_out) {
214 auto DELTA = [=](i32 x, i32 y) {
215 return sham::karras_delta(x, y, _morton_cnt, m);
216 };
217
218 // find index of preceding i-1 non duplicate morton code
219 u32 before1 = i - 1;
220 while (before1 <= _morton_cnt - 1 && !split_in[before1 OFFSET])
221 before1--;
222
223 // find index of preceding i-2 non duplicate morton code
224 // safe bc delta(before1,before2) return -1 if any of the 2 are -1 because of order
225 u32 before2 = before1 - 1;
226 while (before2 <= _morton_cnt - 1 && !split_in[before2 OFFSET])
227 before2--;
228
229 // find index of next i+1 non duplicate morton code
230 u32 next1 = i + 1;
231 while (next1 <= _morton_cnt - 1 && !split_in[next1])
232 next1++;
233
234 int delt_0 = DELTA(i, next1);
235 int delt_m = DELTA(i, before1);
236 int delt_mm = DELTA(before1, before2);
237
238 if (!(delt_0 < delt_m && delt_mm < delt_m) && split_in[i]) {
239 split_out[i] = 1;
240 } else {
241 split_out[i] = 0;
242 }
243 });
244}
245
246void update_morton_buf(
247 sycl::queue &queue,
248 u32 len,
249 u32 val_ins,
250 sycl::buffer<u32> &buf_src,
251 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map) {
252
253 sycl::range<1> range_morton_count{len + 2};
254
255 queue.submit([&](sycl::handler &cgh) {
256 u32 _len = len;
257 u32 val = val_ins;
258
259 sycl::accessor src{buf_src, cgh, sycl::read_only};
260 sycl::accessor dest{*buf_reduc_index_map, cgh, sycl::write_only, sycl::no_init};
261
262 cgh.parallel_for(range_morton_count, [=](sycl::item<1> item) {
263 if (item.get_linear_id() < _len) {
264 dest[item] = src[item];
265 } else if (item.get_linear_id() == _len) {
266 dest[item] = val;
267 } else if (item.get_linear_id() == _len + 1) {
268 dest[item] = 0;
269 }
270 });
271 });
272}
273
274void update_morton_buf(
275 sham::DeviceQueue &queue,
276 u32 len,
277 u32 val_ins,
279 sham::DeviceBuffer<u32> &buf_reduc_index_map) {
280
282 queue,
283 sham::MultiRef{buf_src},
284 sham::MultiRef{buf_reduc_index_map},
285 len + 2,
286 [_len = len, val = val_ins](u32 i, const u32 *__restrict src, u32 *__restrict dest) {
287 if (i < _len) {
288 dest[i] = src[i];
289 } else if (i == _len) {
290 dest[i] = val;
291 } else if (i == _len + 1) {
292 dest[i] = 0;
293 }
294 });
295}
296
297template<class split_int>
298void make_indexmap(
299 sycl::queue &queue,
300 u32 morton_count,
301 u32 &morton_leaf_count,
302 std::unique_ptr<sycl::buffer<split_int>> &buf_split_table,
303 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map) {
304
305 auto [buf, len] = shamalgs::numeric::stream_compact(queue, *buf_split_table, morton_count);
306
307 morton_leaf_count = len;
308
309 buf_reduc_index_map = std::make_unique<sycl::buffer<u32>>(morton_leaf_count + 2);
310
311 if (buf) {
312 update_morton_buf(queue, len, morton_count, *buf, buf_reduc_index_map);
313 } else {
314 throw shambase::make_except_with_loc<std::runtime_error>("this result shouldn't be null");
315 }
316
317 if constexpr (false) {
318 std::vector<u32> reduc_index_map;
319
320 u32 leafs = 0;
321
322 {
323 sycl::host_accessor acc{*buf_split_table, sycl::read_only};
324
325 // reduc_index_map.reserve(split_count);
326 for (unsigned int i = 0; i < morton_count; i++) {
327 if (acc[i]) {
328 reduc_index_map.push_back(i);
329 leafs++;
330 }
331 }
332 reduc_index_map.push_back(morton_count);
333 // for one cell mode the last range is inverted to avoid iteration
334 reduc_index_map.push_back(0);
335 }
336
337 {
338 if (leafs != morton_leaf_count) {
340 }
341
342 sycl::host_accessor dest{*buf_reduc_index_map, sycl::read_only};
343
344 for (unsigned int i = 0; i < morton_leaf_count + 2; i++) {
345 if (dest[i] != reduc_index_map[i]) {
347 "difference i = {}, {} != {}", i, dest[i], reduc_index_map[i]));
348 }
349 }
350 }
351
352 buf_reduc_index_map = std::make_unique<sycl::buffer<u32>>(
353 shamalgs::memory::vector_to_buf(queue, reduc_index_map));
354 }
355}
356
357template<class split_int>
358reduc_ret_t<split_int> make_indexmap(
359 const sham::DeviceScheduler_ptr &dev_sched,
360 u32 morton_count,
361 sham::DeviceBuffer<split_int> &buf_split_table) {
362
363 auto buf = shamalgs::numeric::stream_compact(dev_sched, buf_split_table, morton_count);
364
365 u32 morton_leaf_count = buf.get_size();
366
367 sham::DeviceBuffer<u32> buf_reduc_index_map(morton_leaf_count + 2, dev_sched);
368
369 update_morton_buf(
370 dev_sched->get_queue(), morton_leaf_count, morton_count, buf, buf_reduc_index_map);
371
372 return {std::move(buf_reduc_index_map), morton_leaf_count};
373}
374
375template<class u_morton, class kername_split, class kername_reduc_it>
376void reduction_alg_impl(
377 // in
378 sycl::queue &queue,
379 u32 morton_count,
380 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
381 u32 reduction_level,
382 // out
383 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
384 u32 &morton_leaf_count) {
385
386 auto buf_split_table1 = std::make_unique<sycl::buffer<u32>>(morton_count);
387 auto buf_split_table2 = std::make_unique<sycl::buffer<u32>>(morton_count);
388
389 sycl_generate_split_table<u_morton, kername_split>(
390 queue, morton_count, buf_morton, buf_split_table1);
391
392 for (unsigned int iter = 1; iter <= reduction_level; iter++) {
393
394 if (iter % 2 == 0) {
395 sycl_reduction_iteration<u_morton, kername_reduc_it>(
396 queue, morton_count, buf_morton, buf_split_table2, buf_split_table1);
397 } else {
398 sycl_reduction_iteration<u_morton, kername_reduc_it>(
399 queue, morton_count, buf_morton, buf_split_table1, buf_split_table2);
400 }
401 }
402
403 std::unique_ptr<sycl::buffer<u32>> buf_split_table;
404 if ((reduction_level) % 2 == 0) {
405 buf_split_table = std::move(buf_split_table1);
406 } else {
407 buf_split_table = std::move(buf_split_table2);
408 }
409
410 make_indexmap(queue, morton_count, morton_leaf_count, buf_split_table, buf_reduc_index_map);
411}
412
413template<class u_morton>
414reduc_ret_t<u32> reduction_alg_impl(
415 const sham::DeviceScheduler_ptr &dev_sched,
416 u32 morton_count,
418 u32 reduction_level) {
419
420 sham::DeviceBuffer<u32> buf_split_table1(morton_count, dev_sched);
421 sham::DeviceBuffer<u32> buf_split_table2(morton_count, dev_sched);
422
423 sycl_generate_split_table<u_morton>(
424 dev_sched->get_queue(), morton_count, buf_morton, buf_split_table1);
425
426 for (unsigned int iter = 1; iter <= reduction_level; iter++) {
427
428 if (iter % 2 == 0) {
429 sycl_reduction_iteration<u_morton>(
430 dev_sched->get_queue(),
431 morton_count,
432 buf_morton,
433 buf_split_table2,
434 buf_split_table1);
435 } else {
436 sycl_reduction_iteration<u_morton>(
437 dev_sched->get_queue(),
438 morton_count,
439 buf_morton,
440 buf_split_table1,
441 buf_split_table2);
442 }
443 }
444
445 auto get_correct_buf = [&]() {
446 if ((reduction_level) % 2 == 0) {
447 return std::move(buf_split_table1);
448 } else {
449 return std::move(buf_split_table2);
450 }
451 };
452
453 sham::DeviceBuffer<u32> buf_split_table = get_correct_buf();
454
455 return make_indexmap(dev_sched, morton_count, buf_split_table);
456}
457
458template<>
459void reduction_alg<u32>(
460 // in
461 sycl::queue &queue,
462 u32 morton_count,
463 std::unique_ptr<sycl::buffer<u32>> &buf_morton,
464 u32 reduction_level,
465 // out
466 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
467 u32 &morton_leaf_count) {
468 reduction_alg_impl<
469 u32,
470 Kernel_generate_split_table_morton32,
471 Kernel_iterate_reduction_morton32>(
472 queue, morton_count, buf_morton, reduction_level, buf_reduc_index_map, morton_leaf_count);
473}
474
475template<>
476void reduction_alg<u64>(
477 // in
478 sycl::queue &queue,
479 u32 morton_count,
480 std::unique_ptr<sycl::buffer<u64>> &buf_morton,
481 u32 reduction_level,
482 // out
483 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
484 u32 &morton_leaf_count) {
485 reduction_alg_impl<
486 u64,
487 Kernel_generate_split_table_morton64,
488 Kernel_iterate_reduction_morton64>(
489 queue, morton_count, buf_morton, reduction_level, buf_reduc_index_map, morton_leaf_count);
490}
491
492class Kernel_remap_morton_code_morton32;
493class Kernel_remap_morton_code_morton64;
494
495template<>
496reduc_ret_t<u32> reduction_alg<u32>(
497 const sham::DeviceScheduler_ptr &dev_sched,
498 u32 morton_count,
499 sham::DeviceBuffer<u32> &buf_morton,
500 u32 reduction_level) {
501 return reduction_alg_impl<u32>(dev_sched, morton_count, buf_morton, reduction_level);
502}
503
504template<>
505reduc_ret_t<u32> reduction_alg<u64>(
506 const sham::DeviceScheduler_ptr &dev_sched,
507 u32 morton_count,
508 sham::DeviceBuffer<u64> &buf_morton,
509 u32 reduction_level) {
510 return reduction_alg_impl<u64>(dev_sched, morton_count, buf_morton, reduction_level);
511}
512
513template<class u_morton, class kername>
514void __sycl_morton_remap_reduction(
515 // in
516 sycl::queue &queue,
517 u32 morton_leaf_count,
518 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
519 std::unique_ptr<sycl::buffer<u_morton>> &buf_morton,
520 // out
521 std::unique_ptr<sycl::buffer<u_morton>> &buf_leaf_morton) {
522 sycl::range<1> range_remap_morton{morton_leaf_count};
523
524 queue.submit([&](sycl::handler &cgh) {
525 auto id_remaped = buf_reduc_index_map->get_access<sycl::access::mode::read>(cgh);
526 auto m = buf_morton->template get_access<sycl::access::mode::read>(cgh);
527 auto m_remaped
528 = buf_leaf_morton->template get_access<sycl::access::mode::discard_write>(cgh);
529
530 cgh.parallel_for<kername>(range_remap_morton, [=](sycl::item<1> item) {
531 int i = item.get_id(0);
532
533 m_remaped[i] = m[id_remaped[i]];
534 });
535 });
536}
537
538template<>
539void sycl_morton_remap_reduction<u32>(
540 // in
541 sycl::queue &queue,
542 u32 morton_leaf_count,
543 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
544 std::unique_ptr<sycl::buffer<u32>> &buf_morton,
545 // out
546 std::unique_ptr<sycl::buffer<u32>> &buf_leaf_morton) {
547 __sycl_morton_remap_reduction<u32, Kernel_remap_morton_code_morton32>(
548 queue, morton_leaf_count, buf_reduc_index_map, buf_morton, buf_leaf_morton);
549}
550
551template<>
552void sycl_morton_remap_reduction<u64>(
553 // in
554 sycl::queue &queue,
555 u32 morton_leaf_count,
556 std::unique_ptr<sycl::buffer<u32>> &buf_reduc_index_map,
557 std::unique_ptr<sycl::buffer<u64>> &buf_morton,
558 // out
559 std::unique_ptr<sycl::buffer<u64>> &buf_leaf_morton) {
560 __sycl_morton_remap_reduction<u64, Kernel_remap_morton_code_morton64>(
561 queue, morton_leaf_count, buf_reduc_index_map, buf_morton, buf_leaf_morton);
562}
563
564template<class u_morton>
566 // in
567 sham::DeviceQueue &queue,
568 u32 morton_leaf_count,
569 sham::DeviceBuffer<u32> &buf_reduc_index_map,
571 // out
572 sham::DeviceBuffer<u_morton> &buf_leaf_morton) {
573
575 queue,
576 sham::MultiRef{buf_reduc_index_map, buf_morton},
577 sham::MultiRef{buf_leaf_morton},
578 morton_leaf_count,
579 [](u32 i,
580 const u32 *__restrict id_remaped,
581 const u_morton *__restrict m,
582 u_morton *__restrict m_remaped) {
583 m_remaped[i] = m[id_remaped[i]];
584 });
585}
586
587template void sycl_morton_remap_reduction<u32>(
588 // in
589 sham::DeviceQueue &queue,
590 u32 morton_leaf_count,
591 sham::DeviceBuffer<u32> &buf_reduc_index_map,
592 sham::DeviceBuffer<u32> &buf_morton,
593 // out
594 sham::DeviceBuffer<u32> &buf_leaf_morton);
595
596template void sycl_morton_remap_reduction<u64>(
597 // in
598 sham::DeviceQueue &queue,
599 u32 morton_leaf_count,
600 sham::DeviceBuffer<u32> &buf_reduc_index_map,
601 sham::DeviceBuffer<u64> &buf_morton,
602 // out
603 sham::DeviceBuffer<u64> &buf_leaf_morton);
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
std::int32_t i32
32 bit integer
A buffer allocated in USM (Unified Shared Memory)
A SYCL queue associated with a device and a context.
This header file contains utility functions related to exception handling in the code.
i32 karras_delta(i32 x, i32 y, u32 morton_length, Acc m) noexcept
delta operator defined in Karras 2012
Definition math.hpp:826
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
std::tuple< std::optional< sycl::buffer< u32 > >, u32 > stream_compact(sycl::queue &q, sycl::buffer< u32 > &buf_flags, u32 len)
Stream compaction algorithm.
Definition numeric.cpp:84
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
void sycl_morton_remap_reduction(sham::DeviceQueue &queue, u32 morton_leaf_count, sham::DeviceBuffer< u32 > &buf_reduc_index_map, sham::DeviceBuffer< u_morton > &buf_morton, sham::DeviceBuffer< u_morton > &buf_leaf_morton)
Remaps a Morton tree on device using a reduction index map.
main include file for memory algorithms
Return type of reduction algorithms.
A class that references multiple buffers or similar objects.