Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
scan_exclusive_sum_in_place.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
24#include <numeric>
25
26#if defined(__has_include)
27 #if __has_include(<AdaptiveCpp/algorithms/numeric.hpp>)
28 #include <AdaptiveCpp/algorithms/numeric.hpp>
29 #define ACPP_ALG_AVAILABLE
30 #endif
31#endif
32
33namespace {
34
35#ifdef __ACPP__
36 template<class T>
37 void scan_exclusive_sum_in_place_std_scan_single_task_acpp(
38 sham::DeviceBuffer<T> &buf1, u32 len) {
39
40 auto &q = buf1.get_dev_scheduler_ptr()->get_queue();
41
42 sycl::queue &q_s = q.q;
43
44 if (q_s.is_host()) {
45 sham::EventList deps{};
46 T *in_out_ptr = buf1.get_write_access(deps);
47
48 auto e = q.submit(deps, [&](sycl::handler &cgh) {
49 cgh.single_task([=]() {
50 std::exclusive_scan(in_out_ptr, in_out_ptr + len, in_out_ptr, T{});
51 });
52 });
53
55 } else {
56 auto acc_src = buf1.copy_to_stdvec_idx_range(0, len);
57 std::exclusive_scan(acc_src.begin(), acc_src.end(), acc_src.begin(), T{});
58 buf1.copy_from_stdvec(acc_src, len);
59 }
60 }
61#endif
62
63 template<class T>
64 void scan_exclusive_sum_in_place_fallback(sham::DeviceBuffer<T> &buf1, u32 len) {
65 auto acc_src = buf1.copy_to_stdvec_idx_range(0, len);
66 std::exclusive_scan(acc_src.begin(), acc_src.end(), acc_src.begin(), 0);
67 buf1.copy_from_stdvec(acc_src, len);
68 }
69
70#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
71 template<class T>
72 void scan_exclusive_sum_in_place_decoupled_lookback_512(sham::DeviceBuffer<T> &buf1, u32 len) {
73 shamalgs::numeric::details::exclusive_sum_atomic_decoupled_v5_usm_in_place<T, 512>(
74 buf1, len);
75 }
76#endif
77
78#ifdef ACPP_ALG_AVAILABLE
79 template<class T>
80 void scan_exclusive_sum_in_place_adaptivecpp(sham::DeviceBuffer<T> &buf1, u32 len) {
81 auto &q = buf1.get_dev_scheduler_ptr()->get_queue().q;
82
83 acpp::algorithms::util::allocation_cache cache{
84 acpp::algorithms::util::allocation_type::device};
85 acpp::algorithms::util::allocation_group scratch{&cache, q.get_device()};
86
88
89 sham::EventList deps{};
90 const T *in_out_ptr = buf1.get_read_access(deps);
91 T *temp_ptr = temp.get_write_access(deps);
92
93 sycl::event e = adaptivecpp::algorithms::exclusive_scan(
94 q, scratch, in_out_ptr, in_out_ptr + len, temp_ptr, T{}, deps.get_events());
95 deps.set_consumed(true);
96
98 temp.complete_event_state(e);
99
100 buf1.copy_from(temp, len);
101 }
102#endif
103} // namespace
104
105namespace shamalgs::primitives {
106
107 enum class EXSCAN_IN_PLACE_IMPL : u32 {
108 STD_SCAN,
109#ifdef __ACPP__
110 STD_SCAN_SINGLE_TASK_ACPP,
111#endif
112#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
113 DECOUPLED_LOOKBACK_512,
114#endif
115#ifdef ACPP_ALG_AVAILABLE
116 ADAPTIVECPP_ALG,
117#endif
118 };
119
120 EXSCAN_IN_PLACE_IMPL get_default_scan_exclusive_sum_in_place_impl() {
121#ifdef __MACH__ // decoupled lookback perf on mac os is awful
122 #ifdef __ACPP__ // for acpp we gain using enqueue custom operation instead of copying
123 return EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP;
124 #else
125 return EXSCAN_IN_PLACE_IMPL::STD_SCAN;
126 #endif
127#else
128 #ifdef SYCL2020_FEATURE_GROUP_REDUCTION
129 return EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512;
130 #else
131 return EXSCAN_IN_PLACE_IMPL::STD_SCAN;
132 #endif
133#endif
134 }
135
136 EXSCAN_IN_PLACE_IMPL scan_exclusive_sum_in_place_impl
137 = get_default_scan_exclusive_sum_in_place_impl();
138
139 inline EXSCAN_IN_PLACE_IMPL scan_exclusive_sum_in_place_impl_from_params(
140 const std::string &impl) {
141 if (impl == "std_scan") {
142 return EXSCAN_IN_PLACE_IMPL::STD_SCAN;
143#ifdef __ACPP__
144 } else if (impl == "std_scan_single_task_acpp") {
145 return EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP;
146#endif
147#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
148 } else if (impl == "decoupled_lookback_512") {
149 return EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512;
150#endif
151#ifdef ACPP_ALG_AVAILABLE
152 } else if (impl == "acpp_alg") {
153 return EXSCAN_IN_PLACE_IMPL::ADAPTIVECPP_ALG;
154#endif
155 }
156
158 "invalid implementation : {}, possible implementations : {}",
159 impl,
161 }
162
163 inline shamalgs::impl_param scan_exclusive_sum_in_place_impl_to_params(
164 const EXSCAN_IN_PLACE_IMPL &impl) {
165 if (impl == EXSCAN_IN_PLACE_IMPL::STD_SCAN) {
166 return {"std_scan", ""};
167#ifdef __ACPP__
168 } else if (impl == EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP) {
169 return {"std_scan_single_task_acpp", ""};
170#endif
171#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
172 } else if (impl == EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512) {
173 return {"decoupled_lookback_512", ""};
174#endif
175#ifdef ACPP_ALG_AVAILABLE
176 } else if (impl == EXSCAN_IN_PLACE_IMPL::ADAPTIVECPP_ALG) {
177 return {"acpp_alg", ""};
178#endif
179 }
180
182 shambase::format("unknown scan_exclusive_sum_in_place implementation : {}", u32(impl)));
183 }
184
186 return {
187 {"std_scan", ""},
188#ifdef __ACPP__
189 {"std_scan_single_task_acpp", ""},
190#endif
191#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
192 {"decoupled_lookback_512", ""},
193#endif
194#ifdef ACPP_ALG_AVAILABLE
195 {"acpp_alg", ""},
196#endif
197 };
198 }
199
201 return scan_exclusive_sum_in_place_impl_to_params(scan_exclusive_sum_in_place_impl);
202 }
203
205 const std::string &impl, const std::string &param) {
206 shamlog_info_ln(
207 "tree", "setting scan_exclusive_sum_in_place implementation to impl :", impl);
208 scan_exclusive_sum_in_place_impl = scan_exclusive_sum_in_place_impl_from_params(impl);
209 }
210
211 template<class T>
213
214 if (len == 0) {
215 return;
216 }
217
218 if (len > buf1.get_size()) {
220 "The buffer is smaller than the length of the scan\n"
221 "len > buf1.get_size(), len = {}, buf1.get_size() = {}",
222 len,
223 buf1.get_size()));
224 }
225
226 switch (scan_exclusive_sum_in_place_impl) {
227 case EXSCAN_IN_PLACE_IMPL::STD_SCAN: scan_exclusive_sum_in_place_fallback(buf1, len); break;
228#ifdef __ACPP__
229 case EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP:
230 scan_exclusive_sum_in_place_std_scan_single_task_acpp(buf1, len);
231 break;
232#endif
233#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
234 case EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512:
235 scan_exclusive_sum_in_place_decoupled_lookback_512(buf1, len);
236 break;
237#endif
238#ifdef ACPP_ALG_AVAILABLE
239 case EXSCAN_IN_PLACE_IMPL::ADAPTIVECPP_ALG:
240 scan_exclusive_sum_in_place_adaptivecpp(buf1, len);
241 break;
242#endif
243 default:
245 shambase::format("unimplemented case : {}", u32(scan_exclusive_sum_in_place_impl)));
246 }
247 }
248
249 template void scan_exclusive_sum_in_place<u32>(sham::DeviceBuffer<u32> &buf1, u32 len);
250
251} // namespace shamalgs::primitives
std::uint32_t u32
32 bit unsigned integer
A buffer allocated in USM (Unified Shared Memory)
void complete_event_state(sycl::event e) const
Complete the event state of the buffer.
void copy_from_stdvec(const std::vector< T > &vec)
Copy the content of a std::vector into the buffer.
T * get_write_access(sham::EventList &depends_list, SourceLocation src_loc=SourceLocation{})
Get a read-write pointer to the buffer's data.
std::shared_ptr< DeviceScheduler > & get_dev_scheduler_ptr()
Gets the Device scheduler pointer corresponding to the held allocation.
void copy_from(const DeviceBuffer< T, new_target > &other, size_t copy_size)
Copies the content of another buffer to this one.
size_t get_size() const
Gets the number of elements in the buffer.
std::vector< T > copy_to_stdvec_idx_range(size_t begin, size_t end) const
Copies a specified range of elements from the buffer to a std::vector.
const T * get_read_access(sham::EventList &depends_list, SourceLocation src_loc=SourceLocation{}) const
Get a read-only pointer to the buffer's data.
Class to manage a list of SYCL events.
Definition EventList.hpp:31
This header file contains utility functions related to exception handling in the code.
std::vector< shamalgs::impl_param > get_default_impl_list_scan_exclusive_sum_in_place()
Get list of available scan_exclusive_sum_in_place implementations.
shamalgs::impl_param get_current_impl_scan_exclusive_sum_in_place()
Get the current implementation for scan_exclusive_sum_in_place.
void set_impl_scan_exclusive_sum_in_place(const std::string &impl, const std::string &param="")
Set the implementation for scan_exclusive_sum_in_place.
Namespace for primitive algorithms (e.g. sort, scan, reductions, ...)
void scan_exclusive_sum_in_place(sham::DeviceBuffer< T > &buf1, u32 len)
Compute exclusive prefix sum in-place on a device buffer.
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
In-place exclusive scan (prefix sum) algorithm for device buffers.