26#if defined(__has_include)
27 #if __has_include(<AdaptiveCpp/algorithms/numeric.hpp>)
28 #include <AdaptiveCpp/algorithms/numeric.hpp>
29 #define ACPP_ALG_AVAILABLE
37 void scan_exclusive_sum_in_place_std_scan_single_task_acpp(
42 sycl::queue &q_s = q.q;
48 auto e = q.submit(deps, [&](sycl::handler &cgh) {
49 cgh.single_task([=]() {
50 std::exclusive_scan(in_out_ptr, in_out_ptr + len, in_out_ptr, T{});
57 std::exclusive_scan(acc_src.begin(), acc_src.end(), acc_src.begin(), T{});
66 std::exclusive_scan(acc_src.begin(), acc_src.end(), acc_src.begin(), 0);
70#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
73 shamalgs::numeric::details::exclusive_sum_atomic_decoupled_v5_usm_in_place<T, 512>(
78#ifdef ACPP_ALG_AVAILABLE
83 acpp::algorithms::util::allocation_cache cache{
84 acpp::algorithms::util::allocation_type::device};
85 acpp::algorithms::util::allocation_group scratch{&cache, q.get_device()};
91 T *temp_ptr = temp.get_write_access(deps);
93 sycl::event e = adaptivecpp::algorithms::exclusive_scan(
94 q, scratch, in_out_ptr, in_out_ptr + len, temp_ptr, T{}, deps.get_events());
95 deps.set_consumed(
true);
98 temp.complete_event_state(e);
107 enum class EXSCAN_IN_PLACE_IMPL :
u32 {
110 STD_SCAN_SINGLE_TASK_ACPP,
112#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
113 DECOUPLED_LOOKBACK_512,
115#ifdef ACPP_ALG_AVAILABLE
120 EXSCAN_IN_PLACE_IMPL get_default_scan_exclusive_sum_in_place_impl() {
123 return EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP;
125 return EXSCAN_IN_PLACE_IMPL::STD_SCAN;
128 #ifdef SYCL2020_FEATURE_GROUP_REDUCTION
129 return EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512;
131 return EXSCAN_IN_PLACE_IMPL::STD_SCAN;
136 EXSCAN_IN_PLACE_IMPL scan_exclusive_sum_in_place_impl
137 = get_default_scan_exclusive_sum_in_place_impl();
139 inline EXSCAN_IN_PLACE_IMPL scan_exclusive_sum_in_place_impl_from_params(
140 const std::string &impl) {
141 if (impl ==
"std_scan") {
142 return EXSCAN_IN_PLACE_IMPL::STD_SCAN;
144 }
else if (impl ==
"std_scan_single_task_acpp") {
145 return EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP;
147#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
148 }
else if (impl ==
"decoupled_lookback_512") {
149 return EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512;
151#ifdef ACPP_ALG_AVAILABLE
152 }
else if (impl ==
"acpp_alg") {
153 return EXSCAN_IN_PLACE_IMPL::ADAPTIVECPP_ALG;
158 "invalid implementation : {}, possible implementations : {}",
164 const EXSCAN_IN_PLACE_IMPL &impl) {
165 if (impl == EXSCAN_IN_PLACE_IMPL::STD_SCAN) {
166 return {
"std_scan",
""};
168 }
else if (impl == EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP) {
169 return {
"std_scan_single_task_acpp",
""};
171#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
172 }
else if (impl == EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512) {
173 return {
"decoupled_lookback_512",
""};
175#ifdef ACPP_ALG_AVAILABLE
176 }
else if (impl == EXSCAN_IN_PLACE_IMPL::ADAPTIVECPP_ALG) {
177 return {
"acpp_alg",
""};
182 shambase::format(
"unknown scan_exclusive_sum_in_place implementation : {}",
u32(impl)));
189 {
"std_scan_single_task_acpp",
""},
191#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
192 {
"decoupled_lookback_512",
""},
194#ifdef ACPP_ALG_AVAILABLE
201 return scan_exclusive_sum_in_place_impl_to_params(scan_exclusive_sum_in_place_impl);
205 const std::string &impl,
const std::string ¶m) {
207 "tree",
"setting scan_exclusive_sum_in_place implementation to impl :", impl);
208 scan_exclusive_sum_in_place_impl = scan_exclusive_sum_in_place_impl_from_params(impl);
220 "The buffer is smaller than the length of the scan\n"
221 "len > buf1.get_size(), len = {}, buf1.get_size() = {}",
226 switch (scan_exclusive_sum_in_place_impl) {
227 case EXSCAN_IN_PLACE_IMPL::STD_SCAN: scan_exclusive_sum_in_place_fallback(buf1, len);
break;
229 case EXSCAN_IN_PLACE_IMPL::STD_SCAN_SINGLE_TASK_ACPP:
230 scan_exclusive_sum_in_place_std_scan_single_task_acpp(buf1, len);
233#ifdef SYCL2020_FEATURE_GROUP_REDUCTION
234 case EXSCAN_IN_PLACE_IMPL::DECOUPLED_LOOKBACK_512:
235 scan_exclusive_sum_in_place_decoupled_lookback_512(buf1, len);
238#ifdef ACPP_ALG_AVAILABLE
239 case EXSCAN_IN_PLACE_IMPL::ADAPTIVECPP_ALG:
240 scan_exclusive_sum_in_place_adaptivecpp(buf1, len);
245 shambase::format(
"unimplemented case : {}",
u32(scan_exclusive_sum_in_place_impl)));
std::uint32_t u32
32 bit unsigned integer
A buffer allocated in USM (Unified Shared Memory)
void complete_event_state(sycl::event e) const
Complete the event state of the buffer.
void copy_from_stdvec(const std::vector< T > &vec)
Copy the content of a std::vector into the buffer.
T * get_write_access(sham::EventList &depends_list, SourceLocation src_loc=SourceLocation{})
Get a read-write pointer to the buffer's data.
std::shared_ptr< DeviceScheduler > & get_dev_scheduler_ptr()
Gets the Device scheduler pointer corresponding to the held allocation.
void copy_from(const DeviceBuffer< T, new_target > &other, size_t copy_size)
Copies the content of another buffer to this one.
size_t get_size() const
Gets the number of elements in the buffer.
std::vector< T > copy_to_stdvec_idx_range(size_t begin, size_t end) const
Copies a specified range of elements from the buffer to a std::vector.
const T * get_read_access(sham::EventList &depends_list, SourceLocation src_loc=SourceLocation{}) const
Get a read-only pointer to the buffer's data.
Class to manage a list of SYCL events.
This header file contains utility functions related to exception handling in the code.
std::vector< shamalgs::impl_param > get_default_impl_list_scan_exclusive_sum_in_place()
Get list of available scan_exclusive_sum_in_place implementations.
shamalgs::impl_param get_current_impl_scan_exclusive_sum_in_place()
Get the current implementation for scan_exclusive_sum_in_place.
void set_impl_scan_exclusive_sum_in_place(const std::string &impl, const std::string ¶m="")
Set the implementation for scan_exclusive_sum_in_place.
namespace for primitive algorithm (e.g. sort, scan, reductions, ...)
void scan_exclusive_sum_in_place(sham::DeviceBuffer< T > &buf1, u32 len)
Compute exclusive prefix sum in-place on a device buffer.
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
In-place exclusive scan (prefix sum) algorithm for device buffers.