Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
exclusiveScanGPUGems39.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
18#include "shambase/integer.hpp"
19#include "shamalgs/memory.hpp"
20#include "shambackends/math.hpp"
21
22/*
23
24GPU GEMS chapter 39
25(https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda)
26
27ALG 1 :
28__global__ void scan_1(float *g_odata, float *g_idata, int n) {
29 extern __shared__ float temp[]; // allocated on invocation
30 int thid = threadIdx.x;
31 int pout = 0, pin = 1; // Load input into shared memory.
32 // This is exclusive scan, so shift right by one
33 // and set first element to 0
34 temp[pout * n + thid] = (thid > 0) ? g_idata[thid - 1] : 0;
35 __syncthreads();
36 for (int offset = 1; offset < n; offset *= 2) {
37 pout = 1 - pout; // swap double buffer indices
38 pin = 1 - pout;
39 if (thid >= offset)
40 temp[pout * n + thid] += temp[pin * n + thid - offset];
41 else
42 temp[pout * n + thid] = temp[pin * n + thid];
43 __syncthreads();
44 }
45 g_odata[thid] = temp[pout * n + thid]; // write output
46}
47
48*/
49
50namespace shamalgs::numeric::details {
51 template<class T>
53
54 template<class T>
55 sycl::buffer<T> exclusive_sum_gpugems39_1(sycl::queue &q, sycl::buffer<T> &buf1, u32 len) {
56
57 sycl::buffer<T> out1(len);
58 sycl::buffer<T> out2(len);
59
60 auto get_in_buf_ref = [&](u32 step) -> sycl::buffer<T> & {
61 if (step % 2 == 0) {
62 return out1;
63 } else {
64 return out2;
65 }
66 };
67
68 auto get_out_buf_ref = [&](u32 step) -> sycl::buffer<T> & {
69 if (step % 2 == 1) {
70 return out1;
71 } else {
72 return out2;
73 }
74 };
75
76 u32 step = 0;
77
78 q.submit([&](sycl::handler &cgh) {
79 sycl::accessor acc_in{buf1, cgh, sycl::read_only};
80 sycl::accessor acc_out{get_in_buf_ref(step), cgh, sycl::write_only, sycl::no_init};
81
82 cgh.parallel_for(sycl::range<1>{len}, [=](sycl::item<1> id) {
83 u32 thid = id.get_linear_id();
84 acc_out[id] = (thid > 0) ? acc_in[thid - 1] : 0;
85 });
86 });
87
88 for (int offset = 1; offset < len; offset *= 2) {
89
90 q.submit([&, offset](sycl::handler &cgh) {
91 sycl::accessor acc_in{get_in_buf_ref(step), cgh, sycl::read_only};
92 sycl::accessor acc_out{get_out_buf_ref(step), cgh, sycl::write_only};
93
94 cgh.parallel_for<KernelExclsum_1<T>>(sycl::range<1>{len}, [=](sycl::item<1> id) {
95 u32 thid = id.get_linear_id();
96
97 const auto in_val = acc_in[thid];
98
99 acc_out[thid] = (thid >= offset) ? in_val + acc_in[thid - offset] : in_val;
100 });
101 });
102
103 step++;
104 }
105
106 return std::move(get_in_buf_ref(step));
107 }
108
109 template<class T>
111
112 template<class T>
113 sycl::buffer<T> exclusive_sum_gpugems39_2(sycl::queue &q, sycl::buffer<T> &buf1, u32 len) {
114
115 u32 rounded_len = sham::roundup_pow2_clz(len);
116
117 sycl::buffer<T> out1(rounded_len);
118 sycl::buffer<T> out2(rounded_len);
119
120 auto get_in_buf_ref = [&](u32 step) -> sycl::buffer<T> & {
121 if (step % 2 == 0) {
122 return out1;
123 } else {
124 return out2;
125 }
126 };
127
128 auto get_out_buf_ref = [&](u32 step) -> sycl::buffer<T> & {
129 if (step % 2 == 1) {
130 return out1;
131 } else {
132 return out2;
133 }
134 };
135
136 u32 step = 0;
137
138 q.submit([&](sycl::handler &cgh) {
139 u32 correct_len = len;
140 sycl::accessor acc_in{buf1, cgh, sycl::read_only};
141 sycl::accessor acc_out{get_in_buf_ref(step), cgh, sycl::write_only, sycl::no_init};
142
143 cgh.parallel_for(sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
144 u32 thid = id.get_linear_id();
145 acc_out[id] = (thid > 0 && thid < correct_len) ? acc_in[thid - 1] : 0;
146 });
147 });
148
149 for (int offset = 1; offset < rounded_len; offset *= 2) {
150
151 q.submit([&, offset](sycl::handler &cgh) {
152 sycl::accessor acc_in{get_in_buf_ref(step), cgh, sycl::read_only};
153 sycl::accessor acc_out{get_out_buf_ref(step), cgh, sycl::write_only};
154
155 cgh.parallel_for<KernelExclsum_2<T>>(
156 sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
157 u32 thid = id.get_linear_id();
158
159 const auto in_val = acc_in[thid];
160
161 acc_out[thid] = (thid >= offset) ? in_val + acc_in[thid - offset] : in_val;
162 });
163 });
164
165 step++;
166 }
167
168 return std::move(get_in_buf_ref(step));
169 }
170
171 template<class T>
173
174 template<class T>
175 sycl::buffer<T> exclusive_sum_gpugems39_3(sycl::queue &q, sycl::buffer<T> &buf1, u32 len) {
176
177 u32 rounded_len = sham::roundup_pow2_clz(len);
178
179 sycl::buffer<T> out1(rounded_len);
180 sycl::buffer<T> out2(rounded_len);
181
182 auto get_in_buf_ref = [&](u32 step) -> sycl::buffer<T> & {
183 if (step % 2 == 0) {
184 return out1;
185 } else {
186 return out2;
187 }
188 };
189
190 auto get_out_buf_ref = [&](u32 step) -> sycl::buffer<T> & {
191 if (step % 2 == 1) {
192 return out1;
193 } else {
194 return out2;
195 }
196 };
197
198 u32 step = 0;
199
200 q.submit([&](sycl::handler &cgh) {
201 u32 correct_len = len;
202 sycl::accessor acc_in{buf1, cgh, sycl::read_only};
203 sycl::accessor acc_out{get_in_buf_ref(step), cgh, sycl::write_only, sycl::no_init};
204
205 cgh.parallel_for(sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
206 u32 thid = id.get_linear_id();
207 acc_out[id] = (thid > 0 && thid < correct_len) ? acc_in[thid - 1] : 0;
208 });
209 });
210
211 for (int offset = 1; offset < rounded_len; offset *= 2) {
212
213 q.submit([&, offset](sycl::handler &cgh) {
214 sycl::accessor acc_in{get_in_buf_ref(step), cgh, sycl::read_only};
215 sycl::accessor acc_out{get_out_buf_ref(step), cgh, sycl::write_only};
216
217 cgh.parallel_for<KernelExclsum_3<T>>(
218 sycl::range<1>{rounded_len}, [=](sycl::item<1> id) {
219 u32 thid = id.get_linear_id();
220
221 const auto in_val = acc_in[thid];
222
223 acc_out[thid] = (thid >= offset) ? in_val + acc_in[thid - offset] : in_val;
224 });
225 });
226
227 step++;
228 }
229
230 return std::move(get_in_buf_ref(step));
231 }
232
233 template sycl::buffer<u32> exclusive_sum_gpugems39_1(
234 sycl::queue &q, sycl::buffer<u32> &buf1, u32 len);
235
236 template sycl::buffer<u32> exclusive_sum_gpugems39_2(
237 sycl::queue &q, sycl::buffer<u32> &buf1, u32 len);
238
239 template sycl::buffer<u32> exclusive_sum_gpugems39_3(
240 sycl::queue &q, sycl::buffer<u32> &buf1, u32 len);
241
242} // namespace shamalgs::numeric::details
std::uint32_t u32
32 bit unsigned integer
constexpr T roundup_pow2_clz(T v) noexcept
round up to the next power of two 0 is rounded up to 1 as it is not a pow of 2 every input above the ...
Definition math.hpp:805
main include file for memory algorithms