Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
bitonicSort_updated_xor_swap.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
18#include "shambase/integer.hpp"
20
21// modified from http://www.bealto.com/gpu-sorting.html
22
24
25 template<class Tkey, class Tval>
27
28 using AccKey
29 = sycl::accessor<Tkey, 1, sycl::access::mode::read_write, sycl::target::device>;
30 using AccVal
31 = sycl::accessor<Tval, 1, sycl::access::mode::read_write, sycl::target::device>;
32
33 inline static void _order(Tkey &a, Tkey &b, Tval &va, Tval &vb, bool reverse) {
34 bool swap = reverse ^ (a < b);
35 if (swap) {
36 a ^= b;
37 b ^= a;
38 a ^= b;
39 va ^= vb;
40 vb ^= va;
41 va ^= vb;
42 }
43 }
44
45 inline static void _orderV(Tkey *x, Tval *vx, u32 a, u32 b, bool reverse) {
46 bool swap = reverse ^ (x[a] < x[b]);
47
48 if (swap) {
49 x[a] ^= x[b];
50 x[b] ^= x[a];
51 x[a] ^= x[b];
52 vx[a] ^= vx[b];
53 vx[b] ^= vx[a];
54 vx[a] ^= vx[b];
55 }
56 }
57
58 template<u32 stencil_size>
59 static void order_stencil(Tkey *x, Tval *vx, u32 a, bool reverse);
60
61 template<>
62 inline void order_stencil<2>(Tkey *x, Tval *vx, u32 a, bool reverse) {
63 _orderV(x, vx, a, a + 1, reverse);
64 }
65
66 template<>
67 inline void order_stencil<4>(Tkey *x, Tval *vx, u32 a, bool reverse) {
68#pragma unroll
69 for (int i4 = 0; i4 < 2; i4++) {
70 _orderV(x, vx, a + i4, a + i4 + 2, reverse);
71 }
72 order_stencil<2>(x, vx, a, reverse);
73 order_stencil<2>(x, vx, a + 2, reverse);
74 }
75
76 template<>
77 inline void order_stencil<8>(Tkey *x, Tval *vx, u32 a, bool reverse) {
78#pragma unroll
79 for (int i8 = 0; i8 < 4; i8++) {
80 _orderV(x, vx, a + i8, a + i8 + 4, reverse);
81 }
82 order_stencil<4>(x, vx, a, reverse);
83 order_stencil<4>(x, vx, a + 4, reverse);
84 }
85
86 template<>
87 inline void order_stencil<16>(Tkey *x, Tval *vx, u32 a, bool reverse) {
88#pragma unroll
89 for (int i16 = 0; i16 < 8; i16++) {
90 _orderV(x, vx, a + i16, a + i16 + 8, reverse);
91 }
92 order_stencil<8>(x, vx, a, reverse);
93 order_stencil<8>(x, vx, a + 8, reverse);
94 }
95
96 template<>
97 inline void order_stencil<32>(Tkey *x, Tval *vx, u32 a, bool reverse) {
98#pragma unroll
99 for (int i32 = 0; i32 < 16; i32++) {
100 _orderV(x, vx, a + i32, a + i32 + 16, reverse);
101 }
102 order_stencil<16>(x, vx, a, reverse);
103 order_stencil<16>(x, vx, a + 16, reverse);
104 }
105
106 template<u32 stencil_size>
107 static void order_kernel(AccKey m, AccVal id, u32 inc, u32 length, i32 t);
108
109 template<>
110 inline void order_kernel<32>(AccKey m, AccVal id, u32 inc, u32 length, i32 t) {
111 u32 _inc = inc;
112 u32 _dir = length << 1U;
113
114 _inc >>= 4;
115 int low = t & (_inc - 1); // low order bits (below INC)
116 int i = ((t - low) << 5) + low; // insert 000 at position INC
117 bool reverse = ((_dir & i) == 0); // asc/desc order
118
119 // Load
120 Tkey x[32];
121 for (int k = 0; k < 32; k++)
122 x[k] = m[k * _inc + i];
123
124 uint idx[32];
125 for (int k = 0; k < 32; k++)
126 idx[k] = id[k * _inc + i];
127
128 // Sort
129 order_stencil<32>(x, idx, 0, reverse);
130
131 // Store
132 for (int k = 0; k < 32; k++)
133 m[k * _inc + i] = x[k];
134 for (int k = 0; k < 32; k++)
135 id[k * _inc + i] = idx[k];
136 }
137
138 template<>
139 inline void order_kernel<16>(AccKey m, AccVal id, u32 inc, u32 length, i32 t) {
140
141 u32 _inc = inc;
142 u32 _dir = length << 1;
143
144 _inc >>= 3;
145 int low = t & (_inc - 1); // low order bits (below INC)
146 int i = ((t - low) << 4) + low; // insert 000 at position INC
147 bool reverse = ((_dir & i) == 0); // asc/desc order
148
149 // Load
150 Tkey x[16];
151 for (int k = 0; k < 16; k++)
152 x[k] = m[k * _inc + i];
153
154 Tval idx[16];
155 for (int k = 0; k < 16; k++)
156 idx[k] = id[k * _inc + i];
157
158 // Sort
159 order_stencil<16>(x, idx, 0, reverse);
160
161 // Store
162 for (int k = 0; k < 16; k++)
163 m[k * _inc + i] = x[k];
164 for (int k = 0; k < 16; k++)
165 id[k * _inc + i] = idx[k];
166 }
167
168 template<>
169 inline void order_kernel<8>(AccKey m, AccVal id, u32 inc, u32 length, i32 t) {
170 u32 _inc = inc;
171 u32 _dir = length << 1;
172
173 _inc >>= 2;
174 int low = t & (_inc - 1); // low order bits (below INC)
175 int i = ((t - low) << 3) + low; // insert 000 at position INC
176 bool reverse = ((_dir & i) == 0); // asc/desc order
177
178 // Load
179 Tkey x[8];
180 for (int k = 0; k < 8; k++)
181 x[k] = m[k * _inc + i];
182
183 Tval idx[8];
184 for (int k = 0; k < 8; k++)
185 idx[k] = id[k * _inc + i];
186
187 // Sort
188 order_stencil<8>(x, idx, 0, reverse);
189
190 // Store
191 for (int k = 0; k < 8; k++)
192 m[k * _inc + i] = x[k];
193 for (int k = 0; k < 8; k++)
194 id[k * _inc + i] = idx[k];
195 }
196
197 template<>
198 inline void order_kernel<4>(AccKey m, AccVal id, u32 inc, u32 length, i32 t) {
199 u32 _inc = inc;
200 u32 _dir = length << 1;
201
202 _inc >>= 1;
203 int low = t & (_inc - 1); // low order bits (below INC)
204 int i = ((t - low) << 2) + low; // insert 00 at position INC
205 bool reverse = ((_dir & i) == 0); // asc/desc order
206
207 // Load
208 Tkey x0 = m[0 + i];
209 Tkey x1 = m[_inc + i];
210 Tkey x2 = m[2 * _inc + i];
211 Tkey x3 = m[3 * _inc + i];
212
213 Tval idx0 = id[0 + i];
214 Tval idx1 = id[_inc + i];
215 Tval idx2 = id[2 * _inc + i];
216 Tval idx3 = id[3 * _inc + i];
217
218 // Sort
219 _order(x0, x2, idx0, idx2, reverse);
220 _order(x1, x3, idx1, idx3, reverse);
221 _order(x0, x1, idx0, idx1, reverse);
222 _order(x2, x3, idx2, idx3, reverse);
223
224 // Store
225 m[0 + i] = x0;
226 m[_inc + i] = x1;
227 m[2 * _inc + i] = x2;
228 m[3 * _inc + i] = x3;
229
230 id[0 + i] = idx0;
231 id[_inc + i] = idx1;
232 id[2 * _inc + i] = idx2;
233 id[3 * _inc + i] = idx3;
234 }
235
236 template<>
237 inline void order_kernel<2>(AccKey m, AccVal id, u32 inc, u32 length, i32 t) {
238 u32 _inc = inc;
239 u32 _dir = length << 1;
240
241 int low = t & (_inc - 1); // low order bits (below INC)
242 int i = (t << 1) - low; // insert 0 at position INC
243 bool reverse = ((_dir & i) == 0); // asc/desc order
244
245 // Load
246 Tkey x0 = m[0 + i];
247 Tkey x1 = m[_inc + i];
248 Tval idx0 = id[0 + i];
249 Tval idx1 = id[_inc + i];
250
251 // Sort
252 _order(x0, x1, idx0, idx1, reverse);
253
254 // Store
255 m[0 + i] = x0;
256 m[_inc + i] = x1;
257 id[0 + i] = idx0;
258 id[_inc + i] = idx1;
259 }
260 };
261
262 template<class Tkey, class Tval, u32 MaxStencilSize>
263 void sort_by_key_bitonic_updated_xor_swap(
264 sycl::queue &q, sycl::buffer<Tkey> &buf_key, sycl::buffer<Tval> &buf_values, u32 len) {
265
266 if (!shambase::is_pow_of_two(len)) {
268 "this algorithm can only be used with length that are powers of two");
269 }
270
271 using B = OrderingPrimitiveXorSwap<Tkey, Tval>;
272
273 for (u32 length = 1; length < len; length <<= 1) {
274 u32 inc = length;
275 while (inc > 0) {
276 // log("inc : %d\n",inc);
277 // int ninc = 1;
278 u32 ninc = 0;
279
280 // B32 sort kernel is less performant than the B16 because of cache size
281 if constexpr (MaxStencilSize >= 32) {
282 if (inc >= 16 && ninc == 0) {
283 ninc = 5;
284 unsigned int nThreads = len >> ninc;
285 sycl::range<1> range{nThreads};
286
287 auto ker_sort_morton_b32 = [&](sycl::handler &cgh) {
288 sycl::accessor m{buf_key, cgh, sycl::read_write};
289 sycl::accessor id{buf_values, cgh, sycl::read_write};
290
291 cgh.parallel_for(range, [=](sycl::item<1> item) {
292 //(__global data_t * data,__global uint * ids,int inc,int dir)
293
294 B::template order_kernel<32>(m, id, inc, length, item.get_id(0));
295 });
296 };
297 q.submit(ker_sort_morton_b32);
298 }
299 }
300
301 if constexpr (MaxStencilSize >= 16) {
302 if (inc >= 8 && ninc == 0) {
303 ninc = 4;
304 unsigned int nThreads = len >> ninc;
305 sycl::range<1> range{nThreads};
306
307 auto ker_sort_morton_b16 = [&](sycl::handler &cgh) {
308 sycl::accessor m{buf_key, cgh, sycl::read_write};
309 sycl::accessor id{buf_values, cgh, sycl::read_write};
310
311 cgh.parallel_for(range, [=](sycl::item<1> item) {
312 //(__global data_t * data,__global uint * ids,int inc,int dir)
313
314 B::template order_kernel<16>(m, id, inc, length, item.get_id(0));
315 });
316 };
317 q.submit(ker_sort_morton_b16);
318
319 // sort_kernel_B8(arg_eq,* buf_key->buf,*
320 // particles::buf_ids->buf,inc,length<<1);//.wait();
321 }
322 }
323
324 if constexpr (MaxStencilSize >= 8) {
325 // B8
326 if (inc >= 4 && ninc == 0) {
327 ninc = 3;
328 unsigned int nThreads = len >> ninc;
329 sycl::range<1> range{nThreads};
330
331 auto ker_sort_morton_b8 = [&](sycl::handler &cgh) {
332 sycl::accessor m{buf_key, cgh, sycl::read_write};
333 sycl::accessor id{buf_values, cgh, sycl::read_write};
334
335 cgh.parallel_for(range, [=](sycl::item<1> item) {
336 //(__global data_t * data,__global uint * ids,int inc,int dir)
337
338 B::template order_kernel<8>(m, id, inc, length, item.get_id(0));
339 });
340 };
341 q.submit(ker_sort_morton_b8);
342
343 // sort_kernel_B8(arg_eq,* buf_key->buf,*
344 // particles::buf_ids->buf,inc,length<<1);//.wait();
345 }
346 }
347
348 if constexpr (MaxStencilSize >= 4) {
349 // B4
350 if (inc >= 2 && ninc == 0) {
351 ninc = 2;
352 unsigned int nThreads = len >> ninc;
353 sycl::range<1> range{nThreads};
354 // sort_kernel_B4(arg_eq,* buf_key->buf,*
355 // particles::buf_ids->buf,inc,length<<1);
356 auto ker_sort_morton_b4 = [&](sycl::handler &cgh) {
357 sycl::accessor m{buf_key, cgh, sycl::read_write};
358 sycl::accessor id{buf_values, cgh, sycl::read_write};
359 cgh.parallel_for(range, [=](sycl::item<1> item) {
360 B::template order_kernel<4>(m, id, inc, length, item.get_id(0));
361 });
362 };
363 q.submit(ker_sort_morton_b4);
364 }
365 }
366
367 // B2
368 if (ninc == 0) {
369 ninc = 1;
370 unsigned int nThreads = len >> ninc;
371 sycl::range<1> range{nThreads};
372 // sort_kernel_B2(arg_eq,* buf_key->buf,*
373 // particles::buf_ids->buf,inc,length<<1);
374 auto ker_sort_morton_b2 = [&](sycl::handler &cgh) {
375 sycl::accessor m{buf_key, cgh, sycl::read_write};
376 sycl::accessor id{buf_values, cgh, sycl::read_write};
377
378 cgh.parallel_for(range, [=](sycl::item<1> item) {
379 //(__global data_t * data,__global uint * ids,int inc,int dir)
380
381 B::template order_kernel<2>(m, id, inc, length, item.get_id(0));
382 });
383 };
384 q.submit(ker_sort_morton_b2);
385 }
386
387 inc >>= ninc;
388 }
389 }
390 }
391
392 template void sort_by_key_bitonic_updated_xor_swap<u32, u32, 16>(
393 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values, u32 len);
394
395 template void sort_by_key_bitonic_updated_xor_swap<u64, u32, 16>(
396 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values, u32 len);
397
398 template void sort_by_key_bitonic_updated_xor_swap<u32, u32, 8>(
399 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values, u32 len);
400
401 template void sort_by_key_bitonic_updated_xor_swap<u64, u32, 8>(
402 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values, u32 len);
403
404 template void sort_by_key_bitonic_updated_xor_swap<u32, u32, 32>(
405 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values, u32 len);
406
407 template void sort_by_key_bitonic_updated_xor_swap<u64, u32, 32>(
408 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values, u32 len);
409
410} // namespace shamalgs::algorithm::details
std::int8_t i8
8 bit integer
std::uint32_t u32
32 bit unsigned integer
std::int16_t i16
16 bit integer
std::int32_t i32
32 bit integer
This header file contains utility functions related to exception handling in the code.
namespace to store algorithms implemented by shamalgs
constexpr bool is_pow_of_two(T v) noexcept
determine if v is a power of two and check if v==0 Source : https://graphics.stanford....
Definition integer.hpp:49
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.