25#define MAXORDER_SORT_KERNEL 16
27#define ORDER(a, b, ida, idb) \
29 bool swap = reverse ^ (a < b); \
34 a = (swap) ? auxb : auxa; \
35 b = (swap) ? auxa : auxb; \
36 ida = (swap) ? auxidb : auxida; \
37 idb = (swap) ? auxida : auxidb; \
40#define ORDERV(x, idx, a, b) \
42 bool swap = reverse ^ (x[a] < x[b]); \
45 Tval auxida = idx[a]; \
46 Tval auxidb = idx[b]; \
47 x[a] = (swap) ? auxb : auxa; \
48 x[b] = (swap) ? auxa : auxb; \
49 idx[a] = (swap) ? auxidb : auxida; \
50 idx[b] = (swap) ? auxida : auxidb; \
53#define B2V(x, idx, a) {ORDERV(x, idx, a, a + 1)}
55#define B4V(x, idx, a) \
57 for (int i4 = 0; i4 < 2; i4++) { \
58 ORDERV(x, idx, a + i4, a + i4 + 2) \
60 B2V(x, idx, a) B2V(x, idx, a + 2) \
63#define B8V(x, idx, a) \
65 for (int i8 = 0; i8 < 4; i8++) { \
66 ORDERV(x, idx, a + i8, a + i8 + 4) \
68 B4V(x, idx, a) B4V(x, idx, a + 4) \
71#define B16V(x, idx, a) \
73 for (int i16 = 0; i16 < 8; i16++) { \
74 ORDERV(x, idx, a + i16, a + i16 + 8) \
76 B8V(x, idx, a) B8V(x, idx, a + 8) \
79#define B32V(x, idx, a) \
81 for (int i32 = 0; i32 < 16; i32++) { \
82 ORDERV(x, idx, a + i32, a + i32 + 16) \
84 B16V(x, idx, a) B16V(x, idx, a + 16) \
87class Bitonic_sort_B32_morton32;
88class Bitonic_sort_B16_morton32;
89class Bitonic_sort_B8_morton32;
90class Bitonic_sort_B4_morton32;
91class Bitonic_sort_B2_morton32;
93class Bitonic_sort_B32_morton64;
94class Bitonic_sort_B16_morton64;
95class Bitonic_sort_B8_morton64;
96class Bitonic_sort_B4_morton64;
97class Bitonic_sort_B2_morton64;
101 template<
class Tkey,
class Tval>
102 void sort_by_key_bitonic_legacy(
103 sycl::queue &q, sycl::buffer<Tkey> &buf_key, sycl::buffer<Tval> &buf_values,
u32 len) {
107 "this algorithm can only be used with length that are powers of two");
111 "BitonicSorter",
"submit : sycl_sort_morton_key_pair<u32, MultiKernel>");
113 for (
u32 length = 1; length < len; length <<= 1) {
121#if MAXORDER_SORT_KERNEL >= 32
122 if (inc >= 16 && ninc == 0) {
124 unsigned int nThreads = len >> ninc;
125 sycl::range<1> range{nThreads};
127 auto ker_sort_morton_b32 = [&](sycl::handler &cgh) {
128 sycl::accessor m{buf_key, cgh, sycl::read_write};
129 sycl::accessor
id{buf_values, cgh, sycl::read_write};
131 cgh.parallel_for(range, [=](sycl::item<1> item) {
135 u32 _dir = length << 1;
138 int t = item.get_id();
139 int low = t & (_inc - 1);
140 int i = ((t - low) << 5) + low;
141 bool reverse = ((_dir & i) == 0);
145 for (
int k = 0; k < 32; k++)
146 x[k] = m[k * _inc + i];
149 for (
int k = 0; k < 32; k++)
150 idx[k] =
id[k * _inc + i];
156 for (
int k = 0; k < 32; k++)
157 m[k * _inc + i] = x[k];
158 for (
int k = 0; k < 32; k++)
159 id[k * _inc + i] = idx[k];
162 q.submit(ker_sort_morton_b32);
166#if MAXORDER_SORT_KERNEL >= 16
167 if (inc >= 8 && ninc == 0) {
169 unsigned int nThreads = len >> ninc;
170 sycl::range<1> range{nThreads};
172 auto ker_sort_morton_b16 = [&](sycl::handler &cgh) {
173 sycl::accessor m{buf_key, cgh, sycl::read_write};
174 sycl::accessor
id{buf_values, cgh, sycl::read_write};
176 cgh.parallel_for(range, [=](sycl::item<1> item) {
180 u32 _dir = length << 1;
183 int t = item.get_id(0);
184 int low = t & (_inc - 1);
185 int i = ((t - low) << 4) + low;
186 bool reverse = ((_dir & i) == 0);
190 for (
int k = 0; k < 16; k++)
191 x[k] = m[k * _inc + i];
194 for (
int k = 0; k < 16; k++)
195 idx[k] =
id[k * _inc + i];
201 for (
int k = 0; k < 16; k++)
202 m[k * _inc + i] = x[k];
203 for (
int k = 0; k < 16; k++)
204 id[k * _inc + i] = idx[k];
207 q.submit(ker_sort_morton_b16);
214#if MAXORDER_SORT_KERNEL >= 8
216 if (inc >= 4 && ninc == 0) {
218 unsigned int nThreads = len >> ninc;
219 sycl::range<1> range{nThreads};
221 auto ker_sort_morton_b8 = [&](sycl::handler &cgh) {
222 sycl::accessor m{buf_key, cgh, sycl::read_write};
223 sycl::accessor
id{buf_values, cgh, sycl::read_write};
225 cgh.parallel_for(range, [=](sycl::item<1> item) {
229 u32 _dir = length << 1;
232 int t = item.get_id(0);
233 int low = t & (_inc - 1);
234 int i = ((t - low) << 3) + low;
235 bool reverse = ((_dir & i) == 0);
239 for (
int k = 0; k < 8; k++)
240 x[k] = m[k * _inc + i];
243 for (
int k = 0; k < 8; k++)
244 idx[k] =
id[k * _inc + i];
250 for (
int k = 0; k < 8; k++)
251 m[k * _inc + i] = x[k];
252 for (
int k = 0; k < 8; k++)
253 id[k * _inc + i] = idx[k];
256 q.submit(ker_sort_morton_b8);
263#if MAXORDER_SORT_KERNEL >= 4
265 if (inc >= 2 && ninc == 0) {
267 unsigned int nThreads = len >> ninc;
268 sycl::range<1> range{nThreads};
271 auto ker_sort_morton_b4 = [&](sycl::handler &cgh) {
272 sycl::accessor m{buf_key, cgh, sycl::read_write};
273 sycl::accessor
id{buf_values, cgh, sycl::read_write};
274 cgh.parallel_for(range, [=](sycl::item<1> item) {
278 u32 _dir = length << 1;
281 int t = item.get_id(0);
282 int low = t & (_inc - 1);
283 int i = ((t - low) << 2) + low;
284 bool reverse = ((_dir & i) == 0);
288 Tkey x1 = m[_inc + i];
289 Tkey x2 = m[2 * _inc + i];
290 Tkey x3 = m[3 * _inc + i];
292 Tval idx0 =
id[0 + i];
293 Tval idx1 =
id[_inc + i];
294 Tval idx2 =
id[2 * _inc + i];
295 Tval idx3 =
id[3 * _inc + i];
298 ORDER(x0, x2, idx0, idx2)
299 ORDER(x1, x3, idx1, idx3)
300 ORDER(x0, x1, idx0, idx1)
301 ORDER(x2, x3, idx2, idx3)
306 m[2 * _inc + i] = x2;
307 m[3 * _inc + i] = x3;
311 id[2 * _inc + i] = idx2;
312 id[3 * _inc + i] = idx3;
315 q.submit(ker_sort_morton_b4);
322 unsigned int nThreads = len >> ninc;
323 sycl::range<1> range{nThreads};
326 auto ker_sort_morton_b2 = [&](sycl::handler &cgh) {
327 sycl::accessor m{buf_key, cgh, sycl::read_write};
328 sycl::accessor
id{buf_values, cgh, sycl::read_write};
330 cgh.parallel_for(range, [=](sycl::item<1> item) {
334 u32 _dir = length << 1;
336 int t = item.get_id(0);
337 int low = t & (_inc - 1);
338 int i = (t << 1) - low;
339 bool reverse = ((_dir & i) == 0);
343 Tkey x1 = m[_inc + i];
344 Tval idx0 =
id[0 + i];
345 Tval idx1 =
id[_inc + i];
348 ORDER(x0, x1, idx0, idx1)
357 q.submit(ker_sort_morton_b2);
365 template void sort_by_key_bitonic_legacy(
366 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
368 template void sort_by_key_bitonic_legacy(
369 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
constexpr const char * uint
Specific internal energy u.
std::uint32_t u32
32 bit unsigned integer
This header file contains utility functions related to exception handling in the code.
namespace to store algorithms implemented by shamalgs
constexpr bool is_pow_of_two(T v) noexcept
determine if v is a power of two and check if v==0 Source : https://graphics.stanford....
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
void debug_sycl_ln(std::string module_name, Types... var2)
Prints a log message with multiple arguments followed by a newline.