25 template<
class Tkey,
class Tval>
29 = sycl::accessor<Tkey, 1, sycl::access::mode::read_write, sycl::target::device>;
31 = sycl::accessor<Tval, 1, sycl::access::mode::read_write, sycl::target::device>;
33 inline static void _order(Tkey &a, Tkey &b, Tval &va, Tval &vb,
bool reverse) {
34 bool swap = reverse ^ (a < b);
45 inline static void _orderV(Tkey *x, Tval *vx,
u32 a,
u32 b,
bool reverse) {
46 bool swap = reverse ^ (x[a] < x[b]);
58 template<u32 stencil_size>
59 static void order_stencil(Tkey *x, Tval *vx,
u32 a,
bool reverse);
62 inline void order_stencil<2>(Tkey *x, Tval *vx,
u32 a,
bool reverse) {
63 _orderV(x, vx, a, a + 1, reverse);
67 inline void order_stencil<4>(Tkey *x, Tval *vx,
u32 a,
bool reverse) {
69 for (
int i4 = 0; i4 < 2; i4++) {
70 _orderV(x, vx, a + i4, a + i4 + 2, reverse);
72 order_stencil<2>(x, vx, a, reverse);
73 order_stencil<2>(x, vx, a + 2, reverse);
77 inline void order_stencil<8>(Tkey *x, Tval *vx,
u32 a,
bool reverse) {
79 for (
int i8 = 0;
i8 < 4;
i8++) {
80 _orderV(x, vx, a +
i8, a +
i8 + 4, reverse);
82 order_stencil<4>(x, vx, a, reverse);
83 order_stencil<4>(x, vx, a + 4, reverse);
87 inline void order_stencil<16>(Tkey *x, Tval *vx,
u32 a,
bool reverse) {
90 _orderV(x, vx, a +
i16, a +
i16 + 8, reverse);
92 order_stencil<8>(x, vx, a, reverse);
93 order_stencil<8>(x, vx, a + 8, reverse);
97 inline void order_stencil<32>(Tkey *x, Tval *vx,
u32 a,
bool reverse) {
100 _orderV(x, vx, a +
i32, a +
i32 + 16, reverse);
102 order_stencil<16>(x, vx, a, reverse);
103 order_stencil<16>(x, vx, a + 16, reverse);
106 template<u32 stencil_size>
107 static void order_kernel(AccKey m, AccVal
id,
u32 inc,
u32 length,
i32 t);
110 inline void order_kernel<32>(AccKey m, AccVal
id,
u32 inc,
u32 length,
i32 t) {
112 u32 _dir = length << 1U;
115 int low = t & (_inc - 1);
116 int i = ((t - low) << 5) + low;
117 bool reverse = ((_dir & i) == 0);
121 for (
int k = 0; k < 32; k++)
122 x[k] = m[k * _inc + i];
125 for (
int k = 0; k < 32; k++)
126 idx[k] =
id[k * _inc + i];
129 order_stencil<32>(x, idx, 0, reverse);
132 for (
int k = 0; k < 32; k++)
133 m[k * _inc + i] = x[k];
134 for (
int k = 0; k < 32; k++)
135 id[k * _inc + i] = idx[k];
139 inline void order_kernel<16>(AccKey m, AccVal
id,
u32 inc,
u32 length,
i32 t) {
142 u32 _dir = length << 1;
145 int low = t & (_inc - 1);
146 int i = ((t - low) << 4) + low;
147 bool reverse = ((_dir & i) == 0);
151 for (
int k = 0; k < 16; k++)
152 x[k] = m[k * _inc + i];
155 for (
int k = 0; k < 16; k++)
156 idx[k] =
id[k * _inc + i];
159 order_stencil<16>(x, idx, 0, reverse);
162 for (
int k = 0; k < 16; k++)
163 m[k * _inc + i] = x[k];
164 for (
int k = 0; k < 16; k++)
165 id[k * _inc + i] = idx[k];
169 inline void order_kernel<8>(AccKey m, AccVal
id,
u32 inc,
u32 length,
i32 t) {
171 u32 _dir = length << 1;
174 int low = t & (_inc - 1);
175 int i = ((t - low) << 3) + low;
176 bool reverse = ((_dir & i) == 0);
180 for (
int k = 0; k < 8; k++)
181 x[k] = m[k * _inc + i];
184 for (
int k = 0; k < 8; k++)
185 idx[k] =
id[k * _inc + i];
188 order_stencil<8>(x, idx, 0, reverse);
191 for (
int k = 0; k < 8; k++)
192 m[k * _inc + i] = x[k];
193 for (
int k = 0; k < 8; k++)
194 id[k * _inc + i] = idx[k];
198 inline void order_kernel<4>(AccKey m, AccVal
id,
u32 inc,
u32 length,
i32 t) {
200 u32 _dir = length << 1;
203 int low = t & (_inc - 1);
204 int i = ((t - low) << 2) + low;
205 bool reverse = ((_dir & i) == 0);
209 Tkey x1 = m[_inc + i];
210 Tkey x2 = m[2 * _inc + i];
211 Tkey x3 = m[3 * _inc + i];
213 Tval idx0 =
id[0 + i];
214 Tval idx1 =
id[_inc + i];
215 Tval idx2 =
id[2 * _inc + i];
216 Tval idx3 =
id[3 * _inc + i];
219 _order(x0, x2, idx0, idx2, reverse);
220 _order(x1, x3, idx1, idx3, reverse);
221 _order(x0, x1, idx0, idx1, reverse);
222 _order(x2, x3, idx2, idx3, reverse);
227 m[2 * _inc + i] = x2;
228 m[3 * _inc + i] = x3;
232 id[2 * _inc + i] = idx2;
233 id[3 * _inc + i] = idx3;
237 inline void order_kernel<2>(AccKey m, AccVal
id,
u32 inc,
u32 length,
i32 t) {
239 u32 _dir = length << 1;
241 int low = t & (_inc - 1);
242 int i = (t << 1) - low;
243 bool reverse = ((_dir & i) == 0);
247 Tkey x1 = m[_inc + i];
248 Tval idx0 =
id[0 + i];
249 Tval idx1 =
id[_inc + i];
252 _order(x0, x1, idx0, idx1, reverse);
262 template<
class Tkey,
class Tval, u32 MaxStencilSize>
263 void sort_by_key_bitonic_updated_xor_swap(
264 sycl::queue &q, sycl::buffer<Tkey> &buf_key, sycl::buffer<Tval> &buf_values,
u32 len) {
268 "this algorithm can only be used with length that are powers of two");
271 using B = OrderingPrimitiveXorSwap<Tkey, Tval>;
273 for (
u32 length = 1; length < len; length <<= 1) {
281 if constexpr (MaxStencilSize >= 32) {
282 if (inc >= 16 && ninc == 0) {
284 unsigned int nThreads = len >> ninc;
285 sycl::range<1> range{nThreads};
287 auto ker_sort_morton_b32 = [&](sycl::handler &cgh) {
288 sycl::accessor m{buf_key, cgh, sycl::read_write};
289 sycl::accessor
id{buf_values, cgh, sycl::read_write};
291 cgh.parallel_for(range, [=](sycl::item<1> item) {
294 B::template order_kernel<32>(m,
id, inc, length, item.get_id(0));
297 q.submit(ker_sort_morton_b32);
301 if constexpr (MaxStencilSize >= 16) {
302 if (inc >= 8 && ninc == 0) {
304 unsigned int nThreads = len >> ninc;
305 sycl::range<1> range{nThreads};
307 auto ker_sort_morton_b16 = [&](sycl::handler &cgh) {
308 sycl::accessor m{buf_key, cgh, sycl::read_write};
309 sycl::accessor
id{buf_values, cgh, sycl::read_write};
311 cgh.parallel_for(range, [=](sycl::item<1> item) {
314 B::template order_kernel<16>(m,
id, inc, length, item.get_id(0));
317 q.submit(ker_sort_morton_b16);
324 if constexpr (MaxStencilSize >= 8) {
326 if (inc >= 4 && ninc == 0) {
328 unsigned int nThreads = len >> ninc;
329 sycl::range<1> range{nThreads};
331 auto ker_sort_morton_b8 = [&](sycl::handler &cgh) {
332 sycl::accessor m{buf_key, cgh, sycl::read_write};
333 sycl::accessor
id{buf_values, cgh, sycl::read_write};
335 cgh.parallel_for(range, [=](sycl::item<1> item) {
338 B::template order_kernel<8>(m,
id, inc, length, item.get_id(0));
341 q.submit(ker_sort_morton_b8);
348 if constexpr (MaxStencilSize >= 4) {
350 if (inc >= 2 && ninc == 0) {
352 unsigned int nThreads = len >> ninc;
353 sycl::range<1> range{nThreads};
356 auto ker_sort_morton_b4 = [&](sycl::handler &cgh) {
357 sycl::accessor m{buf_key, cgh, sycl::read_write};
358 sycl::accessor
id{buf_values, cgh, sycl::read_write};
359 cgh.parallel_for(range, [=](sycl::item<1> item) {
360 B::template order_kernel<4>(m,
id, inc, length, item.get_id(0));
363 q.submit(ker_sort_morton_b4);
370 unsigned int nThreads = len >> ninc;
371 sycl::range<1> range{nThreads};
374 auto ker_sort_morton_b2 = [&](sycl::handler &cgh) {
375 sycl::accessor m{buf_key, cgh, sycl::read_write};
376 sycl::accessor
id{buf_values, cgh, sycl::read_write};
378 cgh.parallel_for(range, [=](sycl::item<1> item) {
381 B::template order_kernel<2>(m,
id, inc, length, item.get_id(0));
384 q.submit(ker_sort_morton_b2);
392 template void sort_by_key_bitonic_updated_xor_swap<u32, u32, 16>(
393 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
395 template void sort_by_key_bitonic_updated_xor_swap<u64, u32, 16>(
396 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
398 template void sort_by_key_bitonic_updated_xor_swap<u32, u32, 8>(
399 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
401 template void sort_by_key_bitonic_updated_xor_swap<u64, u32, 8>(
402 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
404 template void sort_by_key_bitonic_updated_xor_swap<u32, u32, 32>(
405 sycl::queue &q, sycl::buffer<u32> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
407 template void sort_by_key_bitonic_updated_xor_swap<u64, u32, 32>(
408 sycl::queue &q, sycl::buffer<u64> &buf_key, sycl::buffer<u32> &buf_values,
u32 len);
std::int8_t i8
8 bit integer
std::uint32_t u32
32 bit unsigned integer
std::int16_t i16
16 bit integer
std::int32_t i32
32 bit integer
This header file contains utility functions related to exception handling in the code.
namespace to store algorithms implemented by shamalgs
constexpr bool is_pow_of_two(T v) noexcept
determine if v is a power of two and check if v==0 Source : https://graphics.stanford....
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.