// Conditional exchange of a key pair (a, b) together with its companion
// value pair (va, vb). Both pairs are passed by reference and written back.
//
// a, b     keys that are compared with operator<
// va, vb   values that travel with a and b respectively
// reverse  flips the exchange condition (see `swap` below)
//
// NOTE(review): auxa / auxb / auxida / auxidb are not declared in the visible
// lines -- presumably they are copies of a, b, va, vb taken before the stores
// below; confirm against the full file.
28 inline static void _order(Tkey &a, Tkey &b, Tval &va, Tval &vb,
bool reverse) {
// swap is set when `reverse` and the result of `a < b` differ.
29 bool swap = reverse ^ (a < b);
// When swap is set, each output takes the temporary of the opposite side;
// otherwise it takes the temporary of its own side.
34 a = (swap) ? auxb : auxa;
35 b = (swap) ? auxa : auxb;
// The values are selected with the same flag, so they follow their keys.
36 va = (swap) ? auxidb : auxida;
37 vb = (swap) ? auxida : auxidb;
// Array-indexed variant of the conditional exchange: operates on the key
// slots x[a], x[b] and the value slots vx[a], vx[b].
//
// x, vx    key array and value array (declared __restrict__)
// a, b     indices of the two slots to compare
// reverse  flips the exchange condition (see `swap` below)
//
// NOTE(review): auxa / auxb / auxida / auxidb are not declared in the visible
// lines -- presumably copies of x[a], x[b], vx[a], vx[b]; confirm.
40 inline static void _orderV(
41 Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
u32 b,
bool reverse) {
// swap is set when `reverse` and the result of `x[a] < x[b]` differ.
42 bool swap = reverse ^ (x[a] < x[b]);
// Each slot receives the opposite side's temporary when swap is set,
// its own side's temporary otherwise.
47 x[a] = (swap) ? auxb : auxa;
48 x[b] = (swap) ? auxa : auxb;
// Values are selected with the same flag as the keys.
49 vx[a] = (swap) ? auxidb : auxida;
50 vx[b] = (swap) ? auxida : auxidb;
// Primary template, declared only: the work is done by the explicit
// specializations for particular stencil_size values.
//
// x, vx    key array and value array (declared __restrict__)
// a        index of the first slot the stencil operates on
// reverse  forwarded to the compare-exchange helper
53 template<u32 stencil_size>
54 static void order_stencil(Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
bool reverse);
// Size-2 stencil: a single conditional exchange of slots a and a + 1.
// NOTE(review): the `template<>` line is not among the visible lines.
57 inline void order_stencil<2>(
58 Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
bool reverse) {
59 _orderV(x, vx, a, a + 1, reverse);
// Size-4 stencil over slots a .. a + 3.
// NOTE(review): the `template<>` line and the loop's closing brace are not
// among the visible lines.
63 inline void order_stencil<4>(
64 Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
bool reverse) {
// First pass: conditionally exchange each slot with the one 2 positions on
// (pairs (a, a+2) and (a+1, a+3)).
66 for (
int i4 = 0; i4 < 2; i4++) {
67 _orderV(x, vx, a + i4, a + i4 + 2, reverse);
// Then apply the size-2 stencil to each half.
69 order_stencil<2>(x, vx, a, reverse);
70 order_stencil<2>(x, vx, a + 2, reverse);
// Size-8 stencil over slots a .. a + 7.
// NOTE(review): the `template<>` line and the loop's closing brace are not
// among the visible lines.
74 inline void order_stencil<8>(
75 Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
bool reverse) {
// First pass: conditionally exchange each of the first 4 slots with the
// slot 4 positions on.
77 for (
int i8 = 0;
i8 < 4;
i8++) {
78 _orderV(x, vx, a +
i8, a +
i8 + 4, reverse);
// Then apply the size-4 stencil to each half.
80 order_stencil<4>(x, vx, a, reverse);
81 order_stencil<4>(x, vx, a + 4, reverse);
// Size-16 stencil over slots a .. a + 15.
// NOTE(review): the `template<>` line and the loop header that declares
// `i16` are not among the visible lines -- presumably i16 runs over [0, 8)
// by analogy with the smaller stencils; confirm.
85 inline void order_stencil<16>(
86 Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
bool reverse) {
// Conditionally exchange slot a + i16 with the slot 8 positions on.
89 _orderV(x, vx, a +
i16, a +
i16 + 8, reverse);
// Then apply the size-8 stencil to each half.
91 order_stencil<8>(x, vx, a, reverse);
92 order_stencil<8>(x, vx, a + 8, reverse);
// Size-32 stencil over slots a .. a + 31.
// NOTE(review): the `template<>` line and the loop header that declares
// `i32` are not among the visible lines -- presumably a loop variable over
// [0, 16); note the name coincides with the `i32` type used elsewhere in
// this file, so confirm it is a local declared in the missing header.
96 inline void order_stencil<32>(
97 Tkey *__restrict__ x, Tval *__restrict__ vx,
u32 a,
bool reverse) {
// Conditionally exchange slot a + i32 with the slot 16 positions on.
100 _orderV(x, vx, a +
i32, a +
i32 + 16, reverse);
// Then apply the size-16 stencil to each half.
102 order_stencil<16>(x, vx, a, reverse);
103 order_stencil<16>(x, vx, a + 16, reverse);
// Primary template, declared only: the work is done by the explicit
// specializations for particular stencil_size values.
//
// m, id    key array and value array (declared __restrict__)
// inc      stride parameter (the specializations index with `_inc`,
//          presumably derived from this -- confirm)
// length   used by the specializations to compute the direction mask
// t        work-item index from which the base element index is computed
106 template<u32 stencil_size>
107 static void order_kernel(
108 Tkey *__restrict__ m, Tval *__restrict__
id,
u32 inc,
u32 length,
i32 t);
// Size-32 kernel: gathers 32 keys and 32 values spaced `_inc` apart, runs the
// size-32 stencil on the gathered copies, then scatters them back.
// NOTE(review): the `template<>` line and the declarations of `_inc`, `x`
// and `idx` are not among the visible lines -- presumably `_inc` comes from
// `inc` and `x` / `idx` are local 32-element buffers; confirm.
111 inline void order_kernel<32>(
112 Tkey *__restrict__ m, Tval *__restrict__
id,
u32 inc,
u32 length,
i32 t) {
// Direction mask: length shifted left by one.
114 u32 _dir = length << 1U;
// low = bits of t selected by the mask (_inc - 1)
// (the low bits of t if _inc is a power of two -- TODO confirm).
117 int low = t & (_inc - 1);
// Base index: the remainder of t is shifted left by 5 (x32), low is added back.
118 int i = ((t - low) << 5) + low;
// reverse is set when i has none of the bits of _dir set.
119 bool reverse = ((_dir & i) == 0);
// Gather keys: element k comes from m[k * _inc + i].
124 for (
int k = 0; k < 32; k++)
125 x[k] = m[k * _inc + i];
// Gather values with the same addressing.
129 for (
int k = 0; k < 32; k++)
130 idx[k] =
id[k * _inc + i];
// Order the gathered copies, starting at slot 0.
133 order_stencil<32>(x, idx, 0, reverse);
// Scatter keys back to the positions they were read from.
137 for (
int k = 0; k < 32; k++)
138 m[k * _inc + i] = x[k];
// Scatter values back likewise.
140 for (
int k = 0; k < 32; k++)
141 id[k * _inc + i] = idx[k];
// Size-16 kernel: gathers 16 keys and 16 values spaced `_inc` apart, runs the
// size-16 stencil on the gathered copies, then scatters them back.
// NOTE(review): the `template<>` line and the declarations of `_inc`, `x`
// and `idx` are not among the visible lines -- presumably `_inc` comes from
// `inc` and `x` / `idx` are local 16-element buffers; confirm.
145 inline void order_kernel<16>(
146 Tkey *__restrict__ m, Tval *__restrict__
id,
u32 inc,
u32 length,
i32 t) {
// Direction mask: length shifted left by one.
149 u32 _dir = length << 1;
// low = bits of t selected by the mask (_inc - 1)
// (the low bits of t if _inc is a power of two -- TODO confirm).
152 int low = t & (_inc - 1);
// Base index: the remainder of t is shifted left by 4 (x16), low is added back.
153 int i = ((t - low) << 4) + low;
// reverse is set when i has none of the bits of _dir set.
154 bool reverse = ((_dir & i) == 0);
// Gather keys: element k comes from m[k * _inc + i].
159 for (
int k = 0; k < 16; k++)
160 x[k] = m[k * _inc + i];
// Gather values with the same addressing.
164 for (
int k = 0; k < 16; k++)
165 idx[k] =
id[k * _inc + i];
// Order the gathered copies, starting at slot 0.
168 order_stencil<16>(x, idx, 0, reverse);
// Scatter keys back to the positions they were read from.
172 for (
int k = 0; k < 16; k++)
173 m[k * _inc + i] = x[k];
// Scatter values back likewise.
175 for (
int k = 0; k < 16; k++)
176 id[k * _inc + i] = idx[k];
// Size-8 kernel: gathers 8 keys and 8 values spaced `_inc` apart, runs the
// size-8 stencil on the gathered copies, then scatters them back.
// NOTE(review): the `template<>` line and the declarations of `_inc`, `x`
// and `idx` are not among the visible lines -- presumably `_inc` comes from
// `inc` and `x` / `idx` are local 8-element buffers; confirm.
180 inline void order_kernel<8>(
181 Tkey *__restrict__ m, Tval *__restrict__
id,
u32 inc,
u32 length,
i32 t) {
// Direction mask: length shifted left by one.
183 u32 _dir = length << 1;
// low = bits of t selected by the mask (_inc - 1)
// (the low bits of t if _inc is a power of two -- TODO confirm).
186 int low = t & (_inc - 1);
// Base index: the remainder of t is shifted left by 3 (x8), low is added back.
187 int i = ((t - low) << 3) + low;
// reverse is set when i has none of the bits of _dir set.
188 bool reverse = ((_dir & i) == 0);
// Gather keys: element k comes from m[k * _inc + i].
193 for (
int k = 0; k < 8; k++)
194 x[k] = m[k * _inc + i];
// Gather values with the same addressing.
198 for (
int k = 0; k < 8; k++)
199 idx[k] =
id[k * _inc + i];
// Order the gathered copies, starting at slot 0.
202 order_stencil<8>(x, idx, 0, reverse);
// Scatter keys back to the positions they were read from.
206 for (
int k = 0; k < 8; k++)
207 m[k * _inc + i] = x[k];
// Scatter values back likewise.
209 for (
int k = 0; k < 8; k++)
210 id[k * _inc + i] = idx[k];
// Size-4 kernel: works on four keys/values spaced `_inc` apart, held in
// scalar locals and ordered with four explicit _order calls instead of a
// stencil call.
// NOTE(review): the `template<>` line, the declaration of `_inc`, the load
// of `x0`, and the write-backs of x0, x1, idx0 and idx1 are not among the
// visible lines -- presumably they mirror the visible loads and stores;
// confirm against the full file.
214 inline void order_kernel<4>(
215 Tkey *__restrict__ m, Tval *__restrict__
id,
u32 inc,
u32 length,
i32 t) {
// Direction mask: length shifted left by one.
217 u32 _dir = length << 1;
// low = bits of t selected by the mask (_inc - 1)
// (the low bits of t if _inc is a power of two -- TODO confirm).
220 int low = t & (_inc - 1);
// Base index: the remainder of t is shifted left by 2 (x4), low is added back.
221 int i = ((t - low) << 2) + low;
// reverse is set when i has none of the bits of _dir set.
222 bool reverse = ((_dir & i) == 0);
// Load keys 1..3 from offsets _inc, 2 * _inc and 3 * _inc past i.
226 Tkey x1 = m[_inc + i];
227 Tkey x2 = m[2 * _inc + i];
228 Tkey x3 = m[3 * _inc + i];
// Load all four values with the same addressing.
230 Tval idx0 =
id[0 + i];
231 Tval idx1 =
id[_inc + i];
232 Tval idx2 =
id[2 * _inc + i];
233 Tval idx3 =
id[3 * _inc + i];
// Same pairing as the size-4 stencil: distance-2 pairs first, then
// adjacent pairs.
236 _order(x0, x2, idx0, idx2, reverse);
237 _order(x1, x3, idx1, idx3, reverse);
238 _order(x0, x1, idx0, idx1, reverse);
239 _order(x2, x3, idx2, idx3, reverse);
// Write keys 2 and 3 back to the positions they were read from.
244 m[2 * _inc + i] = x2;
245 m[3 * _inc + i] = x3;
// Write values 2 and 3 back likewise.
249 id[2 * _inc + i] = idx2;
250 id[3 * _inc + i] = idx3;
254 inline void order_kernel<2>(
255 Tkey *__restrict__ m, Tval *__restrict__
id,
u32 inc,
u32 length,
i32 t) {
257 u32 _dir = length << 1;
259 int low = t & (_inc - 1);
260 int i = (t << 1) - low;
261 bool reverse = ((_dir & i) == 0);
264 u32 addr_2 = _inc + i;
269 Tval idx0 =
id[addr_1];
270 Tval idx1 =
id[addr_2];
273 _order(x0, x1, idx0, idx1, reverse);