LoadBalanceStrategy.hpp
// -------------------------------------------------------//
//
// SHAMROCK code for hydrodynamics
// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
//
// -------------------------------------------------------//

#pragma once

/**
 * @file LoadBalanceStrategy.hpp
 * @author Timothée David--Cléris (tim.shamrock@proton.me)
 * @brief Load balancing strategies for the patch scheduler
 */

#include "shambackends/sycl.hpp"
#include "shambackends/vec.hpp"
#include "shamcomm/logs.hpp"
#include <algorithm> // std::sort
#include <limits>
#include <vector>

namespace shamrock::scheduler {
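    /// A tile (patch) to balance: an ordering value along the space-filling
    /// curve (e.g. a Hilbert index) and the load attached to it.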
    template<class Torder, class Tweight>
    struct TileWithLoad {
        Torder ordering_val;
        Tweight load_value;
    };
} // namespace shamrock::scheduler

namespace shamrock::scheduler::details {

    template<class Torder, class Tweight>
    struct LoadBalancedTile {
        Torder ordering_val;
        Tweight load_value;
        Tweight accumulated_load_value;
        u64 index;
        i32 new_owner;

        LoadBalancedTile() = default;

        LoadBalancedTile(const TileWithLoad<Torder, Tweight> &in, u64 inindex)
            : ordering_val(in.ordering_val), load_value(in.load_value), index(inindex) {}
    };

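    /**
     * @brief Sort tiles by their ordering value.
     *
     * Sorts the vector in place, in ascending order of ordering_val.
     *
     * @param lb_vec the tiles to sort
     */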
    template<class Torder, class Tweight>
    inline void apply_ordering(std::vector<LoadBalancedTile<Torder, Tweight>> &lb_vec) {
        using LBTileResult = LoadBalancedTile<Torder, Tweight>;
        std::sort(
            lb_vec.begin(),
            lb_vec.end(),
            [](const LBTileResult &left, const LBTileResult &right) {
                return left.ordering_val < right.ordering_val;
            });
    }

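    /**
     * @brief Load balance using parallel sweep strategy based on accumulated load.
     *
     * Tiles are sorted by their ordering value, the load is accumulated along the
     * sweep, and each tile is assigned to rank floor(accumulated_load / target),
     * clamped to [0, wsize - 1], where target is the accumulated load of the last
     * tile divided by wsize.
     *
     * Illustration: loads {4, 2, 2} on wsize = 2 give accumulated values
     * {0, 4, 6}, target = 3, hence owners {0, 1, 1}.
     *
     * @param lb_vector tiles with their ordering values and loads
     * @param wsize number of ranks to balance over
     * @return new owner rank for each input tile
     */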
    template<class Torder, class Tweight>
    inline std::vector<i32> lb_startegy_parallel_sweep(
        const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector, i32 wsize) {

        using LBTile       = TileWithLoad<Torder, Tweight>;
        using LBTileResult = LoadBalancedTile<Torder, Tweight>;

        std::vector<LBTileResult> res(lb_vector.size());
#pragma omp parallel for
        for (u64 i = 0; i < lb_vector.size(); i++) {
            res[i] = LBTileResult{lb_vector[i], i};
        }

        // apply the ordering
        apply_ordering(res);

        // compute increments for load
        u64 accum = 0;
        for (LBTileResult &tile : res) {
            u64 cur_val                 = tile.load_value;
            tile.accumulated_load_value = accum;
            accum += cur_val;
        }

        double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize;

#pragma omp parallel for
        for (u64 i = 0; i < res.size(); i++) {
            LBTileResult &tile = res[i];
            tile.new_owner
                = (target_datacnt == 0)
                      ? 0
                      : sycl::clamp(
                            i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1);
        }

        if (shamcomm::world_rank() == 0
            && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) {
            for (LBTileResult t : res) {
                shamlog_debug_ln(
                    "HilbertLoadBalance",
                    t.ordering_val,
                    t.accumulated_load_value,
                    t.index,
                    (target_datacnt == 0)
                        ? 0
                        : sycl::clamp(
                              i32(t.accumulated_load_value / target_datacnt), 0, i32(wsize) - 1),
                    (target_datacnt == 0) ? 0 : (t.accumulated_load_value / target_datacnt));
            }
        }

        std::vector<i32> new_owners(res.size());
        for (LBTileResult &tile : res) {
            new_owners[tile.index] = tile.new_owner;
        }

        return new_owners;
    }

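    /**
     * @brief Load balance using round-robin strategy ignoring actual load values.
     *
     * Same sweep as lb_startegy_parallel_sweep, but every tile contributes a unit
     * load, so tiles are spread evenly across the ranks regardless of their weight.
     *
     * @param lb_vector tiles with their ordering values and loads
     * @param wsize number of ranks to balance over
     * @return new owner rank for each input tile
     */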
    template<class Torder, class Tweight>
    inline std::vector<i32> lb_startegy_roundrobin(
        const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector, i32 wsize) {

        using LBTile       = TileWithLoad<Torder, Tweight>;
        using LBTileResult = LoadBalancedTile<Torder, Tweight>;

        std::vector<LBTileResult> res(lb_vector.size());
#pragma omp parallel for
        for (u64 i = 0; i < lb_vector.size(); i++) {
            res[i] = LBTileResult{lb_vector[i], i};
        }

        // apply the ordering
        apply_ordering(res);

        // compute increments for load
        u64 accum = 0;
        for (LBTileResult &tile : res) {
            tile.accumulated_load_value = accum;
            // modify the LB above by assuming that each patch has the same load,
            // which effectively does a round-robin balancing
            accum += 1;
        }

        double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize;

#pragma omp parallel for
        for (u64 i = 0; i < res.size(); i++) {
            LBTileResult &tile = res[i];
            tile.new_owner
                = (target_datacnt == 0)
                      ? 0
                      : sycl::clamp(
                            i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1);
        }

        if (shamcomm::world_rank() == 0
            && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) {
            for (LBTileResult t : res) {
                shamlog_debug_ln(
                    "HilbertLoadBalance",
                    t.ordering_val,
                    t.accumulated_load_value,
                    t.index,
                    (target_datacnt == 0)
                        ? 0
                        : sycl::clamp(
                              i32(t.accumulated_load_value / target_datacnt), 0, i32(wsize) - 1),
                    (target_datacnt == 0) ? 0 : (t.accumulated_load_value / target_datacnt));
            }
        }

        std::vector<i32> new_owners(res.size());
        for (LBTileResult &tile : res) {
            new_owners[tile.index] = tile.new_owner;
        }

        return new_owners;
    }

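    /// Load balance quality metrics: statistics of the per-rank load.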
    struct LBMetric {
        f64 min;
        f64 max;
        f64 mean;
        f64 stddev;
    };

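    /**
     * @brief Compute load balance quality metrics.
     *
     * Accumulates the load assigned to each rank and returns the min, max, mean
     * and standard deviation of the per-rank load, each scaled by
     * strategy_weight (used to bias the strategy selection in load_balance).
     *
     * @param lb_vector tiles with their loads
     * @param new_owners owner rank of each tile
     * @param world_size number of ranks
     * @param strategy_weight scaling factor applied to the returned metrics
     * @return the scaled LBMetric
     */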
    template<class Torder, class Tweight>
    inline LBMetric compute_LB_metric(
        const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector,
        const std::vector<i32> &new_owners,
        i32 world_size,
        f64 strategy_weight) {

        std::vector<u64> load_per_node(world_size, 0);

        for (u64 i = 0; i < lb_vector.size(); i++) {
            load_per_node[new_owners[i]] += lb_vector[i].load_value;
        }

        // min/max accumulators (reconstructed: min is seeded with the largest f64
        // so that the first fmin overwrites it)
        f64 min = std::numeric_limits<f64>::max();
        f64 max = 0;
        f64 avg = 0;
        f64 var = 0;

        for (i32 nid = 0; nid < world_size; nid++) {
            f64 val = load_per_node[nid];
            min     = sycl::fmin(min, val);
            max     = sycl::fmax(max, val);
            avg += val;

            // shamlog_debug_ln("HilbertLoadBalance", "node :", nid, "load :", load_per_node[nid]);
        }
        avg /= world_size;
        for (i32 nid = 0; nid < world_size; nid++) {
            f64 val = load_per_node[nid];
            var += (val - avg) * (val - avg);
        }
        var /= world_size;

        return {
            min * strategy_weight,
            max * strategy_weight,
            avg * strategy_weight,
            sycl::sqrt(var) * strategy_weight};
    }

} // namespace shamrock::scheduler::details

namespace shamrock::scheduler {

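    /**
     * @brief Load balance the input vector.
     *
     * Runs both the parallel sweep and the round-robin strategies, compares the
     * resulting max per-rank load (with the round-robin metric boosted by 5%),
     * and returns the owner map of the winning strategy.
     *
     * Minimal usage sketch (values are illustrative; assumes MPI is initialized
     * so that shamcomm::world_size() is valid):
     * @code
     * using namespace shamrock::scheduler;
     * std::vector<TileWithLoad<u64, u64>> tiles = {
     *     {0, 10}, // ordering value, load
     *     {1, 2},
     *     {2, 5},
     * };
     * // new_owners[i] is the rank that should own tiles[i]
     * std::vector<i32> new_owners = load_balance(std::move(tiles));
     * @endcode
     *
     * @param lb_vector tiles with their ordering values and loads
     * @param world_size number of MPI ranks to balance over
     * @return new owner rank for each input tile
     */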
    template<class Torder, class Tweight>
    inline std::vector<i32> load_balance(
        std::vector<TileWithLoad<Torder, Tweight>> &&lb_vector,
        i32 world_size = shamcomm::world_size()) {

        using namespace details;

        f64 factor_boost_psweep = 1;
        auto tmpres             = lb_startegy_parallel_sweep(lb_vector, world_size);
        auto metric_psweep = compute_LB_metric(lb_vector, tmpres, world_size, factor_boost_psweep);

        // Boost the round-robin strategy to favor it when the difference is within
        // ~5%, since the increased uniformity will probably offset the cost anyway.
        f64 factor_boost_rrobin = 0.95;
        auto tmpres_2           = lb_startegy_roundrobin(lb_vector, world_size);
        auto metric_rrobin
            = compute_LB_metric(lb_vector, tmpres_2, world_size, factor_boost_rrobin);

        std::string strategy_name = "parallel sweep";
        if (metric_rrobin.max < metric_psweep.max) {
            tmpres        = tmpres_2;
            strategy_name = "round robin";
        }

        if (shamcomm::world_rank() == 0) {
            logger::info_ln(
                "LoadBalance",
                shambase::format(
                    R"=(Summary (strategy = {0:}):
    - strategy "psweep" : max = {1:.1f} min = {2:.1f} factor = {3:}
    - strategy "round robin" : max = {4:.1f} min = {5:.1f} factor = {6:})=",
                    strategy_name,
                    metric_psweep.max,
                    metric_psweep.min,
                    factor_boost_psweep,
                    metric_rrobin.max,
                    metric_rrobin.min,
                    factor_boost_rrobin));
        }
        return tmpres;
    }

} // namespace shamrock::scheduler