26namespace shamrock::scheduler {
27 template<
class Torder,
class Tweight>
34namespace shamrock::scheduler::details {
36 template<
class Torder,
class Tweight>
40 Tweight accumulated_load_value;
47 : ordering_val(in.ordering_val), load_value(in.load_value), index(inindex) {}
57 template<
class Torder,
class Tweight>
60 std::sort(lb_vec.begin(), lb_vec.end(), [](LBTileResult &left, LBTileResult &right) {
61 return left.ordering_val < right.ordering_val;
74 template<
class Torder,
class Tweight>
81 std::vector<LBTileResult> res(lb_vector.size());
82#pragma omp parallel for
83 for (
u64 i = 0; i < lb_vector.size(); i++) {
84 res[i] = LBTileResult{lb_vector[i], i};
92 for (LBTileResult &tile : res) {
93 u64 cur_val = tile.load_value;
94 tile.accumulated_load_value = accum;
98 double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize;
100#pragma omp parallel for
101 for (
u64 i = 0; i < res.size(); i++) {
102 LBTileResult &tile = res[i];
104 = (target_datacnt == 0)
107 i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1);
112 for (LBTileResult t : res) {
114 "HilbertLoadBalance",
116 t.accumulated_load_value,
118 (target_datacnt == 0)
121 i32(t.accumulated_load_value / target_datacnt), 0,
i32(wsize) - 1),
122 (target_datacnt == 0) ? 0 : (t.accumulated_load_value / target_datacnt));
126 std::vector<i32> new_owners(res.size());
127 for (LBTileResult &tile : res) {
128 new_owners[tile.index] = tile.new_owner;
143 template<
class Torder,
class Tweight>
150 std::vector<LBTileResult> res(lb_vector.size());
151#pragma omp parallel for
152 for (
u64 i = 0; i < lb_vector.size(); i++) {
153 res[i] = LBTileResult{lb_vector[i], i};
161 for (LBTileResult &tile : res) {
162 tile.accumulated_load_value = accum;
168 double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize;
170#pragma omp parallel for
171 for (
u64 i = 0; i < res.size(); i++) {
172 LBTileResult &tile = res[i];
174 = (target_datacnt == 0)
177 i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1);
182 for (LBTileResult t : res) {
184 "HilbertLoadBalance",
186 t.accumulated_load_value,
188 (target_datacnt == 0)
191 i32(t.accumulated_load_value / target_datacnt), 0,
i32(wsize) - 1),
192 (target_datacnt == 0) ? 0 : (t.accumulated_load_value / target_datacnt));
196 std::vector<i32> new_owners(res.size());
197 for (LBTileResult &tile : res) {
198 new_owners[tile.index] = tile.new_owner;
221 template<
class Torder,
class Tweight>
224 const std::vector<i32> &new_owners,
226 f64 strategy_weight) {
228 std::vector<u64> load_per_node(world_size, 0);
230 for (
u64 i = 0; i < lb_vector.size(); i++) {
231 load_per_node[new_owners[i]] += lb_vector[i].load_value;
239 for (
i32 nid = 0; nid < world_size; nid++) {
240 f64 val = load_per_node[nid];
241 min = sycl::fmin(min, val);
242 max = sycl::fmax(max, val);
248 for (
i32 nid = 0; nid < world_size; nid++) {
249 f64 val = load_per_node[nid];
250 var += (val - avg) * (val - avg);
255 min * strategy_weight,
256 max * strategy_weight,
257 avg * strategy_weight,
258 sycl::sqrt(var) * strategy_weight};
263namespace shamrock::scheduler {
273 template<
class Torder,
class Tweight>
280 f64 factor_boost_psweep = 1;
281 auto tmpres = lb_startegy_parallel_sweep(lb_vector, world_size);
282 auto metric_psweep = compute_LB_metric(lb_vector, tmpres, world_size, factor_boost_psweep);
286 f64 factor_boost_rrobin = 0.95;
287 auto tmpres_2 = lb_startegy_roundrobin(lb_vector, world_size);
289 = compute_LB_metric(lb_vector, tmpres_2, world_size, factor_boost_rrobin);
291 std::string strategy_name =
"parallel sweep";
292 if (metric_rrobin.max < metric_psweep.max) {
294 strategy_name =
"round robin";
301 R
"=(Summary (strategy = {0:}):
302 - strategy "psweep" : max = {1:.1f} min = {2:.1f} factor = {3:}
303 - strategy "round robin" : max = {4:.1f} min = {5:.1f} factor = {6:})=",
310 factor_boost_rrobin));
std::vector< i32 > load_balance(std::vector< TileWithLoad< Torder, Tweight > > &&lb_vector, i32 world_size=shamcomm::world_size())
Load balance the input vector of tiles; returns the new owner rank for each tile.
std::vector< i32 > lb_startegy_roundrobin(const std::vector< TileWithLoad< Torder, Tweight > > &lb_vector, i32 wsize)
Load balance using round-robin strategy ignoring actual load values.
LBMetric compute_LB_metric(const std::vector< TileWithLoad< Torder, Tweight > > &lb_vector, const std::vector< i32 > &new_owners, i32 world_size, f64 strategy_weight)
Compute load balance quality metrics.
std::vector< i32 > lb_startegy_parallel_sweep(const std::vector< TileWithLoad< Torder, Tweight > > &lb_vector, i32 wsize)
Load balance using parallel sweep strategy based on accumulated load.
void apply_ordering(std::vector< LoadBalancedTile< Torder, Tweight > > &lb_vec)
Sort tiles by their ordering value.
double f64
Alias for double.
std::uint64_t u64
64-bit unsigned integer.
std::int32_t i32
32-bit signed integer.
Namespace for internal details of the logs module.
i8 get_loglevel()
Get the current global log level.
i32 world_rank()
Gives the rank of the current process in the MPI communicator.
i32 world_size()
Gives the size of the MPI communicator.
Functions related to the MPI communicator.