LoadBalanceStrategy.hpp
// -------------------------------------------------------//
//
// SHAMROCK code for hydrodynamics
// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
//
// -------------------------------------------------------//

#pragma once

/**
 * @file LoadBalanceStrategy.hpp
 * @author Timothée David--Cléris (tim.shamrock@proton.me)
 * @brief Load balancing strategies for the patch scheduler
 */

#include "shambackends/sycl.hpp"
#include "shambackends/vec.hpp"
#include "shamcomm/logs.hpp"
#include <algorithm> // std::sort
#include <limits>
#include <vector>

namespace shamrock::scheduler {
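    /// A tile (patch) to balance: an ordering value along the space-filling
    /// curve (e.g. a Hilbert index) and the load attached to it.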
    template<class Torder, class Tweight>
    struct TileWithLoad {
        Torder ordering_val;
        Tweight load_value;
    };
} // namespace shamrock::scheduler

namespace shamrock::scheduler::details {

    template<class Torder, class Tweight>
    struct LoadBalancedTile {
        Torder ordering_val;
        Tweight load_value;
        Tweight accumulated_load_value;
        u64 index;
        i32 new_owner;

        LoadBalancedTile() = default;

        LoadBalancedTile(const TileWithLoad<Torder, Tweight> &in, u64 inindex)
            : ordering_val(in.ordering_val), load_value(in.load_value), index(inindex) {}
    };

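    /**
     * @brief Sort tiles by their ordering value.
     *
     * Sorts the vector in place, in ascending order of ordering_val.
     *
     * @param lb_vec the tiles to sort
     */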
    template<class Torder, class Tweight>
    inline void apply_ordering(std::vector<LoadBalancedTile<Torder, Tweight>> &lb_vec) {
        using LBTileResult = LoadBalancedTile<Torder, Tweight>;
        std::sort(
            lb_vec.begin(),
            lb_vec.end(),
            [](const LBTileResult &left, const LBTileResult &right) {
                return left.ordering_val < right.ordering_val;
            });
    }

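    /**
     * @brief Load balance using parallel sweep strategy based on accumulated load.
     *
     * Tiles are sorted by their ordering value, the load is accumulated along the
     * sweep, and each tile is assigned to rank floor(accumulated_load / target),
     * clamped to [0, wsize - 1], where target is the accumulated load of the last
     * tile divided by wsize.
     *
     * Illustration: loads {4, 2, 2} on wsize = 2 give accumulated values
     * {0, 4, 6}, target = 3, hence owners {0, 1, 1}.
     *
     * @param lb_vector tiles with their ordering values and loads
     * @param wsize number of ranks to balance over
     * @return new owner rank for each input tile
     */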
    template<class Torder, class Tweight>
    inline std::vector<i32> lb_startegy_parallel_sweep(
        const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector, i32 wsize) {

        using LBTile       = TileWithLoad<Torder, Tweight>;
        using LBTileResult = LoadBalancedTile<Torder, Tweight>;

        std::vector<LBTileResult> res(lb_vector.size());
#pragma omp parallel for
        for (u64 i = 0; i < lb_vector.size(); i++) {
            res[i] = LBTileResult{lb_vector[i], i};
        }

        // apply the ordering
        apply_ordering(res);

        // compute increments for load
        u64 accum = 0;
        for (LBTileResult &tile : res) {
            u64 cur_val                 = tile.load_value;
            tile.accumulated_load_value = accum;
            accum += cur_val;
        }

        double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize;

#pragma omp parallel for
        for (u64 i = 0; i < res.size(); i++) {
            LBTileResult &tile = res[i];
            tile.new_owner
                = (target_datacnt == 0)
                      ? 0
                      : sycl::clamp(
                            i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1);
        }

        if (shamcomm::world_rank() == 0
            && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) {
            for (LBTileResult t : res) {
                shamlog_debug_ln(
                    "HilbertLoadBalance",
                    t.ordering_val,
                    t.accumulated_load_value,
                    t.index,
                    (target_datacnt == 0)
                        ? 0
                        : sycl::clamp(
                              i32(t.accumulated_load_value / target_datacnt), 0, i32(wsize) - 1),
                    (target_datacnt == 0) ? 0 : (t.accumulated_load_value / target_datacnt));
            }
        }

        std::vector<i32> new_owners(res.size());
        for (LBTileResult &tile : res) {
            new_owners[tile.index] = tile.new_owner;
        }

        return new_owners;
    }

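    /**
     * @brief Load balance using round-robin strategy ignoring actual load values.
     *
     * Same sweep as lb_startegy_parallel_sweep, but every tile contributes a unit
     * load, so tiles are spread evenly across the ranks regardless of their weight.
     *
     * @param lb_vector tiles with their ordering values and loads
     * @param wsize number of ranks to balance over
     * @return new owner rank for each input tile
     */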
    template<class Torder, class Tweight>
    inline std::vector<i32> lb_startegy_roundrobin(
        const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector, i32 wsize) {

        using LBTile       = TileWithLoad<Torder, Tweight>;
        using LBTileResult = LoadBalancedTile<Torder, Tweight>;

        std::vector<LBTileResult> res(lb_vector.size());
#pragma omp parallel for
        for (u64 i = 0; i < lb_vector.size(); i++) {
            res[i] = LBTileResult{lb_vector[i], i};
        }

        // apply the ordering
        apply_ordering(res);

        // compute increments for load
        u64 accum = 0;
        for (LBTileResult &tile : res) {
            tile.accumulated_load_value = accum;
            // modify the LB above by assuming that each patch has the same load,
            // which effectively does a round-robin balancing
            accum += 1;
        }

        double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize;

#pragma omp parallel for
        for (u64 i = 0; i < res.size(); i++) {
            LBTileResult &tile = res[i];
            tile.new_owner
                = (target_datacnt == 0)
                      ? 0
                      : sycl::clamp(
                            i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1);
        }

        if (shamcomm::world_rank() == 0
            && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) {
            for (LBTileResult t : res) {
                shamlog_debug_ln(
                    "HilbertLoadBalance",
                    t.ordering_val,
                    t.accumulated_load_value,
                    t.index,
                    (target_datacnt == 0)
                        ? 0
                        : sycl::clamp(
                              i32(t.accumulated_load_value / target_datacnt), 0, i32(wsize) - 1),
                    (target_datacnt == 0) ? 0 : (t.accumulated_load_value / target_datacnt));
            }
        }

        std::vector<i32> new_owners(res.size());
        for (LBTileResult &tile : res) {
            new_owners[tile.index] = tile.new_owner;
        }

        return new_owners;
    }

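    /// Load balance quality metrics: statistics of the per-rank load.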
    struct LBMetric {
        f64 min;
        f64 max;
        f64 mean;
        f64 stddev;
    };

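    /**
     * @brief Compute load balance quality metrics.
     *
     * Accumulates the load assigned to each rank and returns the min, max, mean
     * and standard deviation of the per-rank load, each scaled by
     * strategy_weight (used to bias the strategy selection in load_balance).
     *
     * @param lb_vector tiles with their loads
     * @param new_owners owner rank of each tile
     * @param world_size number of ranks
     * @param strategy_weight scaling factor applied to the returned metrics
     * @return the scaled LBMetric
     */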
    template<class Torder, class Tweight>
    inline LBMetric compute_LB_metric(
        const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector,
        const std::vector<i32> &new_owners,
        i32 world_size,
        f64 strategy_weight) {

        std::vector<u64> load_per_node(world_size, 0);

        for (u64 i = 0; i < lb_vector.size(); i++) {
            load_per_node[new_owners[i]] += lb_vector[i].load_value;
        }

        // min/max accumulators (reconstructed: min is seeded with the largest f64
        // so that the first fmin overwrites it)
        f64 min = std::numeric_limits<f64>::max();
        f64 max = 0;
        f64 avg = 0;
        f64 var = 0;

        for (i32 nid = 0; nid < world_size; nid++) {
            f64 val = load_per_node[nid];
            min     = sycl::fmin(min, val);
            max     = sycl::fmax(max, val);
            avg += val;

            // shamlog_debug_ln("HilbertLoadBalance", "node :", nid, "load :", load_per_node[nid]);
        }
        avg /= world_size;
        for (i32 nid = 0; nid < world_size; nid++) {
            f64 val = load_per_node[nid];
            var += (val - avg) * (val - avg);
        }
        var /= world_size;

        return {
            min * strategy_weight,
            max * strategy_weight,
            avg * strategy_weight,
            sycl::sqrt(var) * strategy_weight};
    }

} // namespace shamrock::scheduler::details

namespace shamrock::scheduler {

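    /**
     * @brief Load balance the input vector.
     *
     * Runs both the parallel sweep and the round-robin strategies, compares the
     * resulting max per-rank load (with the round-robin metric boosted by 5%),
     * and returns the owner map of the winning strategy.
     *
     * Minimal usage sketch (values are illustrative; assumes MPI is initialized
     * so that shamcomm::world_size() is valid):
     * @code
     * using namespace shamrock::scheduler;
     * std::vector<TileWithLoad<u64, u64>> tiles = {
     *     {0, 10}, // ordering value, load
     *     {1, 2},
     *     {2, 5},
     * };
     * // new_owners[i] is the rank that should own tiles[i]
     * std::vector<i32> new_owners = load_balance(std::move(tiles));
     * @endcode
     *
     * @param lb_vector tiles with their ordering values and loads
     * @param world_size number of MPI ranks to balance over
     * @return new owner rank for each input tile
     */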
    template<class Torder, class Tweight>
    inline std::vector<i32> load_balance(
        std::vector<TileWithLoad<Torder, Tweight>> &&lb_vector,
        i32 world_size = shamcomm::world_size()) {

        using namespace details;

        f64 factor_boost_psweep = 1;
        auto tmpres             = lb_startegy_parallel_sweep(lb_vector, world_size);
        auto metric_psweep = compute_LB_metric(lb_vector, tmpres, world_size, factor_boost_psweep);

        // Boost the round-robin strategy to favor it when the difference is within
        // ~5%, since the increased uniformity will probably offset the cost anyway.
        f64 factor_boost_rrobin = 0.95;
        auto tmpres_2           = lb_startegy_roundrobin(lb_vector, world_size);
        auto metric_rrobin
            = compute_LB_metric(lb_vector, tmpres_2, world_size, factor_boost_rrobin);

        std::string strategy_name = "parallel sweep";
        if (metric_rrobin.max < metric_psweep.max) {
            tmpres        = tmpres_2;
            strategy_name = "round robin";
        }

        if (shamcomm::world_rank() == 0) {
            logger::info_ln(
                "LoadBalance",
                shambase::format(
                    R"=(Summary (strategy = {0:}):
    - strategy "psweep" : max = {1:.1f} min = {2:.1f} factor = {3:}
    - strategy "round robin" : max = {4:.1f} min = {5:.1f} factor = {6:})=",
                    strategy_name,
                    metric_psweep.max,
                    metric_psweep.min,
                    factor_boost_psweep,
                    metric_rrobin.max,
                    metric_rrobin.min,
                    factor_boost_rrobin));
        }
        return tmpres;
    }

} // namespace shamrock::scheduler