25#ifdef SHAMROCK_USE_GEOPM
26 #include <geopm/PlatformIO.hpp>
27 #include <geopm/PlatformTopo.hpp>
32#ifdef SHAMROCK_USE_GEOPM
/// System metric reporter that obtains its energy readings by calling
/// geopm::platform_io().read_signal().
///
/// NOTE(review): this listing omits several original lines (the closing braces
/// of the getters and whatever they return off the main node rank are not
/// visible); the comments below only describe the statements that are shown.
34 class AuroraSystemMetricReporterLinked :
public ISystemMetricReporter {
// When shamcomm::is_main_node_rank() is true, returns the "BOARD_ENERGY"
// signal read at GEOPM_DOMAIN_BOARD, index 0.
36 std::optional<f64> get_rank_energy_consummed()
override {
37 if (shamcomm::is_main_node_rank()) {
38 return geopm::platform_io().read_signal(
"BOARD_ENERGY", GEOPM_DOMAIN_BOARD, 0);
// Same pattern for the "GPU_ENERGY" signal, also requested at
// GEOPM_DOMAIN_BOARD, index 0.
43 std::optional<f64> get_gpu_energy_consummed()
override {
44 if (shamcomm::is_main_node_rank()) {
45 return geopm::platform_io().read_signal(
"GPU_ENERGY", GEOPM_DOMAIN_BOARD, 0);
// Same pattern for the "CPU_ENERGY" signal.
50 std::optional<f64> get_cpu_energy_consummed()
override {
51 if (shamcomm::is_main_node_rank()) {
52 return geopm::platform_io().read_signal(
"CPU_ENERGY", GEOPM_DOMAIN_BOARD, 0);
// Same pattern for the "DRAM_ENERGY" signal.
57 std::optional<f64> get_dram_energy_consummed()
override {
58 if (shamcomm::is_main_node_rank()) {
59 return geopm::platform_io().read_signal(
"DRAM_ENERGY", GEOPM_DOMAIN_BOARD, 0);
// The four support queries below all return true for this reporter.
64 bool support_rank_energy_consummed()
override {
return true; }
65 bool support_gpu_energy_consummed()
override {
return true; }
66 bool support_cpu_energy_consummed()
override {
return true; }
67 bool support_dram_energy_consummed()
override {
return true; }
/// Rank energy getter. On the main node rank (shamcomm::is_main_node_rank()),
/// parses the text in `output` with std::stoull and returns it implicitly
/// converted to f64.
/// NOTE(review): the statement that fills `output` is on a line missing from
/// this listing. std::stoull stops at the first non-digit, so a fractional
/// reading would be truncated, and it throws on non-numeric text — confirm the
/// producer emits a plain unsigned integer.
73 std::optional<f64> get_rank_energy_consummed()
override {
74 if (shamcomm::is_main_node_rank()) {
76 return std::stoull(output.c_str());
/// GPU energy getter. On the main node rank, parses the text in `output` with
/// std::stoull and returns it implicitly converted to f64.
/// NOTE(review): the statement that fills `output` is not visible in this
/// listing; std::stoull drops any fractional part and throws on non-numeric text.
81 std::optional<f64> get_gpu_energy_consummed()
override {
82 if (shamcomm::is_main_node_rank()) {
84 return std::stoull(output.c_str());
/// CPU energy getter. On the main node rank, parses the text in `output` with
/// std::stoull and returns it implicitly converted to f64.
/// NOTE(review): the statement that fills `output` is not visible in this
/// listing; std::stoull drops any fractional part and throws on non-numeric text.
89 std::optional<f64> get_cpu_energy_consummed()
override {
90 if (shamcomm::is_main_node_rank()) {
92 return std::stoull(output.c_str());
/// DRAM energy getter. On the main node rank, parses the text in `output` with
/// std::stoull and returns it implicitly converted to f64.
/// NOTE(review): the statement that fills `output` is not visible in this
/// listing; std::stoull drops any fractional part and throws on non-numeric text.
97 std::optional<f64> get_dram_energy_consummed()
override {
98 if (shamcomm::is_main_node_rank()) {
100 return std::stoull(output.c_str());
/// Rank-energy support query: always returns true for this reporter.
105 bool support_rank_energy_consummed()
override {
return true; }
/// GPU-energy support query: always returns true for this reporter.
106 bool support_gpu_energy_consummed()
override {
return true; }
/// CPU-energy support query: always returns true for this reporter.
107 bool support_cpu_energy_consummed()
override {
return true; }
/// DRAM-energy support query: always returns true for this reporter.
108 bool support_dram_energy_consummed()
override {
return true; }
/// Rank energy getter backed by the Linux powercap sysfs counter.
/// On the main node rank, the text produced by
/// `cat /sys/class/powercap/intel-rapl:0/energy_uj` is parsed with std::stoull,
/// converted to f64 and scaled by 1e-6 (the `_uj` suffix denotes microjoules,
/// so the result is presumably in joules).
/// NOTE(review): the call that runs the command and declares `output` is on a
/// line missing from this listing (presumably popen_fetch_output — confirm).
/// Only the `intel-rapl:0` zone is read; verify that this is the intended
/// scope on multi-socket nodes and that counter wrap-around is handled by
/// the caller.
113 std::optional<f64> get_rank_energy_consummed()
override {
114 if (shamcomm::is_main_node_rank()) {
116 "cat /sys/class/powercap/intel-rapl:0/energy_uj");
117 return f64(std::stoull(output.c_str())) * 1e-6;
/// GPU energy is not provided by this reporter: always returns std::nullopt.
122 std::optional<f64> get_gpu_energy_consummed()
override {
return std::nullopt; }
/// CPU energy is not provided by this reporter: always returns std::nullopt.
124 std::optional<f64> get_cpu_energy_consummed()
override {
return std::nullopt; }
/// DRAM energy is not provided by this reporter: always returns std::nullopt.
126 std::optional<f64> get_dram_energy_consummed()
override {
return std::nullopt; }
/// Rank-energy support query: always returns true for this reporter.
128 bool support_rank_energy_consummed()
override {
return true; }
/// GPU-energy support query: always returns false for this reporter.
129 bool support_gpu_energy_consummed()
override {
return false; }
/// CPU-energy support query: always returns false for this reporter.
130 bool support_cpu_energy_consummed()
override {
return false; }
/// DRAM-energy support query: always returns false for this reporter.
131 bool support_dram_energy_consummed()
override {
return false; }
/// Always returns std::nullopt: this reporter yields no rank energy value.
136 std::optional<f64> get_rank_energy_consummed()
override {
return std::nullopt; }
/// Always returns std::nullopt: this reporter yields no GPU energy value.
137 std::optional<f64> get_gpu_energy_consummed()
override {
return std::nullopt; }
/// Always returns std::nullopt: this reporter yields no CPU energy value.
138 std::optional<f64> get_cpu_energy_consummed()
override {
return std::nullopt; }
/// Always returns std::nullopt: this reporter yields no DRAM energy value.
139 std::optional<f64> get_dram_energy_consummed()
override {
return std::nullopt; }
/// Rank-energy support query: always returns false for this reporter.
141 bool support_rank_energy_consummed()
override {
return false; }
/// GPU-energy support query: always returns false for this reporter.
142 bool support_gpu_energy_consummed()
override {
return false; }
/// CPU-energy support query: always returns false for this reporter.
143 bool support_cpu_energy_consummed()
override {
return false; }
/// DRAM-energy support query: always returns false for this reporter.
144 bool support_dram_energy_consummed()
override {
return false; }
/// Tells whether the active reporter is something other than the no-op one.
///
/// Fetches the reporter from current_reporter() and returns true when a
/// dynamic_cast of its raw pointer to NoopSystemMetricReporter* yields nullptr.
/// NOTE(review): a null reporter pointer would also make the cast yield
/// nullptr; the lines between the two visible statements are missing from
/// this listing and may guard against that — confirm.
147 bool has_reporter() {
148 auto &reporter = current_reporter();
153 return dynamic_cast<NoopSystemMetricReporter *
>(reporter.get()) ==
nullptr;
/// Builds the reporter that matches `reporter_name`.
///
/// Names handled in the visible code: "aurora", "aurora-linked" (only inside
/// the SHAMROCK_USE_GEOPM conditional), "intel-rapl", and "noop" / "none" / ""
/// for the no-op reporter. The last visible statement returns a no-op reporter.
/// NOTE(review): the matching #endif, the opening of the fallback branch and
/// the call that consumes the "Unknown system metrics reporter" message are on
/// lines missing from this listing.
156 std::unique_ptr<ISystemMetricReporter> make_reporter(std::string_view reporter_name) {
157 if (reporter_name ==
"aurora") {
158 return std::make_unique<AuroraSystemMetricReporter>();
// The "aurora-linked" branch is compiled only when SHAMROCK_USE_GEOPM is defined.
159#ifdef SHAMROCK_USE_GEOPM
160 }
else if (reporter_name ==
"aurora-linked") {
161 return std::make_unique<AuroraSystemMetricReporterLinked>();
163 }
else if (reporter_name ==
"intel-rapl") {
164 return std::make_unique<IntelRAPLSystemMetricReport>();
165 }
else if (reporter_name ==
"noop" || reporter_name ==
"none" || reporter_name ==
"") {
166 return std::make_unique<NoopSystemMetricReporter>();
// Message used for an unrecognised name (its continuation is not visible here).
169 "Unknown system metrics reporter: {}, valid reporters are: aurora, aurora-linked, "
173 return std::make_unique<NoopSystemMetricReporter>();
/// Builds the default reporter.
///
/// When SHAM_SYSTEM_METRICS_REPORTER tests true, its dereferenced value is
/// passed to make_reporter(std::string_view); the other visible statement
/// returns a no-op reporter.
/// NOTE(review): SHAM_SYSTEM_METRICS_REPORTER is declared outside this
/// listing — presumably an optional string taken from the environment; confirm.
176 std::unique_ptr<ISystemMetricReporter> make_reporter() {
177 if (SHAM_SYSTEM_METRICS_REPORTER) {
178 return make_reporter(*SHAM_SYSTEM_METRICS_REPORTER);
180 return std::make_unique<NoopSystemMetricReporter>();
/// Accessor for the process-wide reporter instance.
///
/// Returns a reference to a function-local static std::unique_ptr that starts
/// as nullptr and is assigned the result of make_reporter().
/// NOTE(review): the guard around the assignment and the return statement are
/// on lines missing from this listing — presumably the reporter is created
/// only while the pointer is still null; confirm.
191 std::unique_ptr<ISystemMetricReporter> &current_reporter() {
192 static std::unique_ptr<ISystemMetricReporter> reporter =
nullptr;
194 reporter = make_reporter();
/// Collects one SystemMetrics sample from the energy getters.
///
/// @param barrier Requested synchronisation flag; it is kept only when
///                has_reporter() is also true, so no synchronisation is
///                requested when only the no-op reporter is active.
/// NOTE(review): the statements that act on `barrier` and supply the first
/// member of the aggregate are on lines missing from this listing.
200 SystemMetrics get_system_metrics(
bool barrier) {
202 barrier = barrier && has_reporter();
// Aggregate-initialised from the rank, GPU, CPU and DRAM energy getters, in that order.
208 auto ret = SystemMetrics{
210 get_rank_energy_consummed(),
211 get_gpu_energy_consummed(),
212 get_cpu_energy_consummed(),
213 get_dram_energy_consummed()};
/// Gathers the metrics of every rank into a vector of SystemMetrics.
///
/// Each energy field and the wall time of `input` are passed through
/// shamalgs::collective::gather, then recombined per index `i`.
/// NOTE(review): the loop header, the declaration of `ret`, the else arms of
/// the conditionals and the return statement are on lines missing from this
/// listing.
220 std::vector<SystemMetrics> gather_rank_metrics(
const SystemMetrics &input) {
// An empty optional is sent as 0 so that a value is always passed to gather.
// NOTE(review): after the gather, a rank that had no reading cannot be told
// apart from a rank that measured exactly 0.
223 auto optional_gather_power = [&](
const std::optional<f64> &value) -> std::vector<f64> {
224 return shamalgs::collective::gather(value ? value.value() : 0._f64);
227 std::vector<f64> rank_energy_consummed_all_ranks
228 = optional_gather_power(input.rank_energy_consummed);
229 std::vector<f64> gpu_energy_consummed_all_ranks
230 = optional_gather_power(input.gpu_energy_consummed);
231 std::vector<f64> cpu_energy_consummed_all_ranks
232 = optional_gather_power(input.cpu_energy_consummed);
233 std::vector<f64> dram_energy_consummed_all_ranks
234 = optional_gather_power(input.dram_energy_consummed);
235 std::vector<f64> metric_time_all_ranks = shamalgs::collective::gather(input.wall_time);
// Rebuild entry i: a gathered energy value is wrapped in an optional only when
// the matching shamsys::support_*() query is true.
238 ret[i] = SystemMetrics{
239 metric_time_all_ranks[i],
240 (shamsys::support_rank_energy_consummed())
241 ? std::optional<f64>{rank_energy_consummed_all_ranks[i]}
243 (shamsys::support_gpu_energy_consummed())
244 ? std::optional<f64>{gpu_energy_consummed_all_ranks[i]}
246 (shamsys::support_cpu_energy_consummed())
247 ? std::optional<f64>{cpu_energy_consummed_all_ranks[i]}
249 (shamsys::support_dram_energy_consummed())
250 ? std::optional<f64>{dram_energy_consummed_all_ranks[i]}
/// Reduces per-rank metrics to a single SystemMetrics value.
///
/// Energy fields are summed over `input` (a missing value counts as 0) and the
/// wall time is the maximum over `input`, starting from 0. An empty `input`
/// therefore gives zero sums and a zero wall time.
/// @return The aggregated SystemMetrics.
/// NOTE(review): the else arms of the final conditionals are on lines missing
/// from this listing.
258 SystemMetrics aggregate_rank_metrics(
const std::vector<SystemMetrics> &input) {
259 f64 sum_rank_energy_consummed = 0._f64;
260 f64 sum_gpu_energy_consummed = 0._f64;
261 f64 sum_cpu_energy_consummed = 0._f64;
262 f64 sum_dram_energy_consummed = 0._f64;
263 f64 metric_time_all = 0._f64;
265 for (
const auto &m : input) {
266 sum_rank_energy_consummed
267 += (m.rank_energy_consummed ? m.rank_energy_consummed.value() : 0._f64);
268 sum_gpu_energy_consummed
269 += (m.gpu_energy_consummed ? m.gpu_energy_consummed.value() : 0._f64);
270 sum_cpu_energy_consummed
271 += (m.cpu_energy_consummed ? m.cpu_energy_consummed.value() : 0._f64);
272 sum_dram_energy_consummed
273 += (m.dram_energy_consummed ? m.dram_energy_consummed.value() : 0._f64);
// Wall time is not summed: the largest per-rank value is kept.
274 metric_time_all = std::max(metric_time_all, m.wall_time);
277 SystemMetrics system_metrics;
278 system_metrics.wall_time = metric_time_all;
// A sum is stored only when the matching shamsys::support_*() query is true.
279 system_metrics.rank_energy_consummed = (shamsys::support_rank_energy_consummed())
280 ? sum_rank_energy_consummed
282 system_metrics.gpu_energy_consummed = (shamsys::support_gpu_energy_consummed())
283 ? sum_gpu_energy_consummed
285 system_metrics.cpu_energy_consummed = (shamsys::support_cpu_energy_consummed())
286 ? sum_cpu_energy_consummed
288 system_metrics.dram_energy_consummed = (shamsys::support_dram_energy_consummed())
289 ? sum_dram_energy_consummed
292 return system_metrics;
// Helper lambda that turns an optional energy value into display strings.
// When `energy` holds a value and both `wall_time` and that value are
// strictly positive, it writes the mean power (energy / wall_time, "{:.1f} W")
// to `out_power` and the energy itself ("{:.1f} J") to `out_energy`.
// NOTE(review): `wall_time` is not captured (the capture list is empty), so it
// is presumably a parameter declared on a line missing from this listing; the
// branches for the remaining cases are not visible either.
296 auto format_metric = [](
const std::optional<f64> &energy,
298 std::optional<std::string> &out_power,
299 std::optional<std::string> &out_energy) {
300 if (energy.has_value()) {
301 if (wall_time > 0._f64 && energy.value() > 0._f64) {
302 f64 consumed_energy = energy.value();
303 f64 power = consumed_energy / wall_time;
304 out_power = shambase::format(
"{:.1f} W", power);
305 out_energy = shambase::format(
"{:.1f} J", consumed_energy);
// NOTE(review): the remaining lines are fragments of the enclosing function;
// the statements they belong to are only partly visible. They format
// input.wall_time as "{:.1f} s" and pass each energy field of `input` together
// with the matching field of `ret` (presumably to format_metric — confirm).
314 shambase::format(
"{:.1f} s", input.wall_time),
326 input.rank_energy_consummed,
329 ret.rank_energy_consummed);
331 input.gpu_energy_consummed,
334 ret.gpu_energy_consummed );
336 input.cpu_energy_consummed,
339 ret.cpu_energy_consummed );
341 input.dram_energy_consummed,
344 ret.dram_energy_consummed);
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
Functions related to the MPI communicator.
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_error...
std::string popen_fetch_output(const char *command)
Run a command and return the output.
i32 world_size()
Gives the size of the MPI communicator.
namespace for the system handling
void test_reporter(std::unique_ptr< ISystemMetricReporter > &reporter)
Test that there are no crashes.
FormattedSystemMetrics format_system_metrics(const SystemMetrics &input)
Only to be used on deltas, not the raw one.
This file contains the definition for the stacktrace related functionality.
f64 get_wtime()
Returns the current wall clock time in seconds.
void Barrier(MPI_Comm comm)
MPI wrapper for MPI_Barrier.