Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
system_metrics.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
15
17#include "shambase/popen.hpp"
21#include "shamcomm/wrapper.hpp"
23#include <cstdlib>
24
25#ifdef SHAMROCK_USE_GEOPM
26 #include <geopm/PlatformIO.hpp>
27 #include <geopm/PlatformTopo.hpp>
28#endif
29
30namespace shamsys {
31
32#ifdef SHAMROCK_USE_GEOPM
33
34 class AuroraSystemMetricReporterLinked : public ISystemMetricReporter {
35 public:
36 std::optional<f64> get_rank_energy_consummed() override {
37 if (shamcomm::is_main_node_rank()) {
38 return geopm::platform_io().read_signal("BOARD_ENERGY", GEOPM_DOMAIN_BOARD, 0);
39 }
40 return std::nullopt;
41 }
42
43 std::optional<f64> get_gpu_energy_consummed() override {
44 if (shamcomm::is_main_node_rank()) {
45 return geopm::platform_io().read_signal("GPU_ENERGY", GEOPM_DOMAIN_BOARD, 0);
46 }
47 return std::nullopt;
48 }
49
50 std::optional<f64> get_cpu_energy_consummed() override {
51 if (shamcomm::is_main_node_rank()) {
52 return geopm::platform_io().read_signal("CPU_ENERGY", GEOPM_DOMAIN_BOARD, 0);
53 }
54 return std::nullopt;
55 }
56
57 std::optional<f64> get_dram_energy_consummed() override {
58 if (shamcomm::is_main_node_rank()) {
59 return geopm::platform_io().read_signal("DRAM_ENERGY", GEOPM_DOMAIN_BOARD, 0);
60 }
61 return std::nullopt;
62 }
63
64 bool support_rank_energy_consummed() override { return true; }
65 bool support_gpu_energy_consummed() override { return true; }
66 bool support_cpu_energy_consummed() override { return true; }
67 bool support_dram_energy_consummed() override { return true; }
68 };
69#endif
70
72 public:
73 std::optional<f64> get_rank_energy_consummed() override {
74 if (shamcomm::is_main_node_rank()) {
75 std::string output = shambase::popen_fetch_output("geopmread BOARD_ENERGY board 0");
76 return std::stoull(output.c_str());
77 }
78 return std::nullopt;
79 }
80
81 std::optional<f64> get_gpu_energy_consummed() override {
82 if (shamcomm::is_main_node_rank()) {
83 std::string output = shambase::popen_fetch_output("geopmread GPU_ENERGY board 0");
84 return std::stoull(output.c_str());
85 }
86 return std::nullopt;
87 }
88
89 std::optional<f64> get_cpu_energy_consummed() override {
90 if (shamcomm::is_main_node_rank()) {
91 std::string output = shambase::popen_fetch_output("geopmread CPU_ENERGY board 0");
92 return std::stoull(output.c_str());
93 }
94 return std::nullopt;
95 }
96
97 std::optional<f64> get_dram_energy_consummed() override {
98 if (shamcomm::is_main_node_rank()) {
99 std::string output = shambase::popen_fetch_output("geopmread DRAM_ENERGY board 0");
100 return std::stoull(output.c_str());
101 }
102 return std::nullopt;
103 }
104
105 bool support_rank_energy_consummed() override { return true; }
106 bool support_gpu_energy_consummed() override { return true; }
107 bool support_cpu_energy_consummed() override { return true; }
108 bool support_dram_energy_consummed() override { return true; }
109 };
110
112 public:
113 std::optional<f64> get_rank_energy_consummed() override {
114 if (shamcomm::is_main_node_rank()) {
115 std::string output = shambase::popen_fetch_output(
116 "cat /sys/class/powercap/intel-rapl:0/energy_uj");
117 return f64(std::stoull(output.c_str())) * 1e-6;
118 }
119 return std::nullopt;
120 }
121
122 std::optional<f64> get_gpu_energy_consummed() override { return std::nullopt; }
123
124 std::optional<f64> get_cpu_energy_consummed() override { return std::nullopt; }
125
126 std::optional<f64> get_dram_energy_consummed() override { return std::nullopt; }
127
128 bool support_rank_energy_consummed() override { return true; }
129 bool support_gpu_energy_consummed() override { return false; }
130 bool support_cpu_energy_consummed() override { return false; }
131 bool support_dram_energy_consummed() override { return false; }
132 };
133
135 public:
136 std::optional<f64> get_rank_energy_consummed() override { return std::nullopt; }
137 std::optional<f64> get_gpu_energy_consummed() override { return std::nullopt; }
138 std::optional<f64> get_cpu_energy_consummed() override { return std::nullopt; }
139 std::optional<f64> get_dram_energy_consummed() override { return std::nullopt; }
140
141 bool support_rank_energy_consummed() override { return false; }
142 bool support_gpu_energy_consummed() override { return false; }
143 bool support_cpu_energy_consummed() override { return false; }
144 bool support_dram_energy_consummed() override { return false; }
145 };
146
147 bool has_reporter() {
148 auto &reporter = current_reporter();
149 if (!reporter) {
150 return false;
151 }
152 // dynamic_cast returns nullptr if the cast fails, so we check for that
153 return dynamic_cast<NoopSystemMetricReporter *>(reporter.get()) == nullptr;
154 }
155
156 std::unique_ptr<ISystemMetricReporter> make_reporter(std::string_view reporter_name) {
157 if (reporter_name == "aurora") {
158 return std::make_unique<AuroraSystemMetricReporter>();
159#ifdef SHAMROCK_USE_GEOPM
160 } else if (reporter_name == "aurora-linked") {
161 return std::make_unique<AuroraSystemMetricReporterLinked>();
162#endif
163 } else if (reporter_name == "intel-rapl") {
164 return std::make_unique<IntelRAPLSystemMetricReport>();
165 } else if (reporter_name == "noop" || reporter_name == "none" || reporter_name == "") {
166 return std::make_unique<NoopSystemMetricReporter>();
167 } else {
169 "Unknown system metrics reporter: {}, valid reporters are: aurora, aurora-linked, "
170 "intel-rapl, noop",
171 reporter_name));
172 }
173 return std::make_unique<NoopSystemMetricReporter>();
174 }
175
176 std::unique_ptr<ISystemMetricReporter> make_reporter() {
177 if (SHAM_SYSTEM_METRICS_REPORTER) {
178 return make_reporter(*SHAM_SYSTEM_METRICS_REPORTER);
179 }
180 return std::make_unique<NoopSystemMetricReporter>();
181 }
182
184 void test_reporter(std::unique_ptr<ISystemMetricReporter> &reporter) {
185 shambase::get_check_ref(reporter).get_rank_energy_consummed();
186 shambase::get_check_ref(reporter).get_gpu_energy_consummed();
187 shambase::get_check_ref(reporter).get_cpu_energy_consummed();
188 shambase::get_check_ref(reporter).get_dram_energy_consummed();
189 }
190
191 std::unique_ptr<ISystemMetricReporter> &current_reporter() {
192 static std::unique_ptr<ISystemMetricReporter> reporter = nullptr;
193 if (!reporter) {
194 reporter = make_reporter();
195 test_reporter(reporter);
196 }
197 return reporter;
198 }
199
200 SystemMetrics get_system_metrics(bool barrier) {
201 // Ensure that barriers aren't used if there is no reporter
202 barrier = barrier && has_reporter();
203
204 if (barrier) {
205 shamcomm::mpi::Barrier(MPI_COMM_WORLD);
206 }
207 f64 wall_time = shambase::details::get_wtime();
208 auto ret = SystemMetrics{
209 .wall_time = wall_time,
210 .rank_energy_consummed = get_rank_energy_consummed(),
211 .gpu_energy_consummed = get_gpu_energy_consummed(),
212 .cpu_energy_consummed = get_cpu_energy_consummed(),
213 .dram_energy_consummed = get_dram_energy_consummed()};
214 if (barrier) {
215 shamcomm::mpi::Barrier(MPI_COMM_WORLD);
216 }
217 return ret;
218 }
219
220 std::vector<SystemMetrics> gather_rank_metrics(const SystemMetrics &input) {
221 std::vector<SystemMetrics> ret(shamcomm::world_size());
222
223 auto optional_gather_power = [&](const std::optional<f64> &value) -> std::vector<f64> {
224 return shamalgs::collective::gather(value ? value.value() : 0._f64);
225 };
226
227 std::vector<f64> rank_energy_consummed_all_ranks
228 = optional_gather_power(input.rank_energy_consummed);
229 std::vector<f64> gpu_energy_consummed_all_ranks
230 = optional_gather_power(input.gpu_energy_consummed);
231 std::vector<f64> cpu_energy_consummed_all_ranks
232 = optional_gather_power(input.cpu_energy_consummed);
233 std::vector<f64> dram_energy_consummed_all_ranks
234 = optional_gather_power(input.dram_energy_consummed);
235 std::vector<f64> metric_time_all_ranks = shamalgs::collective::gather(input.wall_time);
236
237 for (u32 i = 0; i < shamcomm::world_size(); i++) {
238 ret[i] = SystemMetrics{
239 .wall_time = metric_time_all_ranks[i],
240 .rank_energy_consummed
241 = (shamsys::support_rank_energy_consummed())
242 ? std::optional<f64>{rank_energy_consummed_all_ranks[i]}
243 : std::nullopt,
244 .gpu_energy_consummed = (shamsys::support_gpu_energy_consummed())
245 ? std::optional<f64>{gpu_energy_consummed_all_ranks[i]}
246 : std::nullopt,
247 .cpu_energy_consummed = (shamsys::support_cpu_energy_consummed())
248 ? std::optional<f64>{cpu_energy_consummed_all_ranks[i]}
249 : std::nullopt,
250 .dram_energy_consummed
251 = (shamsys::support_dram_energy_consummed())
252 ? std::optional<f64>{dram_energy_consummed_all_ranks[i]}
253 : std::nullopt,
254 };
255 }
256
257 return ret;
258 }
259
260 SystemMetrics aggregate_rank_metrics(const std::vector<SystemMetrics> &input) {
261 f64 sum_rank_energy_consummed = 0._f64;
262 f64 sum_gpu_energy_consummed = 0._f64;
263 f64 sum_cpu_energy_consummed = 0._f64;
264 f64 sum_dram_energy_consummed = 0._f64;
265 f64 metric_time_all = 0._f64;
266
267 for (const auto &m : input) {
268 sum_rank_energy_consummed
269 += (m.rank_energy_consummed ? m.rank_energy_consummed.value() : 0._f64);
270 sum_gpu_energy_consummed
271 += (m.gpu_energy_consummed ? m.gpu_energy_consummed.value() : 0._f64);
272 sum_cpu_energy_consummed
273 += (m.cpu_energy_consummed ? m.cpu_energy_consummed.value() : 0._f64);
274 sum_dram_energy_consummed
275 += (m.dram_energy_consummed ? m.dram_energy_consummed.value() : 0._f64);
276 metric_time_all = std::max(metric_time_all, m.wall_time);
277 }
278
279 SystemMetrics system_metrics;
280 system_metrics.wall_time = metric_time_all;
281 system_metrics.rank_energy_consummed = (shamsys::support_rank_energy_consummed())
282 ? sum_rank_energy_consummed
283 : std::optional<f64>{};
284 system_metrics.gpu_energy_consummed = (shamsys::support_gpu_energy_consummed())
285 ? sum_gpu_energy_consummed
286 : std::optional<f64>{};
287 system_metrics.cpu_energy_consummed = (shamsys::support_cpu_energy_consummed())
288 ? sum_cpu_energy_consummed
289 : std::optional<f64>{};
290 system_metrics.dram_energy_consummed = (shamsys::support_dram_energy_consummed())
291 ? sum_dram_energy_consummed
292 : std::optional<f64>{};
293
294 return system_metrics;
295 }
296
298 auto format_metric = [](const std::optional<f64> &energy,
299 f64 wall_time,
300 std::optional<std::string> &out_power,
301 std::optional<std::string> &out_energy) {
302 if (energy.has_value()) {
303 if (wall_time > 0._f64 && energy.value() > 0._f64) {
304 f64 consumed_energy = energy.value();
305 f64 power = consumed_energy / wall_time;
306 out_power = shambase::format("{:.1f} W", power);
307 out_energy = shambase::format("{:.1f} J", consumed_energy);
308 } else {
309 out_power = "N/A";
310 out_energy = "N/A";
311 }
312 }
313 };
314
316 .wall_time = shambase::format("{:.1f} s", input.wall_time),
317 .rank_energy_consummed = std::nullopt,
318 .gpu_energy_consummed = std::nullopt,
319 .cpu_energy_consummed = std::nullopt,
320 .dram_energy_consummed = std::nullopt,
321 .rank_power = std::nullopt,
322 .gpu_power = std::nullopt,
323 .cpu_power = std::nullopt,
324 .dram_power = std::nullopt,
325 };
326
327 format_metric(
328 input.rank_energy_consummed,
329 input.wall_time,
330 ret.rank_power,
331 ret.rank_energy_consummed);
332 format_metric(
333 input.gpu_energy_consummed,
334 input.wall_time,
335 ret.gpu_power,
336 ret.gpu_energy_consummed /* */);
337 format_metric(
338 input.cpu_energy_consummed,
339 input.wall_time,
340 ret.cpu_power,
341 ret.cpu_energy_consummed /* */);
342 format_metric(
343 input.dram_energy_consummed,
344 input.wall_time,
345 ret.dram_power,
346 ret.dram_energy_consummed);
347
348 return ret;
349 }
350} // namespace shamsys
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
Functions related to the MPI communicator.
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_erro...
Definition memory.hpp:110
ExcptTypes make_except_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Create an exception with a message and a location.
std::string popen_fetch_output(const char *command)
Run a command and return the output.
Definition popen.cpp:23
i32 world_size()
Gives the size of the MPI communicator.
Definition worldInfo.cpp:38
namespace for the system handling
void test_reporter(std::unique_ptr< ISystemMetricReporter > &reporter)
test that there are no crashes
FormattedSystemMetrics format_system_metrics(const SystemMetrics &input)
Only to be used on deltas, not the raw one.
This file contains the definition for the stacktrace related functionality.
f64 get_wtime()
Returns the current wall clock time in seconds.
void Barrier(MPI_Comm comm)
MPI wrapper for MPI_Barrier.
Definition wrapper.cpp:194