Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
system_metrics.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
17#include "shambase/popen.hpp"
21#include "shamcomm/wrapper.hpp"
23#include <cstdlib>
24
25#ifdef SHAMROCK_USE_GEOPM
26 #include <geopm/PlatformIO.hpp>
27 #include <geopm/PlatformTopo.hpp>
28#endif
29
30namespace shamsys {
31
32#ifdef SHAMROCK_USE_GEOPM
33
34 class AuroraSystemMetricReporterLinked : public ISystemMetricReporter {
35 public:
36 std::optional<f64> get_rank_energy_consummed() override {
37 if (shamcomm::is_main_node_rank()) {
38 return geopm::platform_io().read_signal("BOARD_ENERGY", GEOPM_DOMAIN_BOARD, 0);
39 }
40 return std::nullopt;
41 }
42
43 std::optional<f64> get_gpu_energy_consummed() override {
44 if (shamcomm::is_main_node_rank()) {
45 return geopm::platform_io().read_signal("GPU_ENERGY", GEOPM_DOMAIN_BOARD, 0);
46 }
47 return std::nullopt;
48 }
49
50 std::optional<f64> get_cpu_energy_consummed() override {
51 if (shamcomm::is_main_node_rank()) {
52 return geopm::platform_io().read_signal("CPU_ENERGY", GEOPM_DOMAIN_BOARD, 0);
53 }
54 return std::nullopt;
55 }
56
57 std::optional<f64> get_dram_energy_consummed() override {
58 if (shamcomm::is_main_node_rank()) {
59 return geopm::platform_io().read_signal("DRAM_ENERGY", GEOPM_DOMAIN_BOARD, 0);
60 }
61 return std::nullopt;
62 }
63
64 bool support_rank_energy_consummed() override { return true; }
65 bool support_gpu_energy_consummed() override { return true; }
66 bool support_cpu_energy_consummed() override { return true; }
67 bool support_dram_energy_consummed() override { return true; }
68 };
69#endif
70
72 public:
73 std::optional<f64> get_rank_energy_consummed() override {
74 if (shamcomm::is_main_node_rank()) {
75 std::string output = shambase::popen_fetch_output("geopmread BOARD_ENERGY board 0");
76 return std::stoull(output.c_str());
77 }
78 return std::nullopt;
79 }
80
81 std::optional<f64> get_gpu_energy_consummed() override {
82 if (shamcomm::is_main_node_rank()) {
83 std::string output = shambase::popen_fetch_output("geopmread GPU_ENERGY board 0");
84 return std::stoull(output.c_str());
85 }
86 return std::nullopt;
87 }
88
89 std::optional<f64> get_cpu_energy_consummed() override {
90 if (shamcomm::is_main_node_rank()) {
91 std::string output = shambase::popen_fetch_output("geopmread CPU_ENERGY board 0");
92 return std::stoull(output.c_str());
93 }
94 return std::nullopt;
95 }
96
97 std::optional<f64> get_dram_energy_consummed() override {
98 if (shamcomm::is_main_node_rank()) {
99 std::string output = shambase::popen_fetch_output("geopmread DRAM_ENERGY board 0");
100 return std::stoull(output.c_str());
101 }
102 return std::nullopt;
103 }
104
105 bool support_rank_energy_consummed() override { return true; }
106 bool support_gpu_energy_consummed() override { return true; }
107 bool support_cpu_energy_consummed() override { return true; }
108 bool support_dram_energy_consummed() override { return true; }
109 };
110
112 public:
113 std::optional<f64> get_rank_energy_consummed() override {
114 if (shamcomm::is_main_node_rank()) {
115 std::string output = shambase::popen_fetch_output(
116 "cat /sys/class/powercap/intel-rapl:0/energy_uj");
117 return f64(std::stoull(output.c_str())) * 1e-6;
118 }
119 return std::nullopt;
120 }
121
122 std::optional<f64> get_gpu_energy_consummed() override { return std::nullopt; }
123
124 std::optional<f64> get_cpu_energy_consummed() override { return std::nullopt; }
125
126 std::optional<f64> get_dram_energy_consummed() override { return std::nullopt; }
127
128 bool support_rank_energy_consummed() override { return true; }
129 bool support_gpu_energy_consummed() override { return false; }
130 bool support_cpu_energy_consummed() override { return false; }
131 bool support_dram_energy_consummed() override { return false; }
132 };
133
135 public:
136 std::optional<f64> get_rank_energy_consummed() override { return std::nullopt; }
137 std::optional<f64> get_gpu_energy_consummed() override { return std::nullopt; }
138 std::optional<f64> get_cpu_energy_consummed() override { return std::nullopt; }
139 std::optional<f64> get_dram_energy_consummed() override { return std::nullopt; }
140
141 bool support_rank_energy_consummed() override { return false; }
142 bool support_gpu_energy_consummed() override { return false; }
143 bool support_cpu_energy_consummed() override { return false; }
144 bool support_dram_energy_consummed() override { return false; }
145 };
146
147 bool has_reporter() {
148 auto &reporter = current_reporter();
149 if (!reporter) {
150 return false;
151 }
152 // dynamic_cast returns nullptr if the cast fails, so we check for that
153 return dynamic_cast<NoopSystemMetricReporter *>(reporter.get()) == nullptr;
154 }
155
/// Create the reporter selected by `reporter_name`.
///
/// Recognised names:
///  - "aurora"        -> AuroraSystemMetricReporter
///  - "aurora-linked" -> AuroraSystemMetricReporterLinked (branch only compiled
///                       when SHAMROCK_USE_GEOPM is defined)
///  - "intel-rapl"    -> IntelRAPLSystemMetricReport
///  - "noop", "none" or the empty string -> NoopSystemMetricReporter
///
/// Any other name reaches the final else branch, which builds an
/// "Unknown system metrics reporter" message.
///
/// NOTE(review): the message lists "aurora-linked" as valid even in builds
/// where SHAMROCK_USE_GEOPM is not defined and that name is rejected.
156 std::unique_ptr<ISystemMetricReporter> make_reporter(std::string_view reporter_name) {
157 if (reporter_name == "aurora") {
158 return std::make_unique<AuroraSystemMetricReporter>();
159#ifdef SHAMROCK_USE_GEOPM
160 } else if (reporter_name == "aurora-linked") {
161 return std::make_unique<AuroraSystemMetricReporterLinked>();
162#endif
163 } else if (reporter_name == "intel-rapl") {
164 return std::make_unique<IntelRAPLSystemMetricReport>();
165 } else if (reporter_name == "noop" || reporter_name == "none" || reporter_name == "") {
166 return std::make_unique<NoopSystemMetricReporter>();
167 } else {
// NOTE(review): the line opening this call is not visible in this listing;
// presumably it throws with the formatted message below -- confirm against
// the original file.
169 "Unknown system metrics reporter: {}, valid reporters are: aurora, aurora-linked, "
170 "intel-rapl, noop",
171 reporter_name));
172 }
// Fallback return after the if/else chain.
173 return std::make_unique<NoopSystemMetricReporter>();
174 }
175
176 std::unique_ptr<ISystemMetricReporter> make_reporter() {
177 if (SHAM_SYSTEM_METRICS_REPORTER) {
178 return make_reporter(*SHAM_SYSTEM_METRICS_REPORTER);
179 }
180 return std::make_unique<NoopSystemMetricReporter>();
181 }
182
184 void test_reporter(std::unique_ptr<ISystemMetricReporter> &reporter) {
185 shambase::get_check_ref(reporter).get_rank_energy_consummed();
186 shambase::get_check_ref(reporter).get_gpu_energy_consummed();
187 shambase::get_check_ref(reporter).get_cpu_energy_consummed();
188 shambase::get_check_ref(reporter).get_dram_energy_consummed();
189 }
190
191 std::unique_ptr<ISystemMetricReporter> &current_reporter() {
192 static std::unique_ptr<ISystemMetricReporter> reporter = nullptr;
193 if (!reporter) {
194 reporter = make_reporter();
195 test_reporter(reporter);
196 }
197 return reporter;
198 }
199
200 SystemMetrics get_system_metrics(bool barrier) {
201 // Ensure that barriers aren't used if there is no reporter
202 barrier = barrier && has_reporter();
203
204 if (barrier) {
205 shamcomm::mpi::Barrier(MPI_COMM_WORLD);
206 }
207 f64 wall_time = shambase::details::get_wtime();
208 auto ret = SystemMetrics{
209 wall_time,
210 get_rank_energy_consummed(),
211 get_gpu_energy_consummed(),
212 get_cpu_energy_consummed(),
213 get_dram_energy_consummed()};
214 if (barrier) {
215 shamcomm::mpi::Barrier(MPI_COMM_WORLD);
216 }
217 return ret;
218 }
219
220 std::vector<SystemMetrics> gather_rank_metrics(const SystemMetrics &input) {
221 std::vector<SystemMetrics> ret(shamcomm::world_size());
222
223 auto optional_gather_power = [&](const std::optional<f64> &value) -> std::vector<f64> {
224 return shamalgs::collective::gather(value ? value.value() : 0._f64);
225 };
226
227 std::vector<f64> rank_energy_consummed_all_ranks
228 = optional_gather_power(input.rank_energy_consummed);
229 std::vector<f64> gpu_energy_consummed_all_ranks
230 = optional_gather_power(input.gpu_energy_consummed);
231 std::vector<f64> cpu_energy_consummed_all_ranks
232 = optional_gather_power(input.cpu_energy_consummed);
233 std::vector<f64> dram_energy_consummed_all_ranks
234 = optional_gather_power(input.dram_energy_consummed);
235 std::vector<f64> metric_time_all_ranks = shamalgs::collective::gather(input.wall_time);
236
237 for (u32 i = 0; i < shamcomm::world_size(); i++) {
238 ret[i] = SystemMetrics{
239 metric_time_all_ranks[i],
240 (shamsys::support_rank_energy_consummed())
241 ? std::optional<f64>{rank_energy_consummed_all_ranks[i]}
242 : std::nullopt,
243 (shamsys::support_gpu_energy_consummed())
244 ? std::optional<f64>{gpu_energy_consummed_all_ranks[i]}
245 : std::nullopt,
246 (shamsys::support_cpu_energy_consummed())
247 ? std::optional<f64>{cpu_energy_consummed_all_ranks[i]}
248 : std::nullopt,
249 (shamsys::support_dram_energy_consummed())
250 ? std::optional<f64>{dram_energy_consummed_all_ranks[i]}
251 : std::nullopt,
252 };
253 }
254
255 return ret;
256 }
257
258 SystemMetrics aggregate_rank_metrics(const std::vector<SystemMetrics> &input) {
259 f64 sum_rank_energy_consummed = 0._f64;
260 f64 sum_gpu_energy_consummed = 0._f64;
261 f64 sum_cpu_energy_consummed = 0._f64;
262 f64 sum_dram_energy_consummed = 0._f64;
263 f64 metric_time_all = 0._f64;
264
265 for (const auto &m : input) {
266 sum_rank_energy_consummed
267 += (m.rank_energy_consummed ? m.rank_energy_consummed.value() : 0._f64);
268 sum_gpu_energy_consummed
269 += (m.gpu_energy_consummed ? m.gpu_energy_consummed.value() : 0._f64);
270 sum_cpu_energy_consummed
271 += (m.cpu_energy_consummed ? m.cpu_energy_consummed.value() : 0._f64);
272 sum_dram_energy_consummed
273 += (m.dram_energy_consummed ? m.dram_energy_consummed.value() : 0._f64);
274 metric_time_all = std::max(metric_time_all, m.wall_time);
275 }
276
277 SystemMetrics system_metrics;
278 system_metrics.wall_time = metric_time_all;
279 system_metrics.rank_energy_consummed = (shamsys::support_rank_energy_consummed())
280 ? sum_rank_energy_consummed
281 : std::optional<f64>{};
282 system_metrics.gpu_energy_consummed = (shamsys::support_gpu_energy_consummed())
283 ? sum_gpu_energy_consummed
284 : std::optional<f64>{};
285 system_metrics.cpu_energy_consummed = (shamsys::support_cpu_energy_consummed())
286 ? sum_cpu_energy_consummed
287 : std::optional<f64>{};
288 system_metrics.dram_energy_consummed = (shamsys::support_dram_energy_consummed())
289 ? sum_dram_energy_consummed
290 : std::optional<f64>{};
291
292 return system_metrics;
293 }
294
296 auto format_metric = [](const std::optional<f64> &energy,
297 f64 wall_time,
298 std::optional<std::string> &out_power,
299 std::optional<std::string> &out_energy) {
300 if (energy.has_value()) {
301 if (wall_time > 0._f64 && energy.value() > 0._f64) {
302 f64 consumed_energy = energy.value();
303 f64 power = consumed_energy / wall_time;
304 out_power = shambase::format("{:.1f} W", power);
305 out_energy = shambase::format("{:.1f} J", consumed_energy);
306 } else {
307 out_power = "N/A";
308 out_energy = "N/A";
309 }
310 }
311 };
312
// NOTE(review): the line opening this initializer is not visible in this
// listing; the entries below are used as `ret` further down.
// First entry: the wall time formatted with one decimal and an " s" suffix.
// The eight following entries start empty.
314 shambase::format("{:.1f} s", input.wall_time),
315 std::nullopt,
316 std::nullopt,
317 std::nullopt,
318 std::nullopt,
319 std::nullopt,
320 std::nullopt,
321 std::nullopt,
322 std::nullopt,
323 };
324
// One format_metric call per energy reading (rank, GPU, CPU, DRAM): each call
// may fill the matching power / energy strings of `ret`, using the same wall time.
325 format_metric(
326 input.rank_energy_consummed,
327 input.wall_time,
328 ret.rank_power,
329 ret.rank_energy_consummed);
330 format_metric(
331 input.gpu_energy_consummed,
332 input.wall_time,
333 ret.gpu_power,
334 ret.gpu_energy_consummed /* */);
335 format_metric(
336 input.cpu_energy_consummed,
337 input.wall_time,
338 ret.cpu_power,
339 ret.cpu_energy_consummed /* */);
340 format_metric(
341 input.dram_energy_consummed,
342 input.wall_time,
343 ret.dram_power,
344 ret.dram_energy_consummed);
345
346 return ret;
347 }
348} // namespace shamsys
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
Functions related to the MPI communicator.
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_erro...
Definition memory.hpp:110
std::string popen_fetch_output(const char *command)
Run a command and return the output.
Definition popen.cpp:23
i32 world_size()
Gives the size of the MPI communicator.
Definition worldInfo.cpp:38
namespace for the system handling
void test_reporter(std::unique_ptr< ISystemMetricReporter > &reporter)
Test that there are no crashes.
FormattedSystemMetrics format_system_metrics(const SystemMetrics &input)
Only to be used on deltas, not the raw one.
STL namespace.
This file contains the definition for the stacktrace related functionality.
f64 get_wtime()
Returns the current wall clock time in seconds.
void Barrier(MPI_Comm comm)
MPI wrapper for MPI_Barrier.
Definition wrapper.cpp:194