// Builds and returns the "Timestep perf report:" string: per-rank performance values are
// gathered, reduced into totals, and laid out in a Table that is rendered at the end.
// NOTE(review): this listing is discontinuous - the embedded original line numbers skip
// (e.g. 35-39, 41, 43-44, 140), so part of the signature, the loop headers and the closing
// braces are not visible here. Comments below only describe what the visible lines show.
34std::string shammodels::report_perf_timestep(
40 f64 alloc_time_device,
42 size_t max_mem_device,
45 bool report_power_usage) {
// Collect each local value from all ranks through shamalgs::collective::gather.
49 std::vector<f64> rate_all_ranks = shamalgs::collective::gather(rate);
50 std::vector<u64> nobj_all_ranks = shamalgs::collective::gather(nobj);
51 std::vector<u64> npatch_all_ranks = shamalgs::collective::gather(npatch);
52 std::vector<f64> tcompute_all_ranks = shamalgs::collective::gather(tcompute);
53 std::vector<f64> mpi_timer_all_ranks = shamalgs::collective::gather(mpi_timer);
54 std::vector<f64> alloc_time_device_all_ranks = shamalgs::collective::gather(alloc_time_device);
55 std::vector<f64> alloc_time_host_all_ranks = shamalgs::collective::gather(alloc_time_host);
56 std::vector<size_t> max_mem_device_all_ranks = shamalgs::collective::gather(max_mem_device);
57 std::vector<size_t> max_mem_host_all_ranks = shamalgs::collective::gather(max_mem_host);
// System metrics are gathered only when power reporting is requested; otherwise the
// vector is left empty.
59 auto rank_metrics = (report_power_usage) ? shamsys::gather_rank_metrics(system_metrics)
60 : std::vector<shamsys::SystemMetrics>{};
// Reductions over the gathered vectors: sums for counts, timers and memory, and the
// maximum for the compute time.
// NOTE(review): std::max_element is dereferenced directly - this assumes the gathered
// vector is never empty; confirm against shamalgs::collective::gather.
67 u64 obj_total = std::accumulate(nobj_all_ranks.begin(), nobj_all_ranks.end(), 0_u64);
68 u64 npatch_total = std::accumulate(npatch_all_ranks.begin(), npatch_all_ranks.end(), 0_u64);
69 f64 max_t = *std::max_element(tcompute_all_ranks.begin(), tcompute_all_ranks.end());
70 f64 sum_t = std::accumulate(tcompute_all_ranks.begin(), tcompute_all_ranks.end(), 0.0);
71 f64 sum_mpi = std::accumulate(mpi_timer_all_ranks.begin(), mpi_timer_all_ranks.end(), 0.0);
72 f64 sum_alloc_device = std::accumulate(
73 alloc_time_device_all_ranks.begin(), alloc_time_device_all_ranks.end(), 0.0);
75 = std::accumulate(alloc_time_host_all_ranks.begin(), alloc_time_host_all_ranks.end(), 0.0);
76 size_t sum_mem_device_total
77 = std::accumulate(max_mem_device_all_ranks.begin(), max_mem_device_all_ranks.end(), 0_u64);
78 size_t sum_mem_host_total
79 = std::accumulate(max_mem_host_all_ranks.begin(), max_mem_host_all_ranks.end(), 0_u64);
// Formatted per-rank metrics; the loop body that fills this vector is not visible here.
83 std::vector<shamsys::FormattedSystemMetrics> formatted_rank_metrics{};
85 if (report_power_usage) {
86 for (
const auto &metric : rank_metrics) {
// The table starts with 9 columns. The support_*_energy_consummed() checks below are
// only evaluated when power reporting is on; their bodies are not visible here
// (presumably each one adds a column - TODO confirm).
92 u32 cols_count = 9_u32;
93 if (report_power_usage) {
94 if (shamsys::support_rank_energy_consummed()) {
97 if (shamsys::support_gpu_energy_consummed()) {
100 if (shamsys::support_cpu_energy_consummed()) {
103 if (shamsys::support_dram_energy_consummed()) {
110 Table table(cols_count);
112 std::vector<std::string> header
// Optional power column titles, appended in the order rank, gpu, cpu, dram.
122 if (report_power_usage) {
123 if (shamsys::support_rank_energy_consummed()) {
124 header.push_back(
"power");
126 if (shamsys::support_gpu_energy_consummed()) {
127 header.push_back(
"gpu power");
129 if (shamsys::support_cpu_energy_consummed()) {
130 header.push_back(
"cpu power");
132 if (shamsys::support_dram_energy_consummed()) {
133 header.push_back(
"dram power");
// The header row is centered and framed by a double rule above and below.
137 table.add_double_rule();
138 table.add_data(header, Table::center);
139 table.add_double_rule();
// Row built from the values at index i of each gathered vector (the enclosing loop header
// is not visible here). MPI and allocation times are shown as percentages of that
// index's compute time.
// NOTE(review): the divisions by tcompute_all_ranks[i] have no visible zero guard.
141 std::vector<std::string> row = {
142 shambase::format(
"{:<4}", i),
143 shambase::format(
"{:.4e}", rate_all_ranks[i]),
144 shambase::format(
"{:}", nobj_all_ranks[i]),
145 shambase::format(
"{:}", npatch_all_ranks[i]),
146 shambase::format(
"{:.3e}", tcompute_all_ranks[i]),
147 shambase::format(
"{:.1f}%", 100 * (mpi_timer_all_ranks[i] / tcompute_all_ranks[i])),
150 100 * (alloc_time_device_all_ranks[i] / tcompute_all_ranks[i]),
151 100 * (alloc_time_host_all_ranks[i] / tcompute_all_ranks[i])),
// Optional power cells for this row; a missing value is shown as "N/A".
155 if (report_power_usage) {
156 if (shamsys::support_rank_energy_consummed()) {
157 row.push_back(formatted_rank_metrics[i].rank_power.value_or(
"N/A"));
159 if (shamsys::support_gpu_energy_consummed()) {
160 row.push_back(formatted_rank_metrics[i].gpu_power.value_or(
"N/A"));
162 if (shamsys::support_cpu_energy_consummed()) {
163 row.push_back(formatted_rank_metrics[i].cpu_power.value_or(
"N/A"));
165 if (shamsys::support_dram_energy_consummed()) {
166 row.push_back(formatted_rank_metrics[i].dram_power.value_or(
"N/A"));
169 table.add_data(row, Table::right);
// Legend row naming the reduction shown in each column of the summary row.
172 std::vector<std::string> ruled
173 = {
"",
"<sum N/max t>",
"<sum>",
"<sum>",
"<max>",
"<avg>",
"<avg>",
"<sum>",
"<sum>"};
174 if (report_power_usage) {
175 if (shamsys::support_rank_energy_consummed()) {
176 ruled.push_back(
"<sum>");
178 if (shamsys::support_gpu_energy_consummed()) {
179 ruled.push_back(
"<sum>");
181 if (shamsys::support_cpu_energy_consummed()) {
182 ruled.push_back(
"<sum>");
184 if (shamsys::support_dram_energy_consummed()) {
185 ruled.push_back(
"<sum>");
188 table.add_rulled_data(ruled);
// Summary row: the rate is the total object count divided by the largest compute time;
// the percentage cells divide the summed timers by the summed compute time.
// NOTE(review): the divisions by max_t and sum_t have no visible zero guard.
189 std::vector<std::string> all_row = {
191 shambase::format(
"{:.4e}",
f64(obj_total) / max_t),
192 shambase::format(
"{:}", obj_total),
193 shambase::format(
"{:}", npatch_total),
194 shambase::format(
"{:.3e}", max_t),
195 shambase::format(
"{:.1f}%", 100 * (sum_mpi / sum_t)),
198 100 * (sum_alloc_device / sum_t),
199 100 * (sum_alloc_host / sum_t)),
// Optional aggregated power cells; a missing value is shown as "N/A".
203 if (report_power_usage) {
204 if (shamsys::support_rank_energy_consummed()) {
205 all_row.push_back(formatted_aggregated_metrics.rank_power.value_or(
"N/A"));
207 if (shamsys::support_gpu_energy_consummed()) {
208 all_row.push_back(formatted_aggregated_metrics.gpu_power.value_or(
"N/A"));
210 if (shamsys::support_cpu_energy_consummed()) {
211 all_row.push_back(formatted_aggregated_metrics.cpu_power.value_or(
"N/A"));
213 if (shamsys::support_dram_energy_consummed()) {
214 all_row.push_back(formatted_aggregated_metrics.dram_power.value_or(
"N/A"));
217 table.add_data(all_row, Table::right);
// The rendered table is appended to the report title and returned.
221 return "Timestep perf report:" + table.render();
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
MPI string gather / allgather helpers (declarations; implementations in shamalgs/src/collective/gathe...
std::string readable_sizeof(double size)
Given a sizeof value, return a readable string. Example : readable_sizeof(1024*1024*1024) -> "1....
i32 world_rank()
Gives the rank of the current process in the MPI communicator.
i32 world_size()
Gives the size of the MPI communicator.
FormattedSystemMetrics format_system_metrics(const SystemMetrics &input)
Only to be used on deltas, not the raw one.
This file contains the definition for the stacktrace related functionality.
#define __shamrock_stack_entry()
Macro to create a stack entry.
Functions related to the MPI communicator.