Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
MicroBenchmark.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
18#include "shambase/string.hpp"
19#include "shambase/time.hpp"
26#include "shambackends/math.hpp"
27#include "shamcomm/wrapper.hpp"
32#include <stdexcept>
33#include <vector>
34
// Anonymous namespace: file-local storage for the benchmark results, exposed
// read-only through shamsys::get_microbench_results() at the bottom of this file.
35 namespace {
36 
    // Maps a benchmark identifier (e.g. "p2p_bandwidth", "saxpy_f32",
    // "vector_allgather_u64_<n>") to its measured figure of merit
    // (bytes/s, flops or seconds depending on the benchmark — see each writer below).
37     std::unordered_map<std::string, double> microbench_results = {};
38 
39 }
40
41 namespace shamsys::microbench {
    // Forward declarations of the benchmark kernels defined later in this file.
43     void p2p_bandwidth(u32 wr_sender, u32 wr_receiv);
44 
46     void p2p_latency(u32 wr1, u32 wr2);
47 
49     template<typename T>
50     void saxpy();
51 
53     template<typename T>
    // NOTE(review): the line that originally followed (internal line 54, presumably
    // the `void fma_chains_rotation();` declaration used below) is missing from
    // this extraction — confirm against the upstream source.
55 
57     void vector_allgather(u32 el_per_rank);
58 
    /// Formats a value with an SI prefix for pretty printing.
    /// Returns {prefix, mantissa-string}, e.g. 2.5e9 -> {"G", "2.5"}.
    /// Values not exceeding 1e3 keep an empty prefix.
59     std::tuple<std::string, std::string> format_result(f64 val) {
60 
61         std::array<char, 6> prefixes = {'k', 'M', 'G', 'T', 'P', 'E'};
62         std::array<f64, 6> prefixes_val = {1.e3, 1.e6, 1.e9, 1.e12, 1.e15, 1.e18};
63 
64         std::string prefix = "";
65         f64 val_out = val;
        // Keep overwriting with the largest prefix whose threshold is exceeded.
        // Strict '>' means a value exactly equal to a threshold keeps the
        // previous (smaller) prefix.
66         for (size_t i = 0; i < prefixes.size(); i++) {
67             if (val > prefixes_val[i]) {
68                 prefix = prefixes[i];
69                 val_out = val / prefixes_val[i];
70             }
71         }
        // "{:.3}" -> three significant digits.
72         return {prefix, shambase::format("{:.3}", val_out)};
73     }
74 
75 } // namespace shamsys::microbench
76
// NOTE(review): the definition's opening line (internal line 77, presumably
// `void shamsys::run_micro_benchmark() {`) is missing from this extraction;
// the body below is the run_micro_benchmark() entry point documented on this page.
78     StackEntry stack_loc{};
79 
80     if (shamcomm::world_rank() == 0) {
81         logger::raw_ln("Running micro benchmarks:");
82     }
83 
    // Benchmark partners: first and last rank — presumably chosen to maximise
    // the chance of crossing a node boundary on multi-node runs (TODO confirm).
84     u32 wr1 = 0;
85     u32 wr2 = shamcomm::world_size() - 1;
86 
    // Bandwidth runs unconditionally: with a single rank wr1 == wr2 == 0, no rank
    // matches sender/receiver and the test degenerates to a single no-op loop.
87     microbench::p2p_bandwidth(wr1, wr2);
    // Latency ping-pong needs two distinct ranks (it rejects wr1 == wr2 below).
88     if (shamcomm::world_size() > 1) {
89         microbench::p2p_latency(wr1, wr2);
90     }
    // SAXPY memory-bandwidth benchmark for each scalar/vector float type.
91     microbench::saxpy<f32>();
92     microbench::saxpy<f64>();
93     microbench::saxpy<f32_2>();
94     microbench::saxpy<f64_2>();
95     microbench::saxpy<f32_3>();
96     microbench::saxpy<f64_3>();
97     microbench::saxpy<f32_4>();
98     microbench::saxpy<f64_4>();
    // FMA-chain peak-flops benchmark for the same set of types.
99     microbench::fma_chains_rotation<f32>();
100     microbench::fma_chains_rotation<f64>();
101     microbench::fma_chains_rotation<f32_2>();
102     microbench::fma_chains_rotation<f64_2>();
103     microbench::fma_chains_rotation<f32_3>();
104     microbench::fma_chains_rotation<f64_3>();
105     microbench::fma_chains_rotation<f32_4>();
106     microbench::fma_chains_rotation<f64_4>();
    // Collective allgather benchmark at several element counts per rank.
107     microbench::vector_allgather(1);
108     microbench::vector_allgather(8);
109     microbench::vector_allgather(64);
110     microbench::vector_allgather(128);
111     microbench::vector_allgather(150);
112     microbench::vector_allgather(1024);
113}
114
// Point-to-point bandwidth benchmark: wr_sender streams fixed-size messages to
// wr_receiv until the accumulated communication time reaches 1 s on every rank,
// then reports bytes/s. Ranks other than the pair idle through the loop.
115void shamsys::microbench::p2p_bandwidth(u32 wr_sender, u32 wr_receiv) {
116    StackEntry stack_loc{};
117 
// NOTE(review): internal line 118 is missing from this extraction — presumably
// the declaration of `wr` used below (e.g. `u32 wr = shamcomm::world_rank();`);
// confirm against the upstream source.
119 
    // NOTE(review): 1024UL * 1014UL * 8UL = 8,306,688 bytes, which is NOT 8 MB as
    // the trailing comment claims — "1014" looks like a typo for "1024"
    // (1024*1024*8 = 8 MiB). Confirm and fix upstream.
120    u64 length = 1024UL * 1014UL * 8UL; // 8MB messages
121    shamcomm::CommunicationBuffer buf_recv{length, instance::get_compute_scheduler_ptr()};
122    shamcomm::CommunicationBuffer buf_send{length, instance::get_compute_scheduler_ptr()};
123 
124    std::vector<MPI_Request> rqs;
125 
126    f64 t = 0;
127    u64 loops = 0;
128    bool is_used = false;
129    do {
130        loops++;
131 
        // Synchronise all ranks before timing each message.
132        mpi::barrier(MPI_COMM_WORLD);
133        f64 t_start = MPI_Wtime();
134 
135        if (wr == wr_sender) {
136            rqs.push_back(MPI_Request{});
137            u32 rq_index = rqs.size() - 1;
138            auto &rq = rqs[rq_index];
139            shamcomm::mpi::Isend(
140                buf_send.get_ptr(), length, MPI_BYTE, wr_receiv, 0, MPI_COMM_WORLD, &rq);
141            is_used = true;
142        }
143 
144        if (wr == wr_receiv) {
145            MPI_Status s;
146            shamcomm::mpi::Recv(
147                buf_recv.get_ptr(), length, MPI_BYTE, wr_sender, 0, MPI_COMM_WORLD, &s);
148            is_used = true;
149        }
150 
        // Ranks that are neither sender nor receiver force t = 1 so the
        // allreduce_min-based termination below can reach 1 s.
151        if (!is_used) {
152            t = 1;
153        }
        // NOTE(review): `rqs` is never cleared, so the sender re-waits on all
        // previously completed requests each iteration (harmless per MPI —
        // completed requests are MPI_REQUEST_NULL — but the vector grows every loop).
154        std::vector<MPI_Status> st_lst(rqs.size());
155        if (rqs.size() > 0) {
156            shamcomm::mpi::Waitall(rqs.size(), rqs.data(), st_lst.data());
157        }
158        f64 t_end = MPI_Wtime();
159        t += t_end - t_start;
160 
    // Stop once the slowest rank has accumulated at least 1 s of measured time.
161    } while (shamalgs::collective::allreduce_min(t) < 1);
162 
    // Local bytes/s; only meaningful on the participating ranks, but stored on
    // every rank. Rank 0 (== wr_sender when called with wr1 = 0) does the printing.
163    f64 bw = f64(length * loops) / t;
164 
165    microbench_results["p2p_bandwidth"] = bw;
166 
167    if (shamcomm::world_rank() == 0) {
168        auto [prefix, val] = format_result(bw);
169        logger::raw_ln(
170            shambase::format(
171                " - p2p bandwidth : {} {}B.s^-1 (ranks : {} -> {}) (loops : {})",
172                val,
173                prefix,
174                wr_sender,
175                wr_receiv,
176                loops));
177    }
178}
179
// Point-to-point latency benchmark (ping-pong between wr1 and wr2).
// NOTE(review): the definition's opening line (internal line 180, presumably
// `void shamsys::microbench::p2p_latency(u32 wr1, u32 wr2) {`) is missing from
// this extraction.
181    StackEntry stack_loc{};
182 
183    if (wr1 == wr2) {
// NOTE(review): internal line 184 is missing — presumably the call that raises
// the error (e.g. `shambase::throw_with_loc<std::invalid_argument>(`); the
// string argument below belongs to it.
185            "can not launch this test with same ranks");
186    }
187 
// NOTE(review): internal line 188 is missing — presumably the declaration of
// `wr` used below (e.g. `u32 wr = shamcomm::world_rank();`).
189 
190    u64 length = 8ULL; // 8B messages
191    shamcomm::CommunicationBuffer buf_recv{length, instance::get_compute_scheduler_ptr()};
192    shamcomm::CommunicationBuffer buf_send{length, instance::get_compute_scheduler_ptr()};
193 
    // Wall-clock timer controlling the overall benchmark duration (~1 s),
    // distinct from `t` which accumulates only the timed ping-pong sections.
194    shambase::Timer bench_timer;
195    bench_timer.start();
196 
197    f64 t = 0;
198    u64 loops = 0;
199    bool is_used = false;
200    do {
201        loops++;
202 
203        mpi::barrier(MPI_COMM_WORLD);
204        f64 t_start = MPI_Wtime();
205 
        // wr1 side of the ping-pong: send then wait for the echo.
206        if (wr == wr1) {
207            MPI_Status s;
208            shamcomm::mpi::Send(buf_send.get_ptr(), length, MPI_BYTE, wr2, 0, MPI_COMM_WORLD);
209            shamcomm::mpi::Recv(buf_recv.get_ptr(), length, MPI_BYTE, wr2, 1, MPI_COMM_WORLD, &s);
210            is_used = true;
211        }
212 
        // wr2 side: receive then echo back.
213        if (wr == wr2) {
214            MPI_Status s;
215            shamcomm::mpi::Recv(buf_recv.get_ptr(), length, MPI_BYTE, wr1, 0, MPI_COMM_WORLD, &s);
216            shamcomm::mpi::Send(buf_send.get_ptr(), length, MPI_BYTE, wr1, 1, MPI_COMM_WORLD);
217            is_used = true;
218        }
219 
        // Non-participating ranks force t = 1 so they report a finished state.
220        if (!is_used) {
221            t = 1;
222        }
223        f64 t_end = MPI_Wtime();
224        t += t_end - t_start;
225 
226        bench_timer.end();
227 
    // Stop once the slowest rank has spent >= 1 s of wall time in the loop.
228    } while (shamalgs::collective::allreduce_min(bench_timer.elasped_sec()) < 1);
229 
    // NOTE(review): each loop times one full round trip, so this is the RTT per
    // iteration — osu_latency-style tools usually report RTT/2; confirm which
    // convention is intended here.
230    f64 latency = t / f64(loops);
231    microbench_results["p2p_latency"] = latency;
232 
233    if (shamcomm::world_rank() == 0) {
234        logger::raw_ln(
235            shambase::format(
236                " - p2p latency : {:.4e} s (ranks : {} <-> {}) (loops : {})",
237                latency,
238                wr1,
239                wr2,
240                loops));
241    }
242}
243
// SAXPY (y = a*x + y) benchmark used to estimate peak device memory bandwidth
// for the scalar/vector type T. Problem size is grown until the kernel runs
// long enough to time reliably; per-rank bandwidths are then reduced over MPI.
244template<typename T>
// NOTE(review): the definition's opening line (internal line 245, presumably
// `void shamsys::microbench::saxpy() {`) is missing from this extraction.
246    int Tsize = sizeof(T);
247 
    // Per-type display name and initial x / y / a values (same numeric values
    // replicated across vector lanes).
248    std::string type_name;
249    T init_x, init_y, a;
250    if constexpr (std::is_same_v<T, f32>) {
251        type_name = "f32";
252        init_x = 1.0f;
253        init_y = 2.0f;
254        a = 2.0f;
255    } else if constexpr (std::is_same_v<T, f64>) {
256        type_name = "f64";
257        init_x = 1.0;
258        init_y = 2.0;
259        a = 2.0;
260    } else if constexpr (std::is_same_v<T, f32_2>) {
261        type_name = "f32_2";
262        init_x = {1.0f, 1.0f};
263        init_y = {2.0f, 2.0f};
264        a = {2.0f, 2.0f};
265    } else if constexpr (std::is_same_v<T, f64_2>) {
266        type_name = "f64_2";
267        init_x = {1.0, 1.0};
268        init_y = {2.0, 2.0};
269        a = {2.0, 2.0};
270    } else if constexpr (std::is_same_v<T, f32_3>) {
271        type_name = "f32_3";
272        init_x = {1.0f, 1.0f, 1.0f};
273        init_y = {2.0f, 2.0f, 2.0f};
274        a = {2.0f, 2.0f, 2.0f};
275    } else if constexpr (std::is_same_v<T, f64_3>) {
276        type_name = "f64_3";
277        init_x = {1.0, 1.0, 1.0};
278        init_y = {2.0, 2.0, 2.0};
279        a = {2.0, 2.0, 2.0};
280    } else if constexpr (std::is_same_v<T, f32_4>) {
281        type_name = "f32_4";
282        init_x = {1.0f, 1.0f, 1.0f, 1.0f};
283        init_y = {2.0f, 2.0f, 2.0f, 2.0f};
284        a = {2.0f, 2.0f, 2.0f, 2.0f};
285    } else if constexpr (std::is_same_v<T, f64_4>) {
286        type_name = "f64_4";
287        init_x = {1.0, 1.0, 1.0, 1.0};
288        init_y = {2.0, 2.0, 2.0, 2.0};
289        a = {2.0, 2.0, 2.0, 2.0};
290    } else {
// NOTE(review): internal line 291 is missing — presumably a compile-time or
// runtime rejection of unsupported T (e.g. a throw/static_assert).
292    }
293 
    // Run one saxpy_bench pass of size N; the last argument (N < 2^17) presumably
    // toggles a correctness check for small sizes — TODO confirm in saxpy.hpp.
294    auto bench_step = [&](int N) {
295        return sham::benchmarks::saxpy_bench<T>(
296            instance::get_compute_scheduler_ptr(), N, init_x, init_y, a, Tsize, N < (1 << 17));
297    };
298 
    // Grow N (doubling) from 2^15 until the kernel takes > 1 ms or the device
    // memory budget is reached, and keep the last result.
299    auto benchmark = [&]() {
300        size_t N = (1 << 15);
301 
302        auto &dev_ctx = shambase::get_check_ref(instance::get_compute_scheduler().ctx);
303        auto &dev_ptr = dev_ctx.device;
304        auto &dev = shambase::get_check_ref(dev_ptr);
305 
306        size_t max_alloc
307            = std::min<size_t>(dev.prop.max_mem_alloc_size_dev, dev.prop.global_mem_size);
        // NOTE(review): the comment says "2 allocations so /4" yet divides by
        // Tsize*4 — presumably 2 buffers times a safety factor of 2; confirm.
308        double max_size = double(max_alloc) / (Tsize * 4); // there is 2 allocations so /4
309        if (max_size >= (1 << 30)) {
310            max_size = (1 << 30);
311        }
312 
313        auto result = bench_step(shambase::narrow_or_throw<i32>(N));
314 
315        for (; N <= (1 << 30) && static_cast<double>(N) <= max_size; N *= 2) {
316            result = bench_step(shambase::narrow_or_throw<i32>(N));
317 
318            // std::cout << N << " " << result_new.seconds << " " << result_new.bandwidth
319            //           << std::endl;
320 
            // Stop growing once the run is long enough to be timed reliably.
321            if (result.seconds > 1e-3) {
322                break;
323            }
324        }
325 
326        return result;
327    };
328 
329    auto result = benchmark();
330 
    // result.bandwidth is in GB/s (scaled by 1e9 to bytes/s here).
331    f64 bw = result.bandwidth * 1e9;
332 
    // Reduce per-rank bandwidths; the stored/printed headline figure is the SUM
    // over all ranks (aggregate machine bandwidth).
333    f64 min_bw = shamalgs::collective::allreduce_min(bw);
334    f64 max_bw = shamalgs::collective::allreduce_max(bw);
335    f64 sum_bw = shamalgs::collective::allreduce_sum(bw);
336    f64 avg_bw = sum_bw / (f64) shamcomm::world_size();
337 
338    microbench_results["saxpy_" + type_name] = sum_bw;
339 
340    if (shamcomm::world_rank() == 0) {
341        auto [prefix, val] = format_result(sum_bw);
342        logger::raw_ln(
343            shambase::format(
344                " - saxpy ({}) : {} {}B.s^-1 (min = {:.1e}, max = {:.1e}, avg = {:.1e}) "
345                "({:.1e} ms, {})",
346                type_name,
347                val,
348                prefix,
349                min_bw,
350                max_bw,
351                avg_bw,
352                result.seconds * 1e3,
353                shambase::readable_sizeof(result.byte_used)));
354    }
355}
356
// FMA-chain benchmark estimating peak floating-point throughput for type T
// (port of the Argonne flops.cpp benchmark, per the page notes below).
357template<typename T>
// NOTE(review): the definition's opening line (internal line 358, presumably
// `void shamsys::microbench::fma_chains_rotation() {`) is missing from this
// extraction.
359    int N = (1 << 22);
360 
361    auto result
362        = sham::benchmarks::fma_chains_bench<T>(instance::get_compute_scheduler_ptr(), N, 0.2);
363 
    // flops_multiplier converts the benchmark's per-element FMA count to scalar
    // flops: it equals the vector lane count of T (1 for scalars, 2/3/4 for
    // the vector types).
364    std::string type_name;
365    f64 flops_multiplier = 1;
366    if constexpr (std::is_same_v<T, f32>) {
367        type_name = "f32";
368        flops_multiplier = 1;
369    } else if constexpr (std::is_same_v<T, f64>) {
370        type_name = "f64";
371        flops_multiplier = 1;
372    } else if constexpr (std::is_same_v<T, f32_2>) {
373        type_name = "f32_2";
374        flops_multiplier = 2;
375    } else if constexpr (std::is_same_v<T, f64_2>) {
376        type_name = "f64_2";
377        flops_multiplier = 2;
378    } else if constexpr (std::is_same_v<T, f32_3>) {
379        type_name = "f32_3";
380        flops_multiplier = 3;
381    } else if constexpr (std::is_same_v<T, f64_3>) {
382        type_name = "f64_3";
383        flops_multiplier = 3;
384    } else if constexpr (std::is_same_v<T, f32_4>) {
385        type_name = "f32_4";
386        flops_multiplier = 4;
387    } else if constexpr (std::is_same_v<T, f64_4>) {
388        type_name = "f64_4";
389        flops_multiplier = 4;
390    } else {
// NOTE(review): internal line 391 is missing — presumably a rejection of
// unsupported T (e.g. a throw/static_assert), mirroring the saxpy<T> version.
392    }
393 
    // Reduce per-rank flop rates; the stored/printed headline is the SUM over
    // all ranks, scaled to scalar flops via flops_multiplier.
394    f64 min_flop = shamalgs::collective::allreduce_min(result.flops);
395    f64 max_flop = shamalgs::collective::allreduce_max(result.flops);
396    f64 sum_flop = shamalgs::collective::allreduce_sum(result.flops);
397    f64 avg_flop = sum_flop / (f64) shamcomm::world_size();
398 
399    microbench_results["fma_chains_" + type_name] = sum_flop * flops_multiplier;
400 
401    if (shamcomm::world_rank() == 0) {
402        auto [prefix, val] = format_result(sum_flop * flops_multiplier);
403        logger::raw_ln(
404            shambase::format(
405                " - fma_chains ({}) : {} {}flops (min = {:.1e}, max = {:.1e}, avg = {:.1e}) "
406                "({:.1e} ms, rotations = {})",
407                type_name,
408                val,
409                prefix,
410                min_flop * flops_multiplier,
411                max_flop * flops_multiplier,
412                avg_flop * flops_multiplier,
413                result.seconds * 1e3,
414                result.nrotations));
415    }
416}
417
// Allgatherv collective benchmark: each rank contributes el_per_rank u64
// elements, repeated until 0.1 s has elapsed on the fastest... slowest rank.
// NOTE(review): the definition's opening line (internal line 418, presumably
// `void shamsys::microbench::vector_allgather(u32 el_per_rank) {`) is missing
// from this extraction.
419 
    // NOTE(review): the alias `T` is declared but never used below
    // (std::vector<u64> is spelled out) — dead code, candidate for removal upstream.
420    using T = u64;
421    std::vector<u64> send_data(el_per_rank);
422 
423    std::vector<u64> recv_data;
424 
425    f64 t = 0;
426    u64 loops = 0;
427 
    // One timed allgatherv call; barrier first so all ranks start together.
428    auto benchmark_step = [&]() {
429        shamcomm::mpi::Barrier(MPI_COMM_WORLD);
430        f64 t_start = MPI_Wtime();
431        shamalgs::collective::vector_allgatherv(send_data, recv_data, MPI_COMM_WORLD);
432        f64 t_end = MPI_Wtime();
433        t += t_end - t_start;
434        loops++;
435    };
436 
    // Repeat until every rank has accumulated at least 0.1 s of measured time.
437    do {
438        benchmark_step();
439    } while (shamalgs::collective::allreduce_min(t) < 0.1);
440 
    // Convert to average seconds per call before the cross-rank reductions.
441    t /= loops;
442 
443    f64 min_t = shamalgs::collective::allreduce_min(t);
444    f64 max_t = shamalgs::collective::allreduce_max(t);
445    f64 sum_t = shamalgs::collective::allreduce_sum(t);
446    f64 avg_t = sum_t / (f64) shamcomm::world_size();
447 
    // Stored metric: mean per-call time across ranks, keyed by element count.
448    microbench_results["vector_allgather_u64_" + std::to_string(el_per_rank)] = avg_t;
449 
450    if (shamcomm::world_rank() == 0) {
451        logger::raw_ln(
452            shambase::format(
453                " - vector_allgather (u64, n={:4}) : {:.3e} s (min = {:.2e}, max = {:.2e}, loops = "
454                "{})",
455                el_per_rank,
456                avg_t,
457                min_t,
458                max_t,
459                loops));
460    }
461}
462
// Read-only accessor for the file-local benchmark results map; entries are
// populated by the benchmark functions above when run_micro_benchmark() executes.
463const std::unordered_map<std::string, double> &shamsys::get_microbench_results() {
464    return microbench_results;
465}
Shamrock communication buffers.
void vector_allgather(u32 el_per_rank)
Vector allgather benchmark.
void p2p_latency(u32 wr1, u32 wr2)
MPI point-to-point latency benchmark.
void p2p_bandwidth(u32 wr_sender, u32 wr_receiv)
MPI point-to-point bandwidth benchmark.
void saxpy()
SAXPY benchmark, to get the maximum bandwidth.
void fma_chains_rotation()
FMA chains benchmark to get the maximum floating point performance.
This header does the MPI include and wrap MPI calls.
Header file describing a Node Instance.
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
Class Timer measures the time elapsed since the timer was started.
Definition time.hpp:96
void end()
Stops the timer and stores the elapsed time in nanoseconds.
Definition time.hpp:111
f64 elasped_sec() const
Converts the stored nanosecond time to a floating point representation in seconds.
Definition time.hpp:123
void start()
Starts the timer.
Definition time.hpp:106
Shamrock communication buffers.
This header file contains utility functions related to exception handling in the code.
Port of Argonne National Laboratory's FMA chains benchmark flops.cpp.
std::string readable_sizeof(double size)
given a sizeof value, returns a readable string. Example : readable_sizeof(1024*1024*1024) -> "1....
Definition string.hpp:139
void throw_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Throw an exception and append the source location to it.
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_erro...
Definition memory.hpp:110
i32 world_rank()
Gives the rank of the current process in the MPI communicator.
Definition worldInfo.cpp:40
i32 world_size()
Gives the size of the MPI communicator.
Definition worldInfo.cpp:38
void run_micro_benchmark()
Run latency & bandwidth benchmarks; these benchmarks were adapted from osu_microbenchmark.
const std::unordered_map< std::string, double > & get_microbench_results()
Get the microbench results.
void saxpy(u32 i, int n, T a, T *__restrict x, T *__restrict y)
saxpy function for benchmarking.
Definition saxpy.hpp:35
This file contains the definition for the stacktrace related functionality.