Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
MicroBenchmark.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
15
18#include "shambase/string.hpp"
19#include "shambase/time.hpp"
27#include "shambackends/math.hpp"
28#include "shamcomm/wrapper.hpp"
33#include <stdexcept>
34#include <vector>
35
namespace {

    /// File-local storage of the micro benchmark results, keyed by benchmark name
    /// (e.g. "p2p_bandwidth", "p2p_latency", "saxpy_f32", ...).
    std::unordered_map<std::string, double> microbench_results{};

} // namespace
41
42namespace shamsys::microbench {
44 void p2p_bandwidth(u32 wr_sender, u32 wr_receiv);
45
47 void p2p_latency(u32 wr1, u32 wr2);
48
50 template<typename T>
51 void saxpy();
52
54 template<typename T>
56
58 void vector_allgather(u32 el_per_rank);
59
60} // namespace shamsys::microbench
61
99
100void shamsys::microbench::p2p_bandwidth(u32 wr_sender, u32 wr_receiv) {
101 StackEntry stack_loc{};
102
104
105 u64 length = 1024UL * 1014UL * 8UL; // 8MB messages
106 shamcomm::CommunicationBuffer buf_recv{length, instance::get_compute_scheduler_ptr()};
107 shamcomm::CommunicationBuffer buf_send{length, instance::get_compute_scheduler_ptr()};
108
109 std::vector<MPI_Request> rqs;
110
111 f64 t = 0;
112 u64 loops = 0;
113 bool is_used = false;
114 do {
115 loops++;
116
117 mpi::barrier(MPI_COMM_WORLD);
118 f64 t_start = MPI_Wtime();
119
120 if (wr == wr_sender) {
121 rqs.push_back(MPI_Request{});
122 u32 rq_index = rqs.size() - 1;
123 auto &rq = rqs[rq_index];
125 buf_send.get_ptr(), length, MPI_BYTE, wr_receiv, 0, MPI_COMM_WORLD, &rq);
126 is_used = true;
127 }
128
129 if (wr == wr_receiv) {
130 MPI_Status s;
132 buf_recv.get_ptr(), length, MPI_BYTE, wr_sender, 0, MPI_COMM_WORLD, &s);
133 is_used = true;
134 }
135
136 if (!is_used) {
137 t = 1;
138 }
139 std::vector<MPI_Status> st_lst(rqs.size());
140 if (rqs.size() > 0) {
141 shamcomm::mpi::Waitall(rqs.size(), rqs.data(), st_lst.data());
142 }
143 f64 t_end = MPI_Wtime();
144 t += t_end - t_start;
145
146 } while (shamalgs::collective::allreduce_min(t) < 1);
147
148 f64 bw = f64(length * loops) / t;
149
150 microbench_results["p2p_bandwidth"] = bw;
151
152 if (shamcomm::world_rank() == 0) {
153 auto hr_bw = sham::to_human_readable<false>(bw);
155 shambase::format(
156 " - p2p bandwidth : {:.2f} {}B.s^-1 (ranks : {} -> {}) (loops : {})",
157 hr_bw.value,
158 hr_bw.prefix,
159 wr_sender,
160 wr_receiv,
161 loops));
162 }
163}
164
// Body of the MPI point-to-point latency benchmark, declared earlier in this file as
// p2p_latency(u32 wr1, u32 wr2).
// NOTE(review): the line carrying the function signature (source line 165) is absent
// from this listing, as are source lines 169, 173 and 219 (flagged below).
166 StackEntry stack_loc{};
167
// Identical ranks are not accepted.
// NOTE(review): the statement head that consumes the message below (source line 169)
// is missing from this listing; presumably it throws — confirm against the repository.
168 if (wr1 == wr2) {
170 "can not launch this test with same ranks");
171 }
172
// NOTE(review): `wr` is compared with wr1 / wr2 below but its declaration (source line
// 173) is missing here; presumably the world rank of the calling process — confirm.
174
175 u64 length = 8ULL; // 8B messages
176 shamcomm::CommunicationBuffer buf_recv{length, instance::get_compute_scheduler_ptr()};
177 shamcomm::CommunicationBuffer buf_send{length, instance::get_compute_scheduler_ptr()};
178
// Started once before the loop; its elapsed time drives the loop termination test.
179 shambase::Timer bench_timer;
180 bench_timer.start();
181
// t accumulates the MPI_Wtime() differences measured around each exchange,
// loops counts the iterations.
182 f64 t = 0;
183 u64 loops = 0;
184 bool is_used = false;
185 do {
186 loops++;
187
188 mpi::barrier(MPI_COMM_WORLD);
189 f64 t_start = MPI_Wtime();
190
// wr1 side: send with tag 0, then wait for the reply carrying tag 1.
191 if (wr == wr1) {
192 MPI_Status s;
193 shamcomm::mpi::Send(buf_send.get_ptr(), length, MPI_BYTE, wr2, 0, MPI_COMM_WORLD);
194 shamcomm::mpi::Recv(buf_recv.get_ptr(), length, MPI_BYTE, wr2, 1, MPI_COMM_WORLD, &s);
195 is_used = true;
196 }
197
// wr2 side: mirror image, receive tag 0 first and then answer with tag 1.
198 if (wr == wr2) {
199 MPI_Status s;
200 shamcomm::mpi::Recv(buf_recv.get_ptr(), length, MPI_BYTE, wr1, 0, MPI_COMM_WORLD, &s);
201 shamcomm::mpi::Send(buf_send.get_ptr(), length, MPI_BYTE, wr1, 1, MPI_COMM_WORLD);
202 is_used = true;
203 }
204
// Ranks that are neither wr1 nor wr2 get t reset to 1 on every iteration.
205 if (!is_used) {
206 t = 1;
207 }
208 f64 t_end = MPI_Wtime();
209 t += t_end - t_start;
210
211 bench_timer.stop();
212
// Keep iterating while allreduce_min() of the elapsed timer value is below 1 (seconds).
213 } while (shamalgs::collective::allreduce_min(bench_timer.elapsed_sec()) < 1);
214
// Accumulated time divided by the iteration count. On wr1 / wr2 each iteration times
// one Send plus one Recv, and the value is not halved.
// NOTE(review): confirm whether a one-way latency (t / (2 * loops)) was intended.
215 f64 latency = t / f64(loops);
216 microbench_results["p2p_latency"] = latency;
217
// Only world rank 0 reports.
// NOTE(review): the head of the logging call (source line 219) is missing from this listing.
218 if (shamcomm::world_rank() == 0) {
220 shambase::format(
221 " - p2p latency : {:.4e} s (ranks : {} <-> {}) (loops : {})",
222 latency,
223 wr1,
224 wr2,
225 loops));
226 }
227}
228
// SAXPY benchmark (declared earlier in this file as `template<typename T> void saxpy()`).
// Selects initial values for T, runs the benchmark with growing sizes, reduces the
// measured bandwidth over all ranks, stores the sum under "saxpy_<type>" and reports it
// from world rank 0.
// NOTE(review): the line carrying the function signature (source line 230) is absent
// from this listing, as are source lines 276, 280 and 327 (flagged below).
229template<typename T>
// Size of one element in bytes; forwarded to the benchmark call in bench_step.
231 int Tsize = sizeof(T);
232
// Per-type label and initial values: x = 1, y = 2, a = 2 on every component.
233 std::string type_name;
234 T init_x, init_y, a;
235 if constexpr (std::is_same_v<T, f32>) {
236 type_name = "f32";
237 init_x = 1.0f;
238 init_y = 2.0f;
239 a = 2.0f;
240 } else if constexpr (std::is_same_v<T, f64>) {
241 type_name = "f64";
242 init_x = 1.0;
243 init_y = 2.0;
244 a = 2.0;
245 } else if constexpr (std::is_same_v<T, f32_2>) {
246 type_name = "f32_2";
247 init_x = {1.0f, 1.0f};
248 init_y = {2.0f, 2.0f};
249 a = {2.0f, 2.0f};
250 } else if constexpr (std::is_same_v<T, f64_2>) {
251 type_name = "f64_2";
252 init_x = {1.0, 1.0};
253 init_y = {2.0, 2.0};
254 a = {2.0, 2.0};
255 } else if constexpr (std::is_same_v<T, f32_3>) {
256 type_name = "f32_3";
257 init_x = {1.0f, 1.0f, 1.0f};
258 init_y = {2.0f, 2.0f, 2.0f};
259 a = {2.0f, 2.0f, 2.0f};
260 } else if constexpr (std::is_same_v<T, f64_3>) {
261 type_name = "f64_3";
262 init_x = {1.0, 1.0, 1.0};
263 init_y = {2.0, 2.0, 2.0};
264 a = {2.0, 2.0, 2.0};
265 } else if constexpr (std::is_same_v<T, f32_4>) {
266 type_name = "f32_4";
267 init_x = {1.0f, 1.0f, 1.0f, 1.0f};
268 init_y = {2.0f, 2.0f, 2.0f, 2.0f};
269 a = {2.0f, 2.0f, 2.0f, 2.0f};
270 } else if constexpr (std::is_same_v<T, f64_4>) {
271 type_name = "f64_4";
272 init_x = {1.0, 1.0, 1.0, 1.0};
273 init_y = {2.0, 2.0, 2.0, 2.0};
274 a = {2.0, 2.0, 2.0, 2.0};
275 } else {
// NOTE(review): the body of this branch (source line 276) is missing from this listing;
// presumably it rejects unsupported types — confirm against the repository.
277 }
278
// Runs one benchmark pass on N elements.
// NOTE(review): the head of the call (source line 280) is missing from this listing; the
// arguments line up with saxpy_bench(sched, N, init_x, init_y, a, load_size,
// check_correctness), so the last argument would request a correctness check only for
// N < 2^17 — confirm.
279 auto bench_step = [&](int N) {
281 instance::get_compute_scheduler_ptr(), N, init_x, init_y, a, Tsize, N < (1 << 17));
282 };
283
// Doubles the problem size, starting at 2^15, until one pass takes more than 1e-3
// (result.seconds) or the size limit is reached; returns the last result obtained.
284 auto benchmark = [&]() {
285 size_t N = (1 << 15);
286
287 auto &dev_ctx = shambase::get_check_ref(instance::get_compute_scheduler().ctx);
288 auto &dev_ptr = dev_ctx.device;
289 auto &dev = shambase::get_check_ref(dev_ptr);
290
// Upper bound on the element count, derived from the device properties and capped at 2^30.
291 size_t max_alloc
292 = std::min<size_t>(dev.prop.max_mem_alloc_size_dev, dev.prop.global_mem_size);
293 double max_size = double(max_alloc) / (Tsize * 4); // there is 2 allocations so /4
294 if (max_size >= (1 << 30)) {
295 max_size = (1 << 30);
296 }
297
// Initial pass at N = 2^15; the first loop iteration below runs the same N again.
298 auto result = bench_step(shambase::narrow_or_throw<i32>(N));
299
300 for (; N <= (1 << 30) && static_cast<double>(N) <= max_size; N *= 2) {
301 result = bench_step(shambase::narrow_or_throw<i32>(N));
302
303 // std::cout << N << " " << result_new.seconds << " " << result_new.bandwidth
304 // << std::endl;
305
306 if (result.seconds > 1e-3) {
307 break;
308 }
309 }
310
311 return result;
312 };
313
314 auto result = benchmark();
315
// NOTE(review): the 1e9 factor suggests result.bandwidth is expressed in GB/s — confirm.
316 f64 bw = result.bandwidth * 1e9;
317
// Reductions over all ranks; the stored and headline value is the sum over ranks.
318 f64 min_bw = shamalgs::collective::allreduce_min(bw);
319 f64 max_bw = shamalgs::collective::allreduce_max(bw);
320 f64 sum_bw = shamalgs::collective::allreduce_sum(bw);
321 f64 avg_bw = sum_bw / (f64) shamcomm::world_size();
322
323 microbench_results["saxpy_" + type_name] = sum_bw;
324
// Only world rank 0 reports.
// NOTE(review): the head of the logging call (source line 327) is missing from this listing.
325 if (shamcomm::world_rank() == 0) {
326 auto hr_bw = sham::to_human_readable<false>(sum_bw);
328 shambase::format(
329 " - saxpy ({}) : {:.2f} {}B.s^-1 (min = {:.1e}, max = {:.1e}, avg = {:.1e}) "
330 "({:.1e} ms, {})",
331 type_name,
332 hr_bw.value,
333 hr_bw.prefix,
334 min_bw,
335 max_bw,
336 avg_bw,
337 result.seconds * 1e3,
338 shambase::readable_sizeof(result.byte_used)));
339 }
340}
341
// FMA chains benchmark (listed in this page's symbol index as `void fma_chains_rotation()`).
// Runs fma_chains_bench<T> once, reduces the measured flops over all ranks, stores the
// scaled sum under "fma_chains_<type>" and reports it from world rank 0.
// NOTE(review): the line carrying the function signature (source line 343) is absent
// from this listing, as are source lines 376 and 388 (flagged below).
342template<typename T>
// Problem size handed to the benchmark.
344 int N = (1 << 22);
345
// Third argument is the time_threshold parameter of fma_chains_bench.
346 auto result
347 = sham::benchmarks::fma_chains_bench<T>(instance::get_compute_scheduler_ptr(), N, 0.2);
348
// Per-type label; flops_multiplier equals the number of components of T (1, 2, 3 or 4)
// and scales every reported flops value below.
349 std::string type_name;
350 f64 flops_multiplier = 1;
351 if constexpr (std::is_same_v<T, f32>) {
352 type_name = "f32";
353 flops_multiplier = 1;
354 } else if constexpr (std::is_same_v<T, f64>) {
355 type_name = "f64";
356 flops_multiplier = 1;
357 } else if constexpr (std::is_same_v<T, f32_2>) {
358 type_name = "f32_2";
359 flops_multiplier = 2;
360 } else if constexpr (std::is_same_v<T, f64_2>) {
361 type_name = "f64_2";
362 flops_multiplier = 2;
363 } else if constexpr (std::is_same_v<T, f32_3>) {
364 type_name = "f32_3";
365 flops_multiplier = 3;
366 } else if constexpr (std::is_same_v<T, f64_3>) {
367 type_name = "f64_3";
368 flops_multiplier = 3;
369 } else if constexpr (std::is_same_v<T, f32_4>) {
370 type_name = "f32_4";
371 flops_multiplier = 4;
372 } else if constexpr (std::is_same_v<T, f64_4>) {
373 type_name = "f64_4";
374 flops_multiplier = 4;
375 } else {
// NOTE(review): the body of this branch (source line 376) is missing from this listing;
// presumably it rejects unsupported types — confirm against the repository.
377 }
378
// Reductions of the unscaled flops over all ranks.
379 f64 min_flop = shamalgs::collective::allreduce_min(result.flops);
380 f64 max_flop = shamalgs::collective::allreduce_max(result.flops);
381 f64 sum_flop = shamalgs::collective::allreduce_sum(result.flops);
382 f64 avg_flop = sum_flop / (f64) shamcomm::world_size();
383
// The stored value is the sum over ranks, scaled by the component count.
384 microbench_results["fma_chains_" + type_name] = sum_flop * flops_multiplier;
385
// Only world rank 0 reports.
// NOTE(review): the head of the logging call (source line 388) is missing from this listing.
386 if (shamcomm::world_rank() == 0) {
387 auto hr_flop = sham::to_human_readable<false>(sum_flop * flops_multiplier);
389 shambase::format(
390 " - fma_chains ({}) : {:.2f} {}flops (min = {:.1e}, max = {:.1e}, avg = {:.1e}) "
391 "({:.1e} ms, rotations = {})",
392 type_name,
393 hr_flop.value,
394 hr_flop.prefix,
395 min_flop * flops_multiplier,
396 max_flop * flops_multiplier,
397 avg_flop * flops_multiplier,
398 result.seconds * 1e3,
399 result.nrotations));
400 }
401}
402
404
405 using T = u64;
406 std::vector<u64> send_data(el_per_rank);
407
408 std::vector<u64> recv_data;
409
410 f64 t = 0;
411 u64 loops = 0;
412
413 auto benchmark_step = [&]() {
414 shamcomm::mpi::Barrier(MPI_COMM_WORLD);
415 f64 t_start = MPI_Wtime();
416 shamalgs::collective::vector_allgatherv(send_data, recv_data, MPI_COMM_WORLD);
417 f64 t_end = MPI_Wtime();
418 t += t_end - t_start;
419 loops++;
420 };
421
422 do {
423 benchmark_step();
424 } while (shamalgs::collective::allreduce_min(t) < 0.1);
425
426 t /= loops;
427
428 f64 min_t = shamalgs::collective::allreduce_min(t);
429 f64 max_t = shamalgs::collective::allreduce_max(t);
430 f64 sum_t = shamalgs::collective::allreduce_sum(t);
431 f64 avg_t = sum_t / (f64) shamcomm::world_size();
432
433 microbench_results["vector_allgather_u64_" + std::to_string(el_per_rank)] = avg_t;
434
435 if (shamcomm::world_rank() == 0) {
437 shambase::format(
438 " - vector_allgather (u64, n={:4}) : {:.3e} s (min = {:.2e}, max = {:.2e}, loops = "
439 "{})",
440 el_per_rank,
441 avg_t,
442 min_t,
443 max_t,
444 loops));
445 }
446}
447
448const std::unordered_map<std::string, double> &shamsys::get_microbench_results() {
449 return microbench_results;
450}
Shamrock communication buffers.
void vector_allgather(u32 el_per_rank)
Vector allgather benchmark.
void p2p_latency(u32 wr1, u32 wr2)
MPI point-to-point latency benchmark.
void p2p_bandwidth(u32 wr_sender, u32 wr_receiv)
MPI point-to-point bandwidth benchmark.
void saxpy()
SAXPY benchmark, to get the maximum bandwidth.
void fma_chains_rotation()
FMA chains benchmark to get the maximum floating point performance.
This header includes MPI and wraps the MPI calls.
Header file describing a Node Instance.
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
Class Timer measures the time elapsed since the timer was started.
Definition time.hpp:35
f64 elapsed_sec() const
Converts the stored nanosecond time to a floating point representation in seconds.
Definition time.hpp:87
void start()
Starts the timer.
Definition time.hpp:50
void stop()
Stops the timer and stores the elapsed time in nanoseconds.
Definition time.hpp:64
Shamrock communication buffers.
This header file contains utility functions related to exception handling in the code.
std::vector< int > vector_allgatherv(const std::vector< T > &send_vec, const MPI_Datatype &send_type, std::vector< T > &recv_vec, const MPI_Datatype &recv_type, const MPI_Comm comm)
allgatherv on vector with size query (size querying variant of vector_allgatherv_ks) //TODO add fault...
Definition exchanges.hpp:98
Port of Argonne National Laboratory's FMA chains benchmark flops.cpp.
fma_chains_result fma_chains_bench(DeviceScheduler_ptr sched, int N, f64 time_threshold)
Run the fma_chains benchmark.
Convert raw numeric values to human-readable SI-formatted pairs.
human_readable_t to_human_readable(double value)
Convert a raw value to a human-readable scaled form with an SI prefix.
std::string readable_sizeof(double size)
Given a sizeof value, return a readable string. Example: readable_sizeof(1e9) -> "1....
Definition string.hpp:84
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_erro...
Definition memory.hpp:110
ExcptTypes make_except_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Create an exception with a message and a location.
i32 world_rank()
Gives the rank of the current process in the MPI communicator.
Definition worldInfo.cpp:40
i32 world_size()
Gives the size of the MPI communicator.
Definition worldInfo.cpp:38
void run_micro_benchmark()
Run the latency & bandwidth benchmarks; those benchmarks were adapted from osu_microbenchmark.
const std::unordered_map< std::string, double > & get_microbench_results()
Get the microbench results.
saxpy_result saxpy_bench(DeviceScheduler_ptr sched, int N, T init_x, T init_y, T a, int load_size, bool check_correctness)
saxpy function for benchmarking.
Definition saxpy.hpp:70
void raw_ln(Types... var2)
Prints a log message with multiple arguments followed by a newline.
Definition logs.hpp:90
This file contains the definition for the stacktrace related functionality.
shambase::details::BasicStackEntry StackEntry
Alias for shambase::details::BasicStackEntry.
void Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
MPI wrapper for MPI_Recv.
Definition wrapper.cpp:208
void Barrier(MPI_Comm comm)
MPI wrapper for MPI_Barrier.
Definition wrapper.cpp:194
void Waitall(int count, MPI_Request array_of_requests[], MPI_Status *array_of_statuses)
MPI wrapper for MPI_Waitall.
Definition wrapper.cpp:187
void Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
MPI wrapper for MPI_Send.
Definition wrapper.cpp:229
void Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
MPI wrapper for MPI_Isend.
Definition wrapper.cpp:85