38 std::unordered_map<std::string, double> microbench_results = {};
42namespace shamsys::microbench {
105 u64 length = 1024UL * 1014UL * 8UL;
109 std::vector<MPI_Request> rqs;
113 bool is_used =
false;
117 mpi::barrier(MPI_COMM_WORLD);
118 f64 t_start = MPI_Wtime();
120 if (wr == wr_sender) {
121 rqs.push_back(MPI_Request{});
122 u32 rq_index = rqs.size() - 1;
123 auto &rq = rqs[rq_index];
125 buf_send.get_ptr(), length, MPI_BYTE, wr_receiv, 0, MPI_COMM_WORLD, &rq);
129 if (wr == wr_receiv) {
132 buf_recv.get_ptr(), length, MPI_BYTE, wr_sender, 0, MPI_COMM_WORLD, &s);
139 std::vector<MPI_Status> st_lst(rqs.size());
140 if (rqs.size() > 0) {
143 f64 t_end = MPI_Wtime();
144 t += t_end - t_start;
146 }
while (shamalgs::collective::allreduce_min(t) < 1);
148 f64 bw =
f64(length * loops) / t;
150 microbench_results[
"p2p_bandwidth"] = bw;
156 " - p2p bandwidth : {:.2f} {}B.s^-1 (ranks : {} -> {}) (loops : {})",
170 "can not launch this test with same ranks");
184 bool is_used =
false;
188 mpi::barrier(MPI_COMM_WORLD);
189 f64 t_start = MPI_Wtime();
208 f64 t_end = MPI_Wtime();
209 t += t_end - t_start;
213 }
while (shamalgs::collective::allreduce_min(bench_timer.
elapsed_sec()) < 1);
215 f64 latency = t /
f64(loops);
216 microbench_results[
"p2p_latency"] = latency;
221 " - p2p latency : {:.4e} s (ranks : {} <-> {}) (loops : {})",
231 int Tsize =
sizeof(T);
233 std::string type_name;
235 if constexpr (std::is_same_v<T, f32>) {
240 }
else if constexpr (std::is_same_v<T, f64>) {
245 }
else if constexpr (std::is_same_v<T, f32_2>) {
247 init_x = {1.0f, 1.0f};
248 init_y = {2.0f, 2.0f};
250 }
else if constexpr (std::is_same_v<T, f64_2>) {
255 }
else if constexpr (std::is_same_v<T, f32_3>) {
257 init_x = {1.0f, 1.0f, 1.0f};
258 init_y = {2.0f, 2.0f, 2.0f};
259 a = {2.0f, 2.0f, 2.0f};
260 }
else if constexpr (std::is_same_v<T, f64_3>) {
262 init_x = {1.0, 1.0, 1.0};
263 init_y = {2.0, 2.0, 2.0};
265 }
else if constexpr (std::is_same_v<T, f32_4>) {
267 init_x = {1.0f, 1.0f, 1.0f, 1.0f};
268 init_y = {2.0f, 2.0f, 2.0f, 2.0f};
269 a = {2.0f, 2.0f, 2.0f, 2.0f};
270 }
else if constexpr (std::is_same_v<T, f64_4>) {
272 init_x = {1.0, 1.0, 1.0, 1.0};
273 init_y = {2.0, 2.0, 2.0, 2.0};
274 a = {2.0, 2.0, 2.0, 2.0};
279 auto bench_step = [&](
int N) {
281 instance::get_compute_scheduler_ptr(), N, init_x, init_y, a, Tsize, N < (1 << 17));
284 auto benchmark = [&]() {
285 size_t N = (1 << 15);
288 auto &dev_ptr = dev_ctx.device;
292 = std::min<size_t>(dev.prop.max_mem_alloc_size_dev, dev.prop.global_mem_size);
293 double max_size = double(max_alloc) / (Tsize * 4);
294 if (max_size >= (1 << 30)) {
295 max_size = (1 << 30);
298 auto result = bench_step(shambase::narrow_or_throw<i32>(N));
300 for (; N <= (1 << 30) && static_cast<double>(N) <= max_size; N *= 2) {
301 result = bench_step(shambase::narrow_or_throw<i32>(N));
306 if (result.seconds > 1e-3) {
314 auto result = benchmark();
316 f64 bw = result.bandwidth * 1e9;
318 f64 min_bw = shamalgs::collective::allreduce_min(bw);
319 f64 max_bw = shamalgs::collective::allreduce_max(bw);
320 f64 sum_bw = shamalgs::collective::allreduce_sum(bw);
323 microbench_results[
"saxpy_" + type_name] = sum_bw;
329 " - saxpy ({}) : {:.2f} {}B.s^-1 (min = {:.1e}, max = {:.1e}, avg = {:.1e}) "
337 result.seconds * 1e3,
349 std::string type_name;
350 f64 flops_multiplier = 1;
351 if constexpr (std::is_same_v<T, f32>) {
353 flops_multiplier = 1;
354 }
else if constexpr (std::is_same_v<T, f64>) {
356 flops_multiplier = 1;
357 }
else if constexpr (std::is_same_v<T, f32_2>) {
359 flops_multiplier = 2;
360 }
else if constexpr (std::is_same_v<T, f64_2>) {
362 flops_multiplier = 2;
363 }
else if constexpr (std::is_same_v<T, f32_3>) {
365 flops_multiplier = 3;
366 }
else if constexpr (std::is_same_v<T, f64_3>) {
368 flops_multiplier = 3;
369 }
else if constexpr (std::is_same_v<T, f32_4>) {
371 flops_multiplier = 4;
372 }
else if constexpr (std::is_same_v<T, f64_4>) {
374 flops_multiplier = 4;
379 f64 min_flop = shamalgs::collective::allreduce_min(result.flops);
380 f64 max_flop = shamalgs::collective::allreduce_max(result.flops);
381 f64 sum_flop = shamalgs::collective::allreduce_sum(result.flops);
384 microbench_results[
"fma_chains_" + type_name] = sum_flop * flops_multiplier;
390 " - fma_chains ({}) : {:.2f} {}flops (min = {:.1e}, max = {:.1e}, avg = {:.1e}) "
391 "({:.1e} ms, rotations = {})",
395 min_flop * flops_multiplier,
396 max_flop * flops_multiplier,
397 avg_flop * flops_multiplier,
398 result.seconds * 1e3,
406 std::vector<u64> send_data(el_per_rank);
408 std::vector<u64> recv_data;
413 auto benchmark_step = [&]() {
415 f64 t_start = MPI_Wtime();
417 f64 t_end = MPI_Wtime();
418 t += t_end - t_start;
424 }
while (shamalgs::collective::allreduce_min(t) < 0.1);
428 f64 min_t = shamalgs::collective::allreduce_min(t);
429 f64 max_t = shamalgs::collective::allreduce_max(t);
430 f64 sum_t = shamalgs::collective::allreduce_sum(t);
433 microbench_results[
"vector_allgather_u64_" + std::to_string(el_per_rank)] = avg_t;
438 " - vector_allgather (u64, n={:4}) : {:.3e} s (min = {:.2e}, max = {:.2e}, loops = "
449 return microbench_results;
Shamrock communication buffers.
void vector_allgather(u32 el_per_rank)
Vector allgather benchmark.
void p2p_latency(u32 wr1, u32 wr2)
MPI point-to-point latency benchmark.
void p2p_bandwidth(u32 wr_sender, u32 wr_receiv)
MPI point-to-point bandwidth benchmark.
void saxpy()
SAXPY benchmark, to get the maximum bandwidth.
void fma_chains_rotation()
FMA chains benchmark to get the maximum floating point performance.
This header includes MPI and wraps the MPI calls.
Header file describing a Node Instance.
double f64
Alias for double.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
Class Timer measures the time elapsed since the timer was started.
f64 elapsed_sec() const
Converts the stored nanosecond time to a floating point representation in seconds.
void start()
Starts the timer.
void stop()
Stops the timer and stores the elapsed time in nanoseconds.
Shamrock communication buffers.
This header file contains utility functions related to exception handling in the code.
std::vector< int > vector_allgatherv(const std::vector< T > &send_vec, const MPI_Datatype &send_type, std::vector< T > &recv_vec, const MPI_Datatype &recv_type, const MPI_Comm comm)
allgatherv on vector with size query (size querying variant of vector_allgatherv_ks) //TODO add fault...
Port of Argonne National Laboratory's FMA chains benchmark flops.cpp.
fma_chains_result fma_chains_bench(DeviceScheduler_ptr sched, int N, f64 time_threshold)
Run the fma_chains benchmark.
Convert raw numeric values to human-readable SI-formatted pairs.
human_readable_t to_human_readable(double value)
Convert a raw value to a human-readable scaled form with an SI prefix.
std::string readable_sizeof(double size)
Given a sizeof value, return a readable string. Example: readable_sizeof(1e9) -> "1....
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_erro...
ExcptTypes make_except_with_loc(std::string message, SourceLocation loc=SourceLocation{})
Create an exception with a message and a location.
i32 world_rank()
Gives the rank of the current process in the MPI communicator.
i32 world_size()
Gives the size of the MPI communicator.
void run_micro_benchmark()
Run the latency & bandwidth benchmarks; these benchmarks were adapted from osu_microbenchmark.
const std::unordered_map< std::string, double > & get_microbench_results()
Get the microbench results.
saxpy_result saxpy_bench(DeviceScheduler_ptr sched, int N, T init_x, T init_y, T a, int load_size, bool check_correctness)
saxpy function for benchmarking.
void raw_ln(Types... var2)
Prints a log message with multiple arguments followed by a newline.
This file contains the definition for the stacktrace related functionality.
shambase::details::BasicStackEntry StackEntry
Alias for shambase::details::BasicStackEntry.
void Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
MPI wrapper for MPI_Recv.
void Barrier(MPI_Comm comm)
MPI wrapper for MPI_Barrier.
void Waitall(int count, MPI_Request array_of_requests[], MPI_Status *array_of_statuses)
MPI wrapper for MPI_Waitall.
void Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
MPI wrapper for MPI_Send.
void Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
MPI wrapper for MPI_Isend.