Reduction performance benchmarks

This example benchmarks the performance of the different reduction algorithms available in Shamrock.

 import random
 import time

 import matplotlib.colors as colors
 import matplotlib.pyplot as plt
 import numpy as np

 import shamrock

 # When this script is run through the shamrock executable instead of the plain
 # Python interpreter, the executable needs to handle specific MPI logic itself,
 # so the system must only be initialized here if that has not happened already.
 if not shamrock.sys.is_initialized():
     shamrock.change_loglevel(1)
     # NOTE(review): "0:0" is presumably a device/queue selector -- confirm
     # against the shamrock documentation.
     shamrock.sys.init("0:0")

Main benchmark functions

 def benchmark_f32(N, nb_repeat=10):
     """Benchmark the sum reduction on a zero-filled f32 device buffer.

     A fresh ``DeviceBuffer_f32`` is created, resized to ``N`` elements and
     filled with zeros for each of the ``nb_repeat`` measurements.

     Returns:
         A ``(min, max, mean)`` tuple of the values returned by
         ``shamrock.algs.benchmark_reduction_sum`` (presumably seconds, as
         the callers print and plot them as such).
     """

     def _single_run():
         # One measurement on a newly allocated buffer
         device_buf = shamrock.backends.DeviceBuffer_f32()
         device_buf.resize(N)
         device_buf.fill(0)
         return shamrock.algs.benchmark_reduction_sum(device_buf, N)

     timings = [_single_run() for _ in range(nb_repeat)]
     fastest = min(timings)
     slowest = max(timings)
     return fastest, slowest, sum(timings) / nb_repeat


 def benchmark_f64(N, nb_repeat=10):
     """Benchmark the sum reduction on a zero-filled f64 device buffer.

     A fresh ``DeviceBuffer_f64`` is created, resized to ``N`` elements and
     filled with zeros for each of the ``nb_repeat`` measurements.

     Returns:
         A ``(min, max, mean)`` tuple of the values returned by
         ``shamrock.algs.benchmark_reduction_sum`` (presumably seconds, as
         the callers print and plot them as such).
     """

     def _single_run():
         # One measurement on a newly allocated buffer
         device_buf = shamrock.backends.DeviceBuffer_f64()
         device_buf.resize(N)
         device_buf.fill(0)
         return shamrock.algs.benchmark_reduction_sum(device_buf, N)

     timings = [_single_run() for _ in range(nb_repeat)]
     fastest = min(timings)
     slowest = max(timings)
     return fastest, slowest, sum(timings) / nb_repeat

Define the performance sweep over all element counts

 def run_performance_sweep():
     """Run the f32 and f64 reduction benchmarks over a range of element counts.

     The sweep covers 20 logarithmically spaced element counts from 1e2 to
     1e7. For each count both ``benchmark_f32`` and ``benchmark_f64`` are run
     and one progress line is printed.

     Returns:
         A ``(particle_counts, results_f32, results_f64)`` tuple. The two
         result lists hold, for each element count, the minimum time reported
         by the corresponding benchmark.
     """
     # Logarithmically spaced sizes, truncated to plain Python ints
     particle_counts = np.logspace(2, 7, 20).astype(int).tolist()

     results_f32 = []
     results_f64 = []

     print(f"Particle counts: {particle_counts}")

     total_runs = len(particle_counts)

     for current_run, N in enumerate(particle_counts, start=1):
         print(
             f"[{current_run:2d}/{total_runs}] Running N={N:5d}...",
             end=" ",
         )

         # perf_counter is monotonic, so the elapsed time cannot be skewed by
         # system clock adjustments the way time.time() can.
         start_time = time.perf_counter()
         min_f32, _, _ = benchmark_f32(N)
         results_f32.append(min_f32)
         min_f64, _, mean_time = benchmark_f64(N)
         results_f64.append(min_f64)
         elapsed = time.perf_counter() - start_time

         # NOTE: the mean reported here is the f64 one only; the f32 mean is
         # not printed. "took" covers both benchmarks.
         print(f"mean={mean_time:.3f}s (took {elapsed:.1f}s)")

     return particle_counts, results_f32, results_f64

Show the currently selected reduction implementation

 # Query the reduction implementation that is selected at this point
 current_impl = shamrock.algs.get_current_impl_reduction()

 print(current_impl)

impl_param(impl_name="group_reduction128", params="")

List all implementations available

 # Default list of reduction implementations; each entry exposes
 # ``impl_name`` and ``params`` (used below to select it)
 all_default_impls = shamrock.algs.get_default_impl_list_reduction()

 print(all_default_impls)

[impl_param(impl_name="fallback", params=""), impl_param(impl_name="group_reduction16", params=""), impl_param(impl_name="group_reduction128", params=""), impl_param(impl_name="group_reduction256", params="")]

Run the performance benchmarks for all implementations

 for impl in all_default_impls:
     # Select the implementation before measuring it
     shamrock.algs.set_impl_reduction(impl.impl_name, impl.params)

     print(f"Running reduction performance benchmarks for {impl}...")

     particle_counts, results_f32, results_f64 = run_performance_sweep()

     # The f64 and f32 curves of one implementation share the same color
     label_f64 = impl.impl_name + " (f64)"
     label_f32 = impl.impl_name + " (f32)"
     (line,) = plt.plot(particle_counts, results_f64, "--.", label=label_f64)
     plt.plot(particle_counts, results_f32, ":", color=line.get_color(), label=label_f32)


 # Reference line: time needed at a throughput of 1e8 elements per second
 Nobj = np.array(particle_counts)
 Time100M = Nobj / 1e8
 plt.plot(
     particle_counts,
     Time100M,
     color="grey",
     linestyle="-",
     alpha=0.7,
     label="100M obj/sec",
 )

 # Both axes are logarithmic since the sizes span several decades
 plt.xscale("log")
 plt.yscale("log")

 plt.title("reduction performance benchmarks")
 plt.xlabel("Number of elements")
 plt.ylabel("Time (s)")

 plt.grid(True)
 plt.legend()
 plt.show()

Info: setting reduction implementation to impl : fallback                            [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="fallback", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N=  100... mean=0.000s (took 0.0s)
[ 2/20] Running N=  183... mean=0.000s (took 0.0s)
[ 3/20] Running N=  335... mean=0.000s (took 0.0s)
[ 4/20] Running N=  615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.001s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.002s (took 0.0s)
[17/20] Running N=1623776... mean=0.005s (took 0.1s)
[18/20] Running N=2976351... mean=0.009s (took 0.2s)
[19/20] Running N=5455594... mean=0.011s (took 0.3s)
[20/20] Running N=10000000... mean=0.028s (took 0.6s)
Info: setting reduction implementation to impl : group_reduction16                   [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="group_reduction16", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N=  100... mean=0.000s (took 0.0s)
[ 2/20] Running N=  183... mean=0.000s (took 0.0s)
[ 3/20] Running N=  335... mean=0.000s (took 0.0s)
[ 4/20] Running N=  615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.000s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.001s (took 0.0s)
[17/20] Running N=1623776... mean=0.002s (took 0.0s)
[18/20] Running N=2976351... mean=0.005s (took 0.1s)
[19/20] Running N=5455594... mean=0.011s (took 0.2s)
[20/20] Running N=10000000... mean=0.020s (took 0.4s)
Info: setting reduction implementation to impl : group_reduction128                  [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="group_reduction128", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N=  100... mean=0.000s (took 0.0s)
[ 2/20] Running N=  183... mean=0.000s (took 0.0s)
[ 3/20] Running N=  335... mean=0.000s (took 0.0s)
[ 4/20] Running N=  615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.000s (took 0.0s)
[15/20] Running N=483293... mean=0.000s (took 0.0s)
[16/20] Running N=885866... mean=0.001s (took 0.0s)
[17/20] Running N=1623776... mean=0.002s (took 0.0s)
[18/20] Running N=2976351... mean=0.003s (took 0.1s)
[19/20] Running N=5455594... mean=0.007s (took 0.2s)
[20/20] Running N=10000000... mean=0.018s (took 0.4s)
Info: setting reduction implementation to impl : group_reduction256                  [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="group_reduction256", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N=  100... mean=0.000s (took 0.0s)
[ 2/20] Running N=  183... mean=0.000s (took 0.0s)
[ 3/20] Running N=  335... mean=0.000s (took 0.0s)
[ 4/20] Running N=  615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.000s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.001s (took 0.0s)
[17/20] Running N=1623776... mean=0.002s (took 0.0s)
[18/20] Running N=2976351... mean=0.003s (took 0.1s)
[19/20] Running N=5455594... mean=0.007s (took 0.2s)
[20/20] Running N=10000000... mean=0.018s (took 0.4s)

Total running time of the script: (0 minutes 4.529 seconds)

Estimated memory usage: 271 MB

Gallery generated by Sphinx-Gallery

reduction performance benchmarks#

This Page