Note
Go to the end to download the full example code.
Reduction performance benchmarks
This example benchmarks the reduction performance of the different algorithms available in Shamrock.
9 import random
10 import time
11
12 import matplotlib.colors as colors
13 import matplotlib.pyplot as plt
14 import numpy as np
15
16 import shamrock
17
# If we use the shamrock executable to run this script instead of the python interpreter,
# we should not initialize the system as the shamrock executable needs to handle specific MPI logic
if not shamrock.sys.is_initialized():
    # Only reached when shamrock reports that it has not been initialized yet:
    # set the log level, then initialize the system with the "0:0" argument.
    shamrock.change_loglevel(1)
    shamrock.sys.init("0:0")
Use shamrock documentation style for matplotlib
# Apply the shamrock documentation style to matplotlib before any figure is created
shamrock.matplotlib.set_shamrock_mpl_style()
Main benchmark functions
def benchmark_f32(N, nb_repeat=10):
    """Benchmark the sum reduction on a zero-filled f32 device buffer.

    For each of the ``nb_repeat`` repetitions a new ``DeviceBuffer_f32`` is
    created, resized to ``N`` elements and filled with 0, then handed to
    ``shamrock.algs.benchmark_reduction_sum``; the value it returns is recorded.

    Args:
        N: number of elements in the buffer.
        nb_repeat: number of repetitions.

    Returns:
        A ``(min, max, mean)`` tuple computed over the recorded values.
    """

    def timed_run():
        # One repetition: fresh buffer of N zeros, then the benchmarked reduction
        device_buf = shamrock.backends.DeviceBuffer_f32()
        device_buf.resize(N)
        device_buf.fill(0)
        return shamrock.algs.benchmark_reduction_sum(device_buf, N)

    samples = [timed_run() for _ in range(nb_repeat)]
    fastest = min(samples)
    slowest = max(samples)
    return fastest, slowest, sum(samples) / nb_repeat
41
42
def benchmark_f64(N, nb_repeat=10):
    """Benchmark the sum reduction on a zero-filled f64 device buffer.

    For each of the ``nb_repeat`` repetitions a new ``DeviceBuffer_f64`` is
    created, resized to ``N`` elements and filled with 0, then handed to
    ``shamrock.algs.benchmark_reduction_sum``; the value it returns is recorded.

    Args:
        N: number of elements in the buffer.
        nb_repeat: number of repetitions.

    Returns:
        A ``(min, max, mean)`` tuple computed over the recorded values.
    """

    def timed_run():
        # One repetition: fresh buffer of N zeros, then the benchmarked reduction
        device_buf = shamrock.backends.DeviceBuffer_f64()
        device_buf.resize(N)
        device_buf.fill(0)
        return shamrock.algs.benchmark_reduction_sum(device_buf, N)

    samples = [timed_run() for _ in range(nb_repeat)]
    fastest = min(samples)
    slowest = max(samples)
    return fastest, slowest, sum(samples) / nb_repeat
Run the performance test for all parameters
def run_performance_sweep():
    """Benchmark the f32 and f64 reductions over a log-spaced range of sizes.

    For every element count, ``benchmark_f32`` and ``benchmark_f64`` are run
    and the minimum value each one reports is kept. Progress is printed as
    one line per element count.

    Returns:
        A ``(particle_counts, results_f32, results_f64)`` tuple: the list of
        element counts and, for each count, the minimum reported by the f32
        and f64 benchmarks respectively.
    """
    # 20 sizes log-spaced between 1e2 and 1e7, truncated to integers
    particle_counts = np.logspace(2, 7, 20).astype(int).tolist()

    results_f32 = []
    results_f64 = []

    print(f"Particle counts: {particle_counts}")

    total_runs = len(particle_counts)

    # enumerate(start=1) provides the 1-based progress counter directly
    for current_run, N in enumerate(particle_counts, start=1):
        print(
            f"[{current_run:2d}/{total_runs}] Running N={N:5d}...",
            end=" ",
        )

        # perf_counter is a monotonic high-resolution clock meant for measuring
        # intervals; time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        min_f32, _, _ = benchmark_f32(N)
        results_f32.append(min_f32)
        min_f64, _, mean_f64 = benchmark_f64(N)
        results_f64.append(min_f64)
        elapsed = time.perf_counter() - start_time

        # The "mean" shown here is the one of the f64 benchmark only; "took"
        # covers both the f32 and f64 benchmarks for this element count.
        print(f"mean={mean_f64:.3f}s (took {elapsed:.1f}s)")

    return particle_counts, results_f32, results_f64
List current implementation
# Query the reduction implementation currently selected in shamrock.algs and display it
current_impl = shamrock.algs.get_current_impl_reduction()

print(current_impl)
impl_param(impl_name="group_reduction128", params="")
List all implementations available
# Fetch the list of default reduction implementations; each entry exposes
# impl_name and params, which are used below to select the implementation
all_default_impls = shamrock.algs.get_default_impl_list_reduction()

print(all_default_impls)
[impl_param(impl_name="fallback", params=""), impl_param(impl_name="group_reduction16", params=""), impl_param(impl_name="group_reduction128", params=""), impl_param(impl_name="group_reduction256", params="")]
Run the performance benchmarks for all implementations
# Benchmark every default implementation and overlay all curves on one figure
for impl in all_default_impls:
    shamrock.algs.set_impl_reduction(impl.impl_name, impl.params)

    print(f"Running reduction performance benchmarks for {impl}...")

    # One full sweep over the element counts with this implementation selected
    particle_counts, results_f32, results_f64 = run_performance_sweep()

    # The f64 curve is drawn first so that the f32 curve can reuse its color
    (line,) = plt.plot(
        particle_counts,
        results_f64,
        "--.",
        label=f"{impl.impl_name} (f64)",
    )
    plt.plot(
        particle_counts,
        results_f32,
        ":",
        label=f"{impl.impl_name} (f32)",
        color=line.get_color(),
    )


# Reference line: time taken at a constant throughput of 1e8 elements per second
Nobj = np.array(particle_counts)
Time100M = Nobj / 1e8
plt.plot(
    particle_counts,
    Time100M,
    label="100M obj/sec",
    linestyle="-",
    color="grey",
    alpha=0.7,
)


# Log-log axes
plt.xscale("log")
plt.yscale("log")

# Titles and axis labels
plt.title("reduction performance benchmarks")
plt.xlabel("Number of elements")
plt.ylabel("Time (s)")

plt.grid(True)

plt.legend(fontsize=10)
plt.show()

Info: setting reduction implementation to impl : fallback [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="fallback", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N= 100... mean=0.000s (took 0.0s)
[ 2/20] Running N= 183... mean=0.000s (took 0.0s)
[ 3/20] Running N= 335... mean=0.000s (took 0.0s)
[ 4/20] Running N= 615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.001s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.002s (took 0.0s)
[17/20] Running N=1623776... mean=0.004s (took 0.1s)
[18/20] Running N=2976351... mean=0.007s (took 0.1s)
[19/20] Running N=5455594... mean=0.008s (took 0.2s)
[20/20] Running N=10000000... mean=0.019s (took 0.4s)
Info: setting reduction implementation to impl : group_reduction16 [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="group_reduction16", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N= 100... mean=0.000s (took 0.0s)
[ 2/20] Running N= 183... mean=0.000s (took 0.0s)
[ 3/20] Running N= 335... mean=0.000s (took 0.0s)
[ 4/20] Running N= 615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.000s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.001s (took 0.0s)
[17/20] Running N=1623776... mean=0.002s (took 0.0s)
[18/20] Running N=2976351... mean=0.004s (took 0.1s)
[19/20] Running N=5455594... mean=0.007s (took 0.2s)
[20/20] Running N=10000000... mean=0.018s (took 0.3s)
Info: setting reduction implementation to impl : group_reduction128 [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="group_reduction128", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N= 100... mean=0.000s (took 0.0s)
[ 2/20] Running N= 183... mean=0.000s (took 0.0s)
[ 3/20] Running N= 335... mean=0.000s (took 0.0s)
[ 4/20] Running N= 615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.000s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.001s (took 0.0s)
[17/20] Running N=1623776... mean=0.002s (took 0.0s)
[18/20] Running N=2976351... mean=0.004s (took 0.1s)
[19/20] Running N=5455594... mean=0.007s (took 0.2s)
[20/20] Running N=10000000... mean=0.017s (took 0.3s)
Info: setting reduction implementation to impl : group_reduction256 [tree][rank=0]
Running reduction performance benchmarks for impl_param(impl_name="group_reduction256", params="")...
Particle counts: [100, 183, 335, 615, 1128, 2069, 3792, 6951, 12742, 23357, 42813, 78475, 143844, 263665, 483293, 885866, 1623776, 2976351, 5455594, 10000000]
[ 1/20] Running N= 100... mean=0.000s (took 0.0s)
[ 2/20] Running N= 183... mean=0.000s (took 0.0s)
[ 3/20] Running N= 335... mean=0.000s (took 0.0s)
[ 4/20] Running N= 615... mean=0.000s (took 0.0s)
[ 5/20] Running N= 1128... mean=0.000s (took 0.0s)
[ 6/20] Running N= 2069... mean=0.000s (took 0.0s)
[ 7/20] Running N= 3792... mean=0.000s (took 0.0s)
[ 8/20] Running N= 6951... mean=0.000s (took 0.0s)
[ 9/20] Running N=12742... mean=0.000s (took 0.0s)
[10/20] Running N=23357... mean=0.000s (took 0.0s)
[11/20] Running N=42813... mean=0.000s (took 0.0s)
[12/20] Running N=78475... mean=0.000s (took 0.0s)
[13/20] Running N=143844... mean=0.000s (took 0.0s)
[14/20] Running N=263665... mean=0.000s (took 0.0s)
[15/20] Running N=483293... mean=0.001s (took 0.0s)
[16/20] Running N=885866... mean=0.001s (took 0.0s)
[17/20] Running N=1623776... mean=0.002s (took 0.0s)
[18/20] Running N=2976351... mean=0.004s (took 0.1s)
[19/20] Running N=5455594... mean=0.007s (took 0.2s)
[20/20] Running N=10000000... mean=0.018s (took 0.3s)
Total running time of the script: (0 minutes 3.795 seconds)
Estimated memory usage: 316 MB