DTT performance benchmarks

This example benchmarks the performance of the dual tree traversal (DTT) for the different algorithms available in Shamrock.

 9 import random
10 import time
11
12 import matplotlib.colors as colors
13 import matplotlib.pyplot as plt
14 import numpy as np
15
16 import shamrock
17
# If this script is run through the shamrock executable instead of the python interpreter,
# the system is already initialized (the executable needs to handle specific MPI logic itself),
# so we only initialize it here when that has not been done yet.
if not shamrock.sys.is_initialized():
    shamrock.change_loglevel(1)
    shamrock.sys.init("0:0")

Main benchmark functions

# Box spanning (0, 0, 0) to (1, 1, 1); the benchmarks use it both as the bounds of the
# generated mock positions and as the bounding box passed to the tree build.
bounding_box = shamrock.math.AABB_f64_3((0.0, 0.0, 0.0), (1.0, 1.0, 1.0))
29
30
def benchmark_dtt_core(N, theta_crit, compression_level, ordered_result, nb_repeat=10):
    """Benchmark the dual tree traversal on ``nb_repeat`` freshly built trees.

    Each repetition generates ``N`` mock positions inside ``bounding_box``,
    builds a tree from them with the given ``compression_level`` and runs the
    traversal benchmark with ``theta_crit`` and ``ordered_result``.

    The global ``random`` generator is re-seeded with a fixed value on every
    call, so the sequence of mock-buffer seeds is the same from call to call.

    Returns:
        A tuple ``(times, max_mem_delta)``. ``times`` holds one entry per
        repetition: the value returned by the traversal benchmark multiplied
        by 1000 (callers print it as milliseconds). ``max_mem_delta`` is the
        largest increase of ``max_allocated_byte_device`` observed across the
        traversal calls, never less than 0.
    """
    random.seed(111)

    timings = []
    peak_delta = 0
    for _ in range(nb_repeat):
        buffer_seed = random.randint(0, 1000000)
        points = shamrock.algs.mock_buffer_f64_3(
            buffer_seed, N, bounding_box.lower, bounding_box.upper
        )

        bvh = shamrock.tree.CLBVH_u64_f64_3()
        bvh.rebuild_from_positions(points, bounding_box, compression_level)

        # Reset the recorded maximum so that the before/after difference only
        # accounts for the traversal itself.
        shamrock.backends.reset_mem_info_max()
        before = shamrock.backends.get_mem_perf_info()
        raw_time = shamrock.tree.benchmark_clbvh_dual_tree_traversal(
            bvh, theta_crit, ordered_result
        )
        timings.append(raw_time * 1000)
        after = shamrock.backends.get_mem_perf_info()

        growth = after.max_allocated_byte_device - before.max_allocated_byte_device
        peak_delta = max(peak_delta, growth)

    return timings, peak_delta
54
55
def benchmark_dtt(N, theta_crit, compression_level, ordered_result, nb_repeat=10):
    """Summarise the timings of ``benchmark_dtt_core`` for one configuration.

    Returns:
        A tuple ``(min_time, max_time, mean_time, max_mem_delta)`` where the
        mean is the sum of the timings divided by ``nb_repeat``.
    """
    samples, peak_mem_delta = benchmark_dtt_core(
        N, theta_crit, compression_level, ordered_result, nb_repeat
    )
    fastest = min(samples)
    slowest = max(samples)
    average = sum(samples) / nb_repeat
    return fastest, slowest, average, peak_mem_delta

Run the performance test for all parameters

 65 def run_performance_sweep(compression_level, threshold_run, ordered_result):
 66     # Define parameter ranges
 67     # logspace as array
 68     particle_counts = np.logspace(2, 7, 10).astype(int).tolist()
 69     theta_crits = [0.1, 0.3, 0.5, 0.7, 0.9]
 70
 71     # Initialize results matrix
 72     results_mean = np.zeros((len(theta_crits), len(particle_counts)))
 73     results_min = np.zeros((len(theta_crits), len(particle_counts)))
 74     results_max = np.zeros((len(theta_crits), len(particle_counts)))
 75     results_max_mem_delta = np.zeros((len(theta_crits), len(particle_counts)))
 76
 77     print(f"Particle counts: {particle_counts}")
 78     print(f"Theta_crit values: {theta_crits}")
 79     print(f"Compression level: {compression_level}")
 80
 81     total_runs = len(particle_counts) * len(theta_crits)
 82     current_run = 0
 83
 84     for i, theta_crit in enumerate(theta_crits):
 85         exceed_mem = False
 86         for j, N in enumerate(particle_counts):
 87             current_run += 1
 88
 89             if exceed_mem:
 90                 print(
 91                     f"[{current_run:2d}/{total_runs}] Skipping N={N:5d}, theta_crit={theta_crit:.1f}"
 92                 )
 93                 results_mean[i, j] = np.nan
 94                 results_min[i, j] = np.nan
 95                 results_max[i, j] = np.nan
 96                 continue
 97
 98             print(
 99                 f"[{current_run:2d}/{total_runs}] Running N={N:5d}, theta_crit={theta_crit:.1f}...",
100                 end=" ",
101             )
102
103             start_time = time.time()
104             min_time, max_time, mean_time, max_mem_delta = benchmark_dtt(
105                 N, theta_crit, compression_level, ordered_result
106             )
107             elapsed = time.time() - start_time
108
109             results_mean[i, j] = mean_time
110             results_min[i, j] = min_time
111             results_max[i, j] = max_time
112             results_max_mem_delta[i, j] = max_mem_delta
113
114             print(f"mean={mean_time:.3f}ms (took {elapsed:.1f}s)")
115
116             if max_mem_delta > threshold_run:
117                 exceed_mem = True
118
119     return (
120         particle_counts,
121         theta_crits,
122         results_mean,
123         results_min,
124         results_max,
125         results_max_mem_delta,
126     )

Create a checkerboard plot showing the execution times and the performance relative to the reference algorithm

def create_checkerboard_plot(
    particle_counts,
    theta_crits,
    results_data,
    compression_level,
    algname,
    max_axis_value,
    reference_data,
    results_max_mem_delta,
):
    """Create a checkerboard plot of execution times relative to a reference.

    Each cell is coloured by ``results_data / reference_data`` and annotated
    with the absolute time, the ratio and the memory delta (divided by 1e6 and
    labelled "MB").

    Args:
        particle_counts: particle counts, one per column.
        theta_crits: theta_crit values, one per row.
        results_data: 2D array of timings indexed ``[theta, particle_count]``;
            NaN marks a skipped benchmark.
        compression_level: only used in the title.
        algname: algorithm name, only used in the title.
        max_axis_value: currently unused; kept so existing callers keep working.
        reference_data: 2D array of reference timings, same shape as
            ``results_data``.
        results_max_mem_delta: 2D array of memory deltas, same shape as
            ``results_data``.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """

    fig, ax = plt.subplots(figsize=(12, 8))

    # Calculate relative performance compared to reference algorithm
    # results_data / reference_data gives the ratio (>1 means slower, <1 means faster)
    relative_performance = results_data / reference_data

    # Mask NaN ratios (skipped benchmarks or missing reference) so they are drawn white
    masked_relative = np.ma.masked_invalid(relative_performance)

    # Diverging colormap: RdYlGn_r goes from green (low values) to red (high values),
    # so cells faster than the reference (<1) are green and slower ones (>1) are red.
    cmap = plt.cm.RdYlGn_r.copy()
    cmap.set_bad(color="white")  # Set NaN values to white

    # Set the color scale limits for relative performance
    vmin = 0.5
    vmax = 1.5

    im = ax.imshow(
        masked_relative, cmap=cmap, aspect="auto", interpolation="nearest", vmin=vmin, vmax=vmax
    )

    n_rows = len(theta_crits)
    n_cols = len(particle_counts)

    # Set ticks and labels
    ax.set_xticks(range(n_cols))
    ax.set_yticks(range(n_rows))
    ax.set_xticklabels([f"{N // 1000}k" if N >= 1000 else str(N) for N in particle_counts])
    ax.set_yticklabels([f"{theta:.1f}" for theta in theta_crits])

    # Add labels
    ax.set_xlabel("Particle Count")
    ax.set_ylabel("Theta Critical")
    ax.set_title(
        f"Dual Tree Traversal Performance\n(Colors: Relative to Reference, Text: Absolute Time in ms)\ncompression level = {compression_level} algorithm = {algname}",
        pad=20,
    )

    # Add text annotations showing the values
    for i in range(n_rows):
        for j in range(n_cols):
            value = results_data[i, j]

            # Skipped benchmarks are left as blank white cells.
            if np.isnan(value):
                continue

            perf = relative_performance[i, j]
            # The reference may have been skipped for a cell this algorithm ran;
            # show "n/a" instead of a literal "nan" ratio in that case.
            perf_label = "n/a" if np.isnan(perf) else f"{perf:.2f}"
            mem_delta = results_max_mem_delta[i, j] / 1e6
            ax.text(
                j,
                i,
                f"{value:.2f}ms\n{perf_label}\n{mem_delta:.2f}MB",
                ha="center",
                va="center",
                color="black",
                fontweight="bold",
                fontsize=10,
            )

    # Add colorbar for relative performance
    cbar = plt.colorbar(im, ax=ax, shrink=0.8)
    cbar.set_label("Relative performance (time / reference time)")
    cbar.ax.tick_params(labelsize=10)

    # Only keep the candidate ticks that fall inside the colour scale limits
    tick_positions = [0.1, 0.2, 0.5, 1.0, 2.0, 3.0]
    cbar.set_ticks([pos for pos in tick_positions if vmin <= pos <= vmax])

    # Improve layout
    plt.tight_layout()

    # Add grid for better readability: minor ticks sit on the cell borders
    ax.set_xticks(np.arange(n_cols) - 0.5, minor=True)
    ax.set_yticks(np.arange(n_rows) - 0.5, minor=True)
    ax.grid(which="minor", color="black", linestyle="-", linewidth=1, alpha=0.3)

    return fig, ax

List current implementation

impl_param(impl_name="scan_multipass", params="")

List all implementations available

[impl_param(impl_name="reference", params=""), impl_param(impl_name="parallel_select", params=""), impl_param(impl_name="scan_multipass", params="")]

Run the performance benchmarks for all implementations

# Results of every sweep, keyed by "<impl_name> <params>ordered=<bool>".
results = {}

# Settings shared by all the sweeps below.
compression_level = 4
threshold_run = 5e6

for ordered_result in [True, False]:
    for default_impl in all_default_impls:
        # Select the traversal implementation used by the following benchmarks.
        shamrock.tree.set_impl_clbvh_dual_tree_traversal(
            default_impl.impl_name, default_impl.params
        )

        n = f"{default_impl.impl_name} {default_impl.params}ordered={ordered_result}"

        print(f"Running DTT performance benchmarks for {n}...")

        # Run the performance sweep
        (
            particle_counts,
            theta_crits,
            results_mean,
            results_min,
            results_max,
            results_max_mem_delta,
        ) = run_performance_sweep(compression_level, threshold_run, ordered_result)

        results[n] = dict(
            particle_counts=particle_counts,
            theta_crits=theta_crits,
            results_mean=results_mean,
            results_min=results_min,
            results_max=results_max,
            results_max_mem_delta=results_max_mem_delta,
            name=n,
        )
Info: setting dtt implementation to impl : reference                                 [tree][rank=0]
Running DTT performance benchmarks for reference ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=2.967ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=2.969ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=3.393ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=9.722ms (took 0.2s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=79.557ms (took 1.0s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=3.079ms (took 0.1s)
[12/50] Running N=  359, theta_crit=0.3... mean=2.736ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=3.101ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=6.678ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=26.421ms (took 0.5s)
[16/50] Running N=59948, theta_crit=0.3... mean=177.130ms (took 2.5s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=3.284ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=4.149ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=3.722ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=3.969ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=8.532ms (took 0.3s)
[26/50] Running N=59948, theta_crit=0.5... mean=34.320ms (took 0.9s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=3.614ms (took 0.1s)
[32/50] Running N=  359, theta_crit=0.7... mean=3.570ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=3.123ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=3.686ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=6.560ms (took 0.3s)
[36/50] Running N=59948, theta_crit=0.7... mean=18.602ms (took 0.8s)
[37/50] Running N=215443, theta_crit=0.7... mean=40.894ms (took 2.4s)
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=2.109ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=2.257ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=2.327ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=2.723ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=3.914ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=10.807ms (took 0.6s)
[47/50] Running N=215443, theta_crit=0.9... mean=23.101ms (took 2.4s)
[48/50] Running N=774263, theta_crit=0.9... mean=78.124ms (took 5.9s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select                           [tree][rank=0]
Running DTT performance benchmarks for parallel_select ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.640ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.783ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=2.844ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=10.603ms (took 0.2s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=94.939ms (took 1.1s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=1.120ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=1.169ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.721ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=7.474ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=51.683ms (took 0.6s)
[16/50] Running N=59948, theta_crit=0.3... mean=353.949ms (took 3.9s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=1.127ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.169ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.662ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=4.437ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=20.881ms (took 0.3s)
[26/50] Running N=59948, theta_crit=0.5... mean=109.711ms (took 1.4s)
[27/50] Running N=215443, theta_crit=0.5... mean=495.408ms (took 6.2s)
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=1.222ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=1.167ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.527ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=3.373ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=13.299ms (took 0.3s)
[36/50] Running N=59948, theta_crit=0.7... mean=64.594ms (took 1.0s)
[37/50] Running N=215443, theta_crit=0.7... mean=288.495ms (took 4.2s)
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.124ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=1.147ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.445ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=2.967ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=7.507ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=31.574ms (took 0.7s)
[47/50] Running N=215443, theta_crit=0.9... mean=131.548ms (took 2.6s)
[48/50] Running N=774263, theta_crit=0.9... mean=553.075ms (took 10.2s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass                            [tree][rank=0]
Running DTT performance benchmarks for scan_multipass ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=7.150ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=6.997ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=9.314ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=14.841ms (took 0.2s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=5.085ms (took 0.1s)
[12/50] Running N=  359, theta_crit=0.3... mean=9.551ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=12.957ms (took 0.2s)
[14/50] Running N= 4641, theta_crit=0.3... mean=18.088ms (took 0.3s)
[15/50] Running N=16681, theta_crit=0.3... mean=37.432ms (took 0.6s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=8.220ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=9.178ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=9.716ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=12.148ms (took 0.2s)
[25/50] Running N=16681, theta_crit=0.5... mean=19.856ms (took 0.4s)
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=6.994ms (took 0.1s)
[32/50] Running N=  359, theta_crit=0.7... mean=9.908ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=12.628ms (took 0.2s)
[34/50] Running N= 4641, theta_crit=0.7... mean=15.012ms (took 0.2s)
[35/50] Running N=16681, theta_crit=0.7... mean=21.068ms (took 0.4s)
[36/50] Running N=59948, theta_crit=0.7... mean=33.557ms (took 1.0s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=7.056ms (took 0.1s)
[42/50] Running N=  359, theta_crit=0.9... mean=10.080ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=12.597ms (took 0.2s)
[44/50] Running N= 4641, theta_crit=0.9... mean=14.902ms (took 0.2s)
[45/50] Running N=16681, theta_crit=0.9... mean=19.317ms (took 0.4s)
[46/50] Running N=59948, theta_crit=0.9... mean=27.713ms (took 1.0s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : reference                                 [tree][rank=0]
Running DTT performance benchmarks for reference ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.713ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.782ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=2.013ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=3.856ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=21.727ms (took 0.3s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=1.044ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=0.937ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.204ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=2.404ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=8.819ms (took 0.2s)
[16/50] Running N=59948, theta_crit=0.3... mean=49.576ms (took 0.8s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=1.131ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.129ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.148ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=1.704ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=3.616ms (took 0.2s)
[26/50] Running N=59948, theta_crit=0.5... mean=14.269ms (took 0.5s)
[27/50] Running N=215443, theta_crit=0.5... mean=45.217ms (took 1.7s)
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=1.069ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=0.995ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.185ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=2.968ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=3.412ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=7.880ms (took 0.4s)
[37/50] Running N=215443, theta_crit=0.7... mean=24.825ms (took 1.5s)
[38/50] Running N=774263, theta_crit=0.7... mean=100.545ms (took 5.5s)
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.070ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=0.980ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.039ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=1.296ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=2.020ms (took 0.1s)
[46/50] Running N=59948, theta_crit=0.9... mean=5.385ms (took 0.4s)
[47/50] Running N=215443, theta_crit=0.9... mean=17.126ms (took 1.5s)
[48/50] Running N=774263, theta_crit=0.9... mean=62.371ms (took 5.2s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select                           [tree][rank=0]
Running DTT performance benchmarks for parallel_select ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.114ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.184ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.738ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=8.602ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=95.784ms (took 1.1s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=1.127ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=1.195ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.775ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=7.341ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=51.560ms (took 0.6s)
[16/50] Running N=59948, theta_crit=0.3... mean=351.451ms (took 3.9s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=1.116ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.166ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.649ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=4.479ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=20.777ms (took 0.3s)
[26/50] Running N=59948, theta_crit=0.5... mean=109.787ms (took 1.5s)
[27/50] Running N=215443, theta_crit=0.5... mean=496.188ms (took 6.3s)
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=1.121ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=1.228ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.557ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=3.253ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=12.667ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=62.210ms (took 1.0s)
[37/50] Running N=215443, theta_crit=0.7... mean=282.615ms (took 4.1s)
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.176ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=1.251ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.503ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=2.506ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=7.169ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=30.303ms (took 0.7s)
[47/50] Running N=215443, theta_crit=0.9... mean=128.701ms (took 2.6s)
[48/50] Running N=774263, theta_crit=0.9... mean=552.513ms (took 10.2s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass                            [tree][rank=0]
Running DTT performance benchmarks for scan_multipass ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=4.315ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=5.888ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=7.628ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=10.864ms (took 0.2s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=4.030ms (took 0.1s)
[12/50] Running N=  359, theta_crit=0.3... mean=5.798ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=7.794ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=10.672ms (took 0.2s)
[15/50] Running N=16681, theta_crit=0.3... mean=16.948ms (took 0.3s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=4.011ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=5.860ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=7.870ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=9.971ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=13.483ms (took 0.3s)
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=4.079ms (took 0.1s)
[32/50] Running N=  359, theta_crit=0.7... mean=5.864ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=7.838ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=9.808ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=12.098ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=18.033ms (took 0.5s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=3.957ms (took 0.1s)
[42/50] Running N=  359, theta_crit=0.9... mean=5.822ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=7.795ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=9.749ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=11.914ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=17.152ms (took 0.5s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9

Plot the performance benchmarks for all implementations

import os

dump_folder = "_to_trash"

# Create the dump directory if it does not exist
if shamrock.sys.world_rank() == 0:
    os.makedirs(dump_folder, exist_ok=True)

# Every plot is coloured relative to the minimum timings of this run; the
# lookup does not depend on the loop below, so it is done once.
ref_key = "reference ordered=False"
reference_min = results[ref_key]["results_min"]
largest_refalg_value = np.nanmax(reference_min)

# One figure per benchmarked configuration, numbered in insertion order
for i, v in enumerate(results.values()):
    # Create the plot; it stays open so that plt.show() below displays it
    fig, ax = create_checkerboard_plot(
        v["particle_counts"],
        v["theta_crits"],
        v["results_min"],
        compression_level,
        v["name"],
        largest_refalg_value,
        reference_min,
        v["results_max_mem_delta"],
    )

    plt.savefig(f"{dump_folder}/benchmark-dtt-performance-{i}.pdf")

plt.show()
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = reference ordered=True
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = parallel_select ordered=True
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = scan_multipass ordered=True
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = reference ordered=False
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = parallel_select ordered=False
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = scan_multipass ordered=False

Total running time of the script: (2 minutes 0.497 seconds)

Estimated memory usage: 269 MB

Gallery generated by Sphinx-Gallery