DTT performance benchmarks

This example benchmarks the performance of the Dual Tree Traversal (DTT) for the different algorithms available in Shamrock.

 9 import random
10 import time
11
12 import matplotlib.colors as colors
13 import matplotlib.pyplot as plt
14 import numpy as np
15
16 import shamrock
17
# This script can be launched either with the plain python interpreter or with
# the shamrock executable. The executable handles its own MPI-specific logic,
# so the system is only initialized here when that has not been done already.
needs_init = not shamrock.sys.is_initialized()
if needs_init:
    shamrock.change_loglevel(1)
    shamrock.sys.init("0:0")

Main benchmark functions

# Domain in which the mock particle positions are generated and the tree is built.
bounding_box = shamrock.math.AABB_f64_3((0.0, 0.0, 0.0), (1.0, 1.0, 1.0))


def benchmark_dtt_core(N, theta_crit, compression_level, nb_repeat=10):
    """Time the dual tree traversal ``nb_repeat`` times on fresh mock data.

    Every repetition generates a new mock position buffer of ``N`` entries inside
    ``bounding_box``, rebuilds a CLBVH tree from it and benchmarks one dual tree
    traversal. The Python RNG is re-seeded with a fixed value on each call, so
    the sequence of mock-buffer seeds is the same from one call to the next.

    Parameters
    ----------
    N
        Number of positions requested from the mock buffer generator.
    theta_crit
        Forwarded to ``shamrock.tree.benchmark_clbvh_dual_tree_traversal``.
    compression_level
        Forwarded to ``tree.rebuild_from_positions``.
    nb_repeat
        Number of repetitions.

    Returns
    -------
    tuple
        ``(times, max_mem_delta)``: the list of per-repetition timings (benchmark
        result multiplied by 1000; the callers report it as milliseconds) and the
        largest observed increase of ``max_allocated_byte_device`` across one
        traversal.
    """
    random.seed(111)

    timings = []
    max_mem_delta = 0
    for _ in range(nb_repeat):
        buffer_seed = random.randint(0, 1000000)
        positions = shamrock.algs.mock_buffer_f64_3(
            buffer_seed, N, bounding_box.lower, bounding_box.upper
        )

        tree = shamrock.tree.CLBVH_u64_f64_3()
        tree.rebuild_from_positions(positions, bounding_box, compression_level)

        # Reset the recorded maximum so the before/after difference only
        # reflects what happens around the traversal itself.
        shamrock.backends.reset_mem_info_max()
        mem_before = shamrock.backends.get_mem_perf_info()
        traversal_time = shamrock.tree.benchmark_clbvh_dual_tree_traversal(tree, theta_crit)
        timings.append(traversal_time * 1000)
        mem_after = shamrock.backends.get_mem_perf_info()

        delta = mem_after.max_allocated_byte_device - mem_before.max_allocated_byte_device
        if delta > max_mem_delta:
            max_mem_delta = delta

    return timings, max_mem_delta
51
52
def benchmark_dtt(N, theta_crit, compression_level, nb_repeat=10):
    """Run ``benchmark_dtt_core`` and reduce its timings to summary statistics.

    Returns
    -------
    tuple
        ``(min_time, max_time, mean_time, max_mem_delta)`` where the mean is the
        sum of the timings divided by ``nb_repeat`` and ``max_mem_delta`` is
        passed through from ``benchmark_dtt_core``.
    """
    samples, peak_mem_delta = benchmark_dtt_core(N, theta_crit, compression_level, nb_repeat)

    fastest = min(samples)
    slowest = max(samples)
    average = sum(samples) / nb_repeat

    return fastest, slowest, average, peak_mem_delta

Run the performance test for all parameters

 60 def run_performance_sweep(compression_level, threshold_run):
 61
 62     # Define parameter ranges
 63     # logspace as array
 64     particle_counts = np.logspace(2, 7, 10).astype(int).tolist()
 65     theta_crits = [0.1, 0.3, 0.5, 0.7, 0.9]
 66
 67     # Initialize results matrix
 68     results_mean = np.zeros((len(theta_crits), len(particle_counts)))
 69     results_min = np.zeros((len(theta_crits), len(particle_counts)))
 70     results_max = np.zeros((len(theta_crits), len(particle_counts)))
 71     results_max_mem_delta = np.zeros((len(theta_crits), len(particle_counts)))
 72
 73     print(f"Particle counts: {particle_counts}")
 74     print(f"Theta_crit values: {theta_crits}")
 75     print(f"Compression level: {compression_level}")
 76
 77     total_runs = len(particle_counts) * len(theta_crits)
 78     current_run = 0
 79
 80     for i, theta_crit in enumerate(theta_crits):
 81         exceed_mem = False
 82         for j, N in enumerate(particle_counts):
 83             current_run += 1
 84
 85             if exceed_mem:
 86                 print(
 87                     f"[{current_run:2d}/{total_runs}] Skipping N={N:5d}, theta_crit={theta_crit:.1f}"
 88                 )
 89                 results_mean[i, j] = np.nan
 90                 results_min[i, j] = np.nan
 91                 results_max[i, j] = np.nan
 92                 continue
 93
 94             print(
 95                 f"[{current_run:2d}/{total_runs}] Running N={N:5d}, theta_crit={theta_crit:.1f}...",
 96                 end=" ",
 97             )
 98
 99             start_time = time.time()
100             min_time, max_time, mean_time, max_mem_delta = benchmark_dtt(
101                 N, theta_crit, compression_level
102             )
103             elapsed = time.time() - start_time
104
105             results_mean[i, j] = mean_time
106             results_min[i, j] = min_time
107             results_max[i, j] = max_time
108             results_max_mem_delta[i, j] = max_mem_delta
109
110             print(f"mean={mean_time:.3f}ms (took {elapsed:.1f}s)")
111
112             if max_mem_delta > threshold_run:
113                 exceed_mem = True
114
115     return (
116         particle_counts,
117         theta_crits,
118         results_mean,
119         results_min,
120         results_max,
121         results_max_mem_delta,
122     )

Create a checkerboard plot showing the execution times and the performance relative to the reference algorithm

def create_checkerboard_plot(
    particle_counts,
    theta_crits,
    results_data,
    compression_level,
    algname,
    max_axis_value,
    reference_data,
    results_max_mem_delta,
):
    """Create a checkerboard plot of execution times relative to a reference.

    Each cell corresponds to one ``(theta_crit, particle count)`` pair. The cell
    color encodes ``results_data / reference_data`` and the cell text shows the
    absolute time, that ratio and the memory delta. Cells whose ratio is NaN are
    drawn in white; cells whose time is NaN get no text.

    Parameters
    ----------
    particle_counts
        Particle counts, one per column.
    theta_crits
        Theta_crit values, one per row.
    results_data
        2D array of times indexed as ``[row, column]``.
    compression_level
        Only used in the plot title.
    algname
        Only used in the plot title.
    max_axis_value
        Currently unused; kept so existing callers keep working.
    reference_data
        2D array of reference times, same layout as ``results_data``.
    results_max_mem_delta
        2D array of memory deltas, divided by 1e6 and labelled "MB" in the text.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair of the created figure.
    """

    fig, ax = plt.subplots(figsize=(12, 8))

    # Ratio to the reference algorithm: > 1 means slower, < 1 means faster.
    relative_performance = results_data / reference_data

    # Mask NaN ratios (skipped benchmarks) so that they can be drawn separately.
    masked_relative = np.ma.masked_invalid(relative_performance)

    # RdYlGn runs from red (low) to green (high); the reversed map used here
    # therefore shows low ratios (< 1, faster) in green and high ratios
    # (> 1, slower) in red.
    cmap = plt.cm.RdYlGn_r.copy()
    cmap.set_bad(color="white")  # masked (NaN) cells are drawn in white

    # Color scale limits for the relative performance
    vmin = 0.5
    vmax = 1.5

    im = ax.imshow(
        masked_relative, cmap=cmap, aspect="auto", interpolation="nearest", vmin=vmin, vmax=vmax
    )

    # One major tick per cell, labelled with the matching parameter value
    ax.set_xticks(range(len(particle_counts)))
    ax.set_yticks(range(len(theta_crits)))
    ax.set_xticklabels([f"{N//1000}k" if N >= 1000 else str(N) for N in particle_counts])
    ax.set_yticklabels([f"{theta:.1f}" for theta in theta_crits])

    ax.set_xlabel("Particle Count")
    ax.set_ylabel("Theta Critical")
    ax.set_title(
        f"Dual Tree Traversal Performance\n(Colors: Relative to Reference, Text: Absolute Time in ms)\ncompression level = {compression_level} algorithm = {algname}",
        pad=20,
    )

    # Annotate every benchmarked cell; skipped cells (NaN time) stay empty.
    for i, j in np.ndindex(len(theta_crits), len(particle_counts)):
        value = results_data[i, j]
        if np.isnan(value):
            continue

        perf = relative_performance[i, j]
        mem_delta = results_max_mem_delta[i, j] / 1e6
        ax.text(
            j,
            i,
            f"{value:.2f}ms\n{perf:.2f}\n{mem_delta:.2f}MB",
            ha="center",
            va="center",
            color="black",
            fontweight="bold",
            fontsize=10,
        )

    # Colorbar for the relative performance
    cbar = plt.colorbar(im, ax=ax, shrink=0.8)
    cbar.set_label("Relative performance (time / reference time)")
    cbar.ax.tick_params(labelsize=10)

    # Keep only the candidate ticks that fall inside the color scale limits
    tick_positions = [0.1, 0.2, 0.5, 1.0, 2.0, 3.0]
    cbar.set_ticks([pos for pos in tick_positions if vmin <= pos <= vmax])

    plt.tight_layout()

    # Minor ticks on the cell borders give a grid that outlines each cell
    ax.set_xticks(np.arange(len(particle_counts)) - 0.5, minor=True)
    ax.set_yticks(np.arange(len(theta_crits)) - 0.5, minor=True)
    ax.grid(which="minor", color="black", linestyle="-", linewidth=1, alpha=0.3)

    return fig, ax

List the current implementation

impl_param(impl_name="scan_multipass", params="")

List all implementations available

[impl_param(impl_name="reference", params=""), impl_param(impl_name="parallel_select", params=""), impl_param(impl_name="scan_multipass", params="")]

Run the performance benchmarks for all implementations

# Sweep settings shared by every implementation. They are defined once, before
# the loop, so that they exist even if no implementation is listed
# (compression_level is also read by the plotting code further down).
compression_level = 4
# Passed to run_performance_sweep as the memory-delta limit above which larger
# particle counts are skipped.
threshold_run = 5e6

# Benchmark results of every implementation, keyed by "<impl_name> <params>".
results = {}

for default_impl in all_default_impls:
    shamrock.tree.set_impl_clbvh_dual_tree_traversal(default_impl.impl_name, default_impl.params)

    print(f"Running DTT performance benchmarks for {default_impl.impl_name}...")

    # Run the performance sweep
    particle_counts, theta_crits, results_mean, results_min, results_max, results_max_mem_delta = (
        run_performance_sweep(compression_level, threshold_run)
    )

    # The same label is used as the dictionary key and as the displayed name.
    impl_label = default_impl.impl_name + " " + default_impl.params
    results[impl_label] = {
        "particle_counts": particle_counts,
        "theta_crits": theta_crits,
        "results_mean": results_mean,
        "results_min": results_min,
        "results_max": results_max,
        "results_max_mem_delta": results_max_mem_delta,
        "name": impl_label,
    }
Info: setting dtt implementation to impl : reference                                 [tree][rank=0]
Running DTT performance benchmarks for reference...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.755ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.067ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.307ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=3.113ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=23.228ms (took 0.4s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=0.834ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=1.090ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.178ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=2.764ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=21.239ms (took 0.3s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=0.868ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.037ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.228ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=2.661ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=12.086ms (took 0.2s)
[26/50] Running N=59948, theta_crit=0.5... mean=73.430ms (took 1.1s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=0.995ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=0.888ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.064ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=2.019ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=7.198ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=33.120ms (took 0.7s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.002ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=1.205ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.053ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=1.762ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=4.814ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=19.865ms (took 0.5s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select                           [tree][rank=0]
Running DTT performance benchmarks for parallel_select...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.313ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.179ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.574ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=8.599ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=92.707ms (took 1.1s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=0.943ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=1.084ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.654ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=9.200ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=93.288ms (took 1.1s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=0.947ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.097ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.645ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=8.865ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=68.973ms (took 0.8s)
[26/50] Running N=59948, theta_crit=0.5... mean=515.644ms (took 5.5s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=0.966ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=1.039ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.647ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=8.317ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=44.213ms (took 0.6s)
[36/50] Running N=59948, theta_crit=0.7... mean=272.458ms (took 3.1s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=0.951ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=0.985ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.532ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=5.570ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=30.386ms (took 0.4s)
[46/50] Running N=59948, theta_crit=0.9... mean=166.187ms (took 2.0s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass                            [tree][rank=0]
Running DTT performance benchmarks for scan_multipass...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=3.548ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=5.249ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=7.061ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=10.032ms (took 0.1s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=3.792ms (took 0.1s)
[12/50] Running N=  359, theta_crit=0.3... mean=5.151ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=7.014ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=10.191ms (took 0.1s)
[15/50] Skipping N=16681, theta_crit=0.3
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=3.523ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=5.194ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=7.422ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=10.060ms (took 0.1s)
[25/50] Skipping N=16681, theta_crit=0.5
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=3.562ms (took 0.1s)
[32/50] Running N=  359, theta_crit=0.7... mean=5.142ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=7.078ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=9.638ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=15.659ms (took 0.3s)
[36/50] Skipping N=59948, theta_crit=0.7
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=3.928ms (took 0.1s)
[42/50] Running N=  359, theta_crit=0.9... mean=5.695ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=7.337ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=9.673ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=13.692ms (took 0.3s)
[46/50] Skipping N=59948, theta_crit=0.9
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9

Plot the performance benchmarks for all implementations

import os

dump_folder = "_to_trash"

# Only rank 0 creates the dump directory (no error if it already exists)
if shamrock.sys.world_rank() == 0:
    os.makedirs(dump_folder, exist_ok=True)

# Minimum times of the reference algorithm, used as the comparison baseline
# for every plot.
ref_key = "reference "
reference_min = results[ref_key]["results_min"]
largest_refalg_value = np.nanmax(reference_min)

# One figure per benchmarked implementation, saved under an increasing index
for plot_index, entry in enumerate(results.values()):
    fig, ax = create_checkerboard_plot(
        entry["particle_counts"],
        entry["theta_crits"],
        entry["results_min"],
        compression_level,
        entry["name"],
        largest_refalg_value,
        reference_min,
        entry["results_max_mem_delta"],
    )

    plt.savefig(f"{dump_folder}/benchmark-dtt-performance-{plot_index}.pdf")

plt.show()
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = reference
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = parallel_select
  • Dual Tree Traversal Performance (Colors: Relative to Reference, Text: Absolute Time in ms) compression level = 4 algorithm = scan_multipass

Total running time of the script: (0 minutes 24.713 seconds)

Estimated memory usage: 119 MB

Gallery generated by Sphinx-Gallery