Note
Go to the end to download the full example code.
DTT performance benchmarks
This example benchmarks the performance of the dual tree traversal (DTT) for the different algorithms available in Shamrock.
9 import random
10 import time
11
12 import matplotlib.colors as colors
13 import matplotlib.pyplot as plt
14 import numpy as np
15
16 import shamrock
17
# This script can be launched either by the python interpreter or by the shamrock
# executable. In the latter case the executable takes care of its own MPI-specific
# setup, so the system is only initialized here when nothing has done it yet.
shamrock_already_initialized = shamrock.sys.is_initialized()
if not shamrock_already_initialized:
    shamrock.change_loglevel(1)
    shamrock.sys.init("0:0")
Main benchmark functions
# Axis-aligned box spanning (0, 0, 0) to (1, 1, 1); the benchmarks generate their
# mock positions between its lower and upper corners and build the trees on it.
bounding_box = shamrock.math.AABB_f64_3(
    (0.0, 0.0, 0.0),
    (1.0, 1.0, 1.0),
)
29
30
def benchmark_dtt_core(N, theta_crit, compression_level, ordered_result, nb_repeat=10):
    """Run the dual tree traversal benchmark ``nb_repeat`` times on fresh random inputs.

    Every repetition generates a new mock position buffer of ``N`` entries between
    the corners of the module-level ``bounding_box``, rebuilds a
    ``CLBVH_u64_f64_3`` tree from it and benchmarks the traversal on that tree.

    Args:
        N: Number of positions requested from ``shamrock.algs.mock_buffer_f64_3``.
        theta_crit: Forwarded to ``benchmark_clbvh_dual_tree_traversal``.
        compression_level: Forwarded to ``tree.rebuild_from_positions``.
        ordered_result: Forwarded to ``benchmark_clbvh_dual_tree_traversal``.
        nb_repeat: Number of repetitions.

    Returns:
        A tuple ``(times, max_mem_delta)``. ``times`` holds one value per
        repetition: the benchmark's return value multiplied by 1000 (presumably
        seconds converted to milliseconds, since callers print it as ms).
        ``max_mem_delta`` is the largest growth of ``max_allocated_byte_device``
        observed across a single traversal call; it starts at 0 and is therefore
        never negative.
    """
    # A private generator seeded with the same constant produces the same sequence
    # of values on every call, without reseeding the interpreter-wide `random`
    # state that other code may rely on.
    rng = random.Random(111)

    times = []
    max_mem_delta = 0
    for _ in range(nb_repeat):
        # First argument looks like the seed of the mock buffer -- TODO confirm.
        positions = shamrock.algs.mock_buffer_f64_3(
            rng.randint(0, 1000000), N, bounding_box.lower, bounding_box.upper
        )
        tree = shamrock.tree.CLBVH_u64_f64_3()
        tree.rebuild_from_positions(positions, bounding_box, compression_level)

        # Reset the recorded maximum after the tree build so that the delta below
        # only reflects what happens during the traversal call.
        shamrock.backends.reset_mem_info_max()
        mem_info_before = shamrock.backends.get_mem_perf_info()
        times.append(
            shamrock.tree.benchmark_clbvh_dual_tree_traversal(tree, theta_crit, ordered_result)
            * 1000
        )
        mem_info_after = shamrock.backends.get_mem_perf_info()

        mem_delta = (
            mem_info_after.max_allocated_byte_device - mem_info_before.max_allocated_byte_device
        )
        max_mem_delta = max(max_mem_delta, mem_delta)
    return times, max_mem_delta
54
55
def benchmark_dtt(N, theta_crit, compression_level, ordered_result, nb_repeat=10):
    """Summarize the timings of ``benchmark_dtt_core`` for one parameter set.

    Args:
        N: Forwarded to ``benchmark_dtt_core``.
        theta_crit: Forwarded to ``benchmark_dtt_core``.
        compression_level: Forwarded to ``benchmark_dtt_core``.
        ordered_result: Forwarded to ``benchmark_dtt_core``.
        nb_repeat: Number of repetitions; must be at least 1.

    Returns:
        A tuple ``(min_time, max_time, mean_time, max_mem_delta)`` where the three
        times are the minimum, maximum and arithmetic mean of the per-repetition
        timings and ``max_mem_delta`` is passed through from the core benchmark.

    Raises:
        ValueError: If ``nb_repeat`` is smaller than 1, since no statistics can be
            computed without at least one timing.
    """
    # Fail early with an explicit message instead of running the core and then
    # hitting an opaque "min() arg is an empty sequence" error.
    if nb_repeat < 1:
        raise ValueError(f"nb_repeat must be at least 1, got {nb_repeat}")

    times, max_mem_delta = benchmark_dtt_core(
        N, theta_crit, compression_level, ordered_result, nb_repeat
    )
    return min(times), max(times), sum(times) / len(times), max_mem_delta
Run the performance test for all parameters
def run_performance_sweep(compression_level, threshold_run, ordered_result):
    """Benchmark the traversal over a grid of ``theta_crit`` values and particle counts.

    For every ``theta_crit`` the particle counts are visited in increasing order.
    As soon as one run reports a memory delta above ``threshold_run``, the
    remaining (larger) particle counts for that ``theta_crit`` are skipped.

    Args:
        compression_level: Forwarded to ``benchmark_dtt``.
        threshold_run: Memory delta above which the remaining particle counts of
            the current ``theta_crit`` row are skipped. It is compared against the
            ``max_mem_delta`` returned by ``benchmark_dtt``.
        ordered_result: Forwarded to ``benchmark_dtt``.

    Returns:
        A tuple ``(particle_counts, theta_crits, results_mean, results_min,
        results_max, results_max_mem_delta)``. The two first entries are lists;
        the four others are arrays of shape
        ``(len(theta_crits), len(particle_counts))`` in which skipped runs are NaN.
    """
    # Ten logarithmically spaced particle counts from 1e2 to 1e7, truncated to int
    particle_counts = np.logspace(2, 7, 10).astype(int).tolist()
    theta_crits = [0.1, 0.3, 0.5, 0.7, 0.9]

    # Every grid starts as NaN so that a skipped run is marked as "no data" in all
    # of them, including the memory grid (a 0 there would read as a measurement).
    grid_shape = (len(theta_crits), len(particle_counts))
    results_mean = np.full(grid_shape, np.nan)
    results_min = np.full(grid_shape, np.nan)
    results_max = np.full(grid_shape, np.nan)
    results_max_mem_delta = np.full(grid_shape, np.nan)

    print(f"Particle counts: {particle_counts}")
    print(f"Theta_crit values: {theta_crits}")
    print(f"Compression level: {compression_level}")

    total_runs = len(particle_counts) * len(theta_crits)
    current_run = 0

    for i, theta_crit in enumerate(theta_crits):
        exceed_mem = False
        for j, N in enumerate(particle_counts):
            current_run += 1

            if exceed_mem:
                print(
                    f"[{current_run:2d}/{total_runs}] Skipping N={N:5d}, theta_crit={theta_crit:.1f}"
                )
                continue

            print(
                f"[{current_run:2d}/{total_runs}] Running N={N:5d}, theta_crit={theta_crit:.1f}...",
                end=" ",
            )

            # perf_counter is monotonic, so the elapsed time cannot be distorted by
            # wall-clock adjustments.
            start_time = time.perf_counter()
            min_time, max_time, mean_time, max_mem_delta = benchmark_dtt(
                N, theta_crit, compression_level, ordered_result
            )
            elapsed = time.perf_counter() - start_time

            results_mean[i, j] = mean_time
            results_min[i, j] = min_time
            results_max[i, j] = max_time
            results_max_mem_delta[i, j] = max_mem_delta

            print(f"mean={mean_time:.3f}ms (took {elapsed:.1f}s)")

            # Larger particle counts are skipped for this theta_crit from now on.
            if max_mem_delta > threshold_run:
                exceed_mem = True

    return (
        particle_counts,
        theta_crits,
        results_mean,
        results_min,
        results_max,
        results_max_mem_delta,
    )
Create a checkerboard plot showing execution times and performance relative to the reference algorithm
def create_checkerboard_plot(
    particle_counts,
    theta_crits,
    results_data,
    compression_level,
    algname,
    max_axis_value,
    reference_data,
    results_max_mem_delta,
):
    """Draw a heatmap of timings relative to a reference, with per-cell annotations.

    Cell colors encode ``results_data / reference_data``; the text in each cell
    gives the absolute time, that ratio and the memory delta.

    Args:
        particle_counts: Particle counts, one per column (x axis).
        theta_crits: ``theta_crit`` values, one per row (y axis).
        results_data: 2D array of timings indexed as ``[theta index, count index]``;
            NaN entries are treated as skipped runs and get no annotation.
        compression_level: Only used in the plot title.
        algname: Algorithm name, only used in the plot title.
        max_axis_value: Accepted for interface compatibility; currently unused.
        reference_data: 2D array of reference timings, same layout as
            ``results_data``.
        results_max_mem_delta: 2D array of memory deltas, same layout as
            ``results_data``; divided by 1e6 and displayed with an "MB" suffix.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Ratio to the reference timings: > 1 means slower than the reference,
    # < 1 means faster.
    relative_performance = results_data / reference_data

    # Invalid ratios (e.g. from skipped benchmarks) are masked so that they are
    # drawn with the colormap's "bad" color.
    masked_relative = np.ma.masked_invalid(relative_performance)

    # RdYlGn_r runs from green at the low end to red at the high end, so cells that
    # are faster than the reference (< 1) are green and slower ones (> 1) are red.
    cmap = plt.cm.RdYlGn_r.copy()
    cmap.set_bad(color="white")

    # Color scale limits for the relative performance
    vmin = 0.5
    vmax = 1.5

    im = ax.imshow(
        masked_relative, cmap=cmap, aspect="auto", interpolation="nearest", vmin=vmin, vmax=vmax
    )

    # Ticks and tick labels
    ax.set_xticks(range(len(particle_counts)))
    ax.set_yticks(range(len(theta_crits)))
    ax.set_xticklabels([f"{N // 1000}k" if N >= 1000 else str(N) for N in particle_counts])
    ax.set_yticklabels([f"{theta:.1f}" for theta in theta_crits])

    # Axis labels and title
    ax.set_xlabel("Particle Count")
    ax.set_ylabel("Theta Critical")
    ax.set_title(
        f"Dual Tree Traversal Performance\n(Colors: Relative to Reference, Text: Absolute Time in ms)\ncompression level = {compression_level} algorithm = {algname}",
        pad=20,
    )

    # Annotate every cell that holds a measurement; skipped cells stay blank.
    for i, j in np.ndindex(len(theta_crits), len(particle_counts)):
        value = results_data[i, j]
        if np.isnan(value):
            continue

        perf = relative_performance[i, j]
        # A measurement without a matching reference has no meaningful ratio.
        perf_label = "n/a" if np.isnan(perf) else f"{perf:.2f}"
        mem_delta = results_max_mem_delta[i, j] / 1e6
        ax.text(
            j,
            i,
            f"{value:.2f}ms\n{perf_label}\n{mem_delta:.2f}MB",
            ha="center",
            va="center",
            color="black",
            fontweight="bold",
            fontsize=10,
        )

    # Colorbar for the relative performance
    cbar = fig.colorbar(im, ax=ax, shrink=0.8)
    cbar.set_label("Relative performance (time / reference time)")
    cbar.ax.tick_params(labelsize=10)

    # Only keep the candidate ticks that fall inside the color scale limits
    tick_positions = [0.1, 0.2, 0.5, 1.0, 2.0, 3.0]
    cbar.set_ticks([pos for pos in tick_positions if vmin <= pos <= vmax])

    fig.tight_layout()

    # Minor ticks on the cell boundaries carry the grid that separates the cells
    ax.set_xticks(np.arange(len(particle_counts)) - 0.5, minor=True)
    ax.set_yticks(np.arange(len(theta_crits)) - 0.5, minor=True)
    ax.grid(which="minor", color="black", linestyle="-", linewidth=1, alpha=0.3)

    return fig, ax
List current implementation
# Query the dual tree traversal implementation that is currently selected and
# display it.
current_impl = shamrock.tree.get_current_impl_clbvh_dual_tree_traversal_impl()
print(current_impl)
impl_param(impl_name="scan_multipass", params="")
List all implementations available
# Fetch the default list of dual tree traversal implementations and display it;
# the benchmark loop iterates over this list.
all_default_impls = shamrock.tree.get_default_impl_list_clbvh_dual_tree_traversal()
print(all_default_impls)
[impl_param(impl_name="reference", params=""), impl_param(impl_name="parallel_select", params=""), impl_param(impl_name="scan_multipass", params="")]
Run the performance benchmarks for all implementations
# Sweep results of every (implementation, ordering) combination, keyed by its label.
results = {}

# Sweep settings shared by all combinations. They are defined once, outside of the
# loops, so they exist even when the implementation list is empty.
compression_level = 4
threshold_run = 5e6

for ordered_result in [True, False]:
    for default_impl in all_default_impls:
        shamrock.tree.set_impl_clbvh_dual_tree_traversal(
            default_impl.impl_name, default_impl.params
        )

        # Join only the non-empty parts: with empty params this yields
        # "<impl_name> ordered=<bool>", and non-empty params stay separated from
        # the "ordered=" suffix by a space.
        n = " ".join(
            part
            for part in (
                default_impl.impl_name,
                default_impl.params,
                f"ordered={ordered_result}",
            )
            if part
        )

        print(f"Running DTT performance benchmarks for {n}...")

        # Run the performance sweep
        (
            particle_counts,
            theta_crits,
            results_mean,
            results_min,
            results_max,
            results_max_mem_delta,
        ) = run_performance_sweep(compression_level, threshold_run, ordered_result)

        results[n] = {
            "particle_counts": particle_counts,
            "theta_crits": theta_crits,
            "results_mean": results_mean,
            "results_min": results_min,
            "results_max": results_max,
            "results_max_mem_delta": results_max_mem_delta,
            "name": n,
        }
Info: setting dtt implementation to impl : reference [tree][rank=0]
Running DTT performance benchmarks for reference ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N= 100, theta_crit=0.1... mean=1.932ms (took 0.0s)
[ 2/50] Running N= 359, theta_crit=0.1... mean=1.746ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=2.041ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=6.314ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=65.664ms (took 0.8s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N= 100, theta_crit=0.3... mean=1.597ms (took 0.0s)
[12/50] Running N= 359, theta_crit=0.3... mean=1.577ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=2.256ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=4.909ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=21.251ms (took 0.3s)
[16/50] Running N=59948, theta_crit=0.3... mean=136.309ms (took 1.7s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N= 100, theta_crit=0.5... mean=1.545ms (took 0.0s)
[22/50] Running N= 359, theta_crit=0.5... mean=1.848ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=2.099ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=3.188ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=7.353ms (took 0.2s)
[26/50] Running N=59948, theta_crit=0.5... mean=29.837ms (took 0.6s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N= 100, theta_crit=0.7... mean=1.644ms (took 0.0s)
[32/50] Running N= 359, theta_crit=0.7... mean=1.779ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=2.047ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=2.586ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=4.932ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=15.845ms (took 0.5s)
[37/50] Running N=215443, theta_crit=0.7... mean=48.992ms (took 1.8s)
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N= 100, theta_crit=0.9... mean=1.781ms (took 0.0s)
[42/50] Running N= 359, theta_crit=0.9... mean=1.875ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=2.117ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=2.547ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=4.052ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=11.100ms (took 0.5s)
[47/50] Running N=215443, theta_crit=0.9... mean=32.452ms (took 1.6s)
[48/50] Running N=774263, theta_crit=0.9... mean=98.141ms (took 4.7s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select [tree][rank=0]
Running DTT performance benchmarks for parallel_select ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N= 100, theta_crit=0.1... mean=1.462ms (took 0.0s)
[ 2/50] Running N= 359, theta_crit=0.1... mean=1.418ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.994ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=9.715ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=107.787ms (took 1.2s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N= 100, theta_crit=0.3... mean=1.087ms (took 0.0s)
[12/50] Running N= 359, theta_crit=0.3... mean=1.133ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.796ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=8.095ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=57.722ms (took 0.7s)
[16/50] Running N=59948, theta_crit=0.3... mean=385.149ms (took 4.1s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N= 100, theta_crit=0.5... mean=1.102ms (took 0.0s)
[22/50] Running N= 359, theta_crit=0.5... mean=1.211ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.696ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=4.707ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=22.141ms (took 0.3s)
[26/50] Running N=59948, theta_crit=0.5... mean=120.527ms (took 1.5s)
[27/50] Running N=215443, theta_crit=0.5... mean=533.198ms (took 6.3s)
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N= 100, theta_crit=0.7... mean=1.078ms (took 0.0s)
[32/50] Running N= 359, theta_crit=0.7... mean=1.156ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.544ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=3.430ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=13.727ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=66.886ms (took 0.9s)
[37/50] Running N=215443, theta_crit=0.7... mean=302.737ms (took 4.0s)
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N= 100, theta_crit=0.9... mean=1.090ms (took 0.0s)
[42/50] Running N= 359, theta_crit=0.9... mean=1.123ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.476ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=2.823ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=7.842ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=34.199ms (took 0.6s)
[47/50] Running N=215443, theta_crit=0.9... mean=136.873ms (took 2.4s)
[48/50] Running N=774263, theta_crit=0.9... mean=587.120ms (took 9.6s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass [tree][rank=0]
Running DTT performance benchmarks for scan_multipass ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N= 100, theta_crit=0.1... mean=4.110ms (took 0.1s)
[ 2/50] Running N= 359, theta_crit=0.1... mean=5.529ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=7.305ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=11.929ms (took 0.2s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N= 100, theta_crit=0.3... mean=4.141ms (took 0.1s)
[12/50] Running N= 359, theta_crit=0.3... mean=5.461ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=7.668ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=11.270ms (took 0.2s)
[15/50] Running N=16681, theta_crit=0.3... mean=23.101ms (took 0.3s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N= 100, theta_crit=0.5... mean=4.015ms (took 0.1s)
[22/50] Running N= 359, theta_crit=0.5... mean=5.767ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=7.595ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=9.585ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=14.015ms (took 0.2s)
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N= 100, theta_crit=0.7... mean=4.193ms (took 0.1s)
[32/50] Running N= 359, theta_crit=0.7... mean=5.884ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=7.482ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=9.334ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=12.639ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=21.333ms (took 0.5s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N= 100, theta_crit=0.9... mean=4.247ms (took 0.1s)
[42/50] Running N= 359, theta_crit=0.9... mean=5.773ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=7.523ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=9.103ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=11.801ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=19.141ms (took 0.5s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : reference [tree][rank=0]
Running DTT performance benchmarks for reference ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N= 100, theta_crit=0.1... mean=1.043ms (took 0.0s)
[ 2/50] Running N= 359, theta_crit=0.1... mean=1.070ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.235ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=3.584ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=25.063ms (took 0.4s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N= 100, theta_crit=0.3... mean=0.823ms (took 0.0s)
[12/50] Running N= 359, theta_crit=0.3... mean=0.892ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.141ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=2.554ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=9.910ms (took 0.2s)
[16/50] Running N=59948, theta_crit=0.3... mean=55.253ms (took 0.8s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N= 100, theta_crit=0.5... mean=0.882ms (took 0.0s)
[22/50] Running N= 359, theta_crit=0.5... mean=0.922ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.086ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=1.619ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=3.931ms (took 0.1s)
[26/50] Running N=59948, theta_crit=0.5... mean=15.868ms (took 0.4s)
[27/50] Running N=215443, theta_crit=0.5... mean=51.470ms (took 1.5s)
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N= 100, theta_crit=0.7... mean=0.896ms (took 0.0s)
[32/50] Running N= 359, theta_crit=0.7... mean=0.938ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.019ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=1.369ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=2.671ms (took 0.1s)
[36/50] Running N=59948, theta_crit=0.7... mean=8.696ms (took 0.4s)
[37/50] Running N=215443, theta_crit=0.7... mean=27.995ms (took 1.3s)
[38/50] Running N=774263, theta_crit=0.7... mean=113.728ms (took 4.7s)
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N= 100, theta_crit=0.9... mean=0.910ms (took 0.0s)
[42/50] Running N= 359, theta_crit=0.9... mean=0.932ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.009ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=1.270ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=2.187ms (took 0.1s)
[46/50] Running N=59948, theta_crit=0.9... mean=5.734ms (took 0.3s)
[47/50] Running N=215443, theta_crit=0.9... mean=19.410ms (took 1.2s)
[48/50] Running N=774263, theta_crit=0.9... mean=69.711ms (took 4.4s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select [tree][rank=0]
Running DTT performance benchmarks for parallel_select ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N= 100, theta_crit=0.1... mean=1.071ms (took 0.0s)
[ 2/50] Running N= 359, theta_crit=0.1... mean=1.132ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.775ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=9.322ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=106.930ms (took 1.2s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N= 100, theta_crit=0.3... mean=1.056ms (took 0.0s)
[12/50] Running N= 359, theta_crit=0.3... mean=1.122ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.800ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=8.280ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=57.428ms (took 0.7s)
[16/50] Running N=59948, theta_crit=0.3... mean=382.652ms (took 4.1s)
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N= 100, theta_crit=0.5... mean=1.110ms (took 0.0s)
[22/50] Running N= 359, theta_crit=0.5... mean=1.147ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.775ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=4.864ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=23.152ms (took 0.3s)
[26/50] Running N=59948, theta_crit=0.5... mean=121.743ms (took 1.5s)
[27/50] Running N=215443, theta_crit=0.5... mean=542.229ms (took 6.4s)
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N= 100, theta_crit=0.7... mean=1.101ms (took 0.0s)
[32/50] Running N= 359, theta_crit=0.7... mean=1.180ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.761ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=3.511ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=14.598ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=71.153ms (took 1.0s)
[37/50] Running N=215443, theta_crit=0.7... mean=308.674ms (took 4.1s)
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N= 100, theta_crit=0.9... mean=1.060ms (took 0.0s)
[42/50] Running N= 359, theta_crit=0.9... mean=1.156ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.430ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=2.626ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=7.942ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=32.564ms (took 0.6s)
[47/50] Running N=215443, theta_crit=0.9... mean=135.414ms (took 2.3s)
[48/50] Running N=774263, theta_crit=0.9... mean=613.065ms (took 9.9s)
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass [tree][rank=0]
Running DTT performance benchmarks for scan_multipass ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N= 100, theta_crit=0.1... mean=3.490ms (took 0.1s)
[ 2/50] Running N= 359, theta_crit=0.1... mean=5.045ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=6.567ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=9.336ms (took 0.1s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N= 100, theta_crit=0.3... mean=3.451ms (took 0.1s)
[12/50] Running N= 359, theta_crit=0.3... mean=4.954ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=6.589ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=8.708ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=15.646ms (took 0.3s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N= 100, theta_crit=0.5... mean=3.468ms (took 0.1s)
[22/50] Running N= 359, theta_crit=0.5... mean=4.981ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=6.483ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=8.068ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=11.485ms (took 0.2s)
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N= 100, theta_crit=0.7... mean=3.524ms (took 0.1s)
[32/50] Running N= 359, theta_crit=0.7... mean=4.934ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=6.653ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=8.236ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=11.517ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=17.706ms (took 0.5s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N= 100, theta_crit=0.9... mean=3.511ms (took 0.1s)
[42/50] Running N= 359, theta_crit=0.9... mean=4.944ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=6.793ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=7.471ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=10.573ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=15.100ms (took 0.4s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Plot the performance benchmarks for all implementations
import os

dump_folder = "_to_trash"

# Only rank 0 creates the dump directory; nothing happens if it already exists.
if shamrock.sys.world_rank() == 0:
    os.makedirs(dump_folder, exist_ok=True)

# Every plot is colored relative to the minimum timings of this entry.
ref_key = "reference ordered=False"
reference_min = results[ref_key]["results_min"]
largest_refalg_value = np.nanmax(reference_min)

# One figure per benchmarked combination, numbered in insertion order.
for i, v in enumerate(results.values()):
    fig, ax = create_checkerboard_plot(
        v["particle_counts"],
        v["theta_crits"],
        v["results_min"],
        compression_level,
        v["name"],
        largest_refalg_value,
        reference_min,
        v["results_max_mem_delta"],
    )

    plt.savefig(f"{dump_folder}/benchmark-dtt-performance-{i}.pdf")

plt.show()
Total running time of the script: (1 minutes 48.346 seconds)
Estimated memory usage: 193 MB





