DTT performance benchmarks

This example benchmarks the performance of the dual tree traversal (DTT) for the different algorithms available in Shamrock.

 import random
 import time

 import matplotlib.colors as colors
 import matplotlib.pyplot as plt
 import numpy as np

 import shamrock

 # If we use the shamrock executable to run this script instead of the python interpreter,
 # we should not initialize the system as the shamrock executable needs to handle specific MPI logic.
 # Hence initialization is only performed here when it has not already been done.
 if not shamrock.sys.is_initialized():
     shamrock.change_loglevel(1)
     shamrock.sys.init("0:0")

Main benchmark functions

 # Box spanning (0, 0, 0) to (1, 1, 1); its lower/upper corners bound the mock positions
 # generated in benchmark_dtt_core and the box itself is passed to the tree rebuild.
 bounding_box = shamrock.math.AABB_f64_3((0.0, 0.0, 0.0), (1.0, 1.0, 1.0))


 def benchmark_dtt_core(N, theta_crit, compression_level, ordered_result, nb_repeat=10):
     """Run the dual tree traversal benchmark ``nb_repeat`` times on fresh mock data.

     Each repetition draws a new mock position buffer of ``N`` entries inside the
     module-level ``bounding_box``, rebuilds a tree from it and benchmarks the traversal.

     Args:
         N: number of positions generated for each repetition.
         theta_crit: opening criterion forwarded to the traversal benchmark.
         compression_level: forwarded to ``tree.rebuild_from_positions``.
         ordered_result: forwarded to the traversal benchmark.
         nb_repeat: number of repetitions.

     Returns:
         A tuple ``(times, max_mem_delta)`` where ``times`` lists the benchmark result of
         each repetition multiplied by 1000 (callers print it as milliseconds), and
         ``max_mem_delta`` is the largest increase of ``max_allocated_byte_device`` seen
         across a single traversal (never below 0).
     """
     # Fixed seed: every call draws the same sequence of buffer seeds.
     random.seed(111)

     timings = []
     peak_mem_delta = 0
     for _ in range(nb_repeat):
         buffer_seed = random.randint(0, 1000000)
         positions = shamrock.algs.mock_buffer_f64_3(
             buffer_seed, N, bounding_box.lower, bounding_box.upper
         )

         tree = shamrock.tree.CLBVH_u64_f64_3()
         tree.rebuild_from_positions(positions, bounding_box, compression_level)

         # Reset the recorded maximum so the delta below only reflects the traversal itself.
         shamrock.backends.reset_mem_info_max()
         mem_before = shamrock.backends.get_mem_perf_info()
         duration = shamrock.tree.benchmark_clbvh_dual_tree_traversal(
             tree, theta_crit, ordered_result
         )
         timings.append(duration * 1000)
         mem_after = shamrock.backends.get_mem_perf_info()

         growth = mem_after.max_allocated_byte_device - mem_before.max_allocated_byte_device
         if growth > peak_mem_delta:
             peak_mem_delta = growth

     return timings, peak_mem_delta


 def benchmark_dtt(N, theta_crit, compression_level, ordered_result, nb_repeat=10):
     """Summarize the timings produced by ``benchmark_dtt_core``.

     All arguments are forwarded unchanged to ``benchmark_dtt_core``.

     Returns:
         A tuple ``(min_time, max_time, mean_time, max_mem_delta)``; the mean is the sum
         of the timings divided by ``nb_repeat``.
     """
     timings, mem_delta = benchmark_dtt_core(
         N, theta_crit, compression_level, ordered_result, nb_repeat
     )
     fastest = min(timings)
     slowest = max(timings)
     average = sum(timings) / nb_repeat
     return fastest, slowest, average, mem_delta

Run the performance test for all parameters

 def run_performance_sweep(compression_level, threshold_run, ordered_result):
     """Benchmark the traversal over a grid of (theta_crit, particle count) values.

     For every theta_crit the particle counts are visited in increasing order. Once a
     run reports a memory delta above ``threshold_run``, the remaining (larger) particle
     counts of that theta_crit are skipped. Skipped cells hold NaN in *all* result
     matrices, including the memory one, so they cannot be mistaken for measurements.

     Args:
         compression_level: forwarded to ``benchmark_dtt``.
         threshold_run: memory delta (same unit as the value returned by
             ``benchmark_dtt``) above which larger particle counts are skipped.
         ordered_result: forwarded to ``benchmark_dtt``.

     Returns:
         A tuple ``(particle_counts, theta_crits, results_mean, results_min,
         results_max, results_max_mem_delta)``. The two first entries are lists, the
         others are arrays of shape ``(len(theta_crits), len(particle_counts))``.
     """

     # Define parameter ranges: 10 log-spaced particle counts between 1e2 and 1e7
     particle_counts = np.logspace(2, 7, 10).astype(int).tolist()
     theta_crits = [0.1, 0.3, 0.5, 0.7, 0.9]

     # Result matrices start as NaN: any cell that is never benchmarked stays NaN
     shape = (len(theta_crits), len(particle_counts))
     results_mean = np.full(shape, np.nan)
     results_min = np.full(shape, np.nan)
     results_max = np.full(shape, np.nan)
     results_max_mem_delta = np.full(shape, np.nan)

     print(f"Particle counts: {particle_counts}")
     print(f"Theta_crit values: {theta_crits}")
     print(f"Compression level: {compression_level}")

     total_runs = len(particle_counts) * len(theta_crits)
     current_run = 0

     for i, theta_crit in enumerate(theta_crits):
         # The memory guard is reset for each theta_crit row
         exceed_mem = False
         for j, N in enumerate(particle_counts):
             current_run += 1

             if exceed_mem:
                 print(
                     f"[{current_run:2d}/{total_runs}] Skipping N={N:5d}, theta_crit={theta_crit:.1f}"
                 )
                 continue

             print(
                 f"[{current_run:2d}/{total_runs}] Running N={N:5d}, theta_crit={theta_crit:.1f}...",
                 end=" ",
             )

             start_time = time.time()
             min_time, max_time, mean_time, max_mem_delta = benchmark_dtt(
                 N, theta_crit, compression_level, ordered_result
             )
             # Wall-clock time of all repetitions, including data generation and tree build
             elapsed = time.time() - start_time

             results_mean[i, j] = mean_time
             results_min[i, j] = min_time
             results_max[i, j] = max_time
             results_max_mem_delta[i, j] = max_mem_delta

             print(f"mean={mean_time:.3f}ms (took {elapsed:.1f}s)")

             # This run is still recorded; only the following, larger ones are skipped
             if max_mem_delta > threshold_run:
                 exceed_mem = True

     return (
         particle_counts,
         theta_crits,
         results_mean,
         results_min,
         results_max,
         results_max_mem_delta,
     )

Create checkerboard plot with execution times and relative performance to reference algorithm

 def create_checkerboard_plot(
     particle_counts,
     theta_crits,
     results_data,
     compression_level,
     algname,
     max_axis_value,
     reference_data,
     results_max_mem_delta,
 ):
     """Draw a heatmap of timings relative to a reference, annotated with absolute values.

     Cell colors encode ``results_data / reference_data``; cells where this ratio is not
     finite (e.g. a skipped benchmark stored as NaN) are drawn in white. Each cell whose
     ``results_data`` entry is not NaN is annotated with the absolute time, the ratio and
     the memory delta divided by 1e6.

     Args:
         particle_counts: particle counts, one per column.
         theta_crits: theta_crit values, one per row.
         results_data: 2D array of timings indexed as ``[theta index, particle index]``.
         compression_level: value shown in the title.
         algname: algorithm name shown in the title.
         max_axis_value: currently unused; kept so existing callers keep working.
         reference_data: 2D array of reference timings, same layout as ``results_data``.
         results_max_mem_delta: 2D array of memory deltas, same layout as ``results_data``.

     Returns:
         The ``(fig, ax)`` pair of the created figure.
     """

     fig, ax = plt.subplots(figsize=(12, 8))

     # Ratio to the reference algorithm (>1 means slower, <1 means faster)
     relative_performance = results_data / reference_data

     # Mask non-finite ratios (skipped benchmarks) so that they are drawn with the "bad" color
     masked_relative = np.ma.masked_invalid(relative_performance)

     # RdYlGn_r is the reversed red-yellow-green colormap:
     # low ratios (<1, faster than the reference) are green, high ratios (>1, slower) are red
     cmap = plt.cm.RdYlGn_r.copy()
     cmap.set_bad(color="white")  # masked cells are white

     # Color scale limits for the ratio; values outside are clipped to the end colors
     vmin = 0.5
     vmax = 1.5

     im = ax.imshow(
         masked_relative, cmap=cmap, aspect="auto", interpolation="nearest", vmin=vmin, vmax=vmax
     )

     # Set ticks and labels
     ax.set_xticks(range(len(particle_counts)))
     ax.set_yticks(range(len(theta_crits)))
     ax.set_xticklabels([f"{N//1000}k" if N >= 1000 else str(N) for N in particle_counts])
     ax.set_yticklabels([f"{theta:.1f}" for theta in theta_crits])

     # Add labels
     ax.set_xlabel("Particle Count")
     ax.set_ylabel("Theta Critical")
     ax.set_title(
         f"Dual Tree Traversal Performance\n(Colors: Relative to Reference, Text: Absolute Time in ms)\ncompression level = {compression_level} algorithm = {algname}",
         pad=20,
     )

     # Annotate every benchmarked cell; skipped ones (NaN time) are left blank
     for i, j in np.ndindex(len(theta_crits), len(particle_counts)):
         value = results_data[i, j]
         if np.isnan(value):
             continue

         perf = relative_performance[i, j]
         mem_delta = results_max_mem_delta[i, j] / 1e6
         ax.text(
             j,
             i,
             f"{value:.2f}ms\n{perf:.2f}\n{mem_delta:.2f}MB",
             ha="center",
             va="center",
             color="black",
             fontweight="bold",
             fontsize=10,
         )

     # Add colorbar for relative performance
     cbar = plt.colorbar(im, ax=ax, shrink=0.8)
     cbar.set_label("Relative performance (time / reference time)")
     cbar.ax.tick_params(labelsize=10)

     # Only the candidate ticks lying inside [vmin, vmax] are kept
     tick_positions = [0.1, 0.2, 0.5, 1.0, 2.0, 3.0]
     cbar.set_ticks([pos for pos in tick_positions if vmin <= pos <= vmax])

     # Improve layout
     plt.tight_layout()

     # Minor ticks on the cell boundaries give a grid that separates the cells
     ax.set_xticks(np.arange(len(particle_counts)) - 0.5, minor=True)
     ax.set_yticks(np.arange(len(theta_crits)) - 0.5, minor=True)
     ax.grid(which="minor", color="black", linestyle="-", linewidth=1, alpha=0.3)

     return fig, ax

List current implementation

 # Query the dual tree traversal implementation that is currently selected and display it
 current_impl = shamrock.tree.get_current_impl_clbvh_dual_tree_traversal_impl()

 print(current_impl)

impl_param(impl_name="scan_multipass", params="")

List all implementations available

 # Fetch the list of default implementations; each entry exposes `impl_name` and `params`,
 # which the benchmark loop passes to set_impl_clbvh_dual_tree_traversal
 all_default_impls = shamrock.tree.get_default_impl_list_clbvh_dual_tree_traversal()

 print(all_default_impls)

[impl_param(impl_name="reference", params=""), impl_param(impl_name="parallel_select", params=""), impl_param(impl_name="scan_multipass", params="")]

Run the performance benchmarks for all implementations

 # Benchmark results, keyed by a label identifying the implementation and the ordering flag
 results = {}

 # Settings shared by every sweep (compression_level is also reused when plotting)
 compression_level = 4
 threshold_run = 5e6

 # Names of the entries returned by run_performance_sweep, in order
 sweep_fields = (
     "particle_counts",
     "theta_crits",
     "results_mean",
     "results_min",
     "results_max",
     "results_max_mem_delta",
 )

 for ordered_result in [True, False]:
     for default_impl in all_default_impls:
         shamrock.tree.set_impl_clbvh_dual_tree_traversal(
             default_impl.impl_name, default_impl.params
         )

         # Join only the non-empty parts so that the label stays well separated when
         # params is non-empty, and reads e.g. "reference ordered=False" when it is empty
         label_parts = (default_impl.impl_name, default_impl.params, f"ordered={ordered_result}")
         n = " ".join(part for part in label_parts if part)

         print(f"Running DTT performance benchmarks for {n}...")

         # Run the performance sweep
         sweep = run_performance_sweep(compression_level, threshold_run, ordered_result)

         results[n] = dict(zip(sweep_fields, sweep))
         results[n]["name"] = n

Info: setting dtt implementation to impl : reference                                 [tree][rank=0]
Running DTT performance benchmarks for reference ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=2.837ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=3.021ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=2.907ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=7.999ms (took 0.2s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=70.090ms (took 0.9s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=2.334ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=2.310ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=2.392ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=7.305ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=65.593ms (took 0.9s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=2.975ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=3.015ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=2.880ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=8.313ms (took 0.2s)
[25/50] Running N=16681, theta_crit=0.5... mean=35.914ms (took 0.6s)
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=2.704ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=2.750ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=3.159ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=6.134ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=20.831ms (took 0.4s)
[36/50] Running N=59948, theta_crit=0.7... mean=101.582ms (took 1.7s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=2.630ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=2.985ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=2.930ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=5.141ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=13.177ms (took 0.3s)
[46/50] Running N=59948, theta_crit=0.9... mean=55.167ms (took 1.2s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select                           [tree][rank=0]
Running DTT performance benchmarks for parallel_select ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=2.910ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=3.005ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=3.791ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=8.768ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=91.683ms (took 1.0s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=1.189ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=1.144ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.700ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=8.341ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=91.910ms (took 1.0s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=1.233ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.196ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.753ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=8.603ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=67.419ms (took 0.8s)
[26/50] Running N=59948, theta_crit=0.5... mean=500.056ms (took 5.4s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=1.152ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=1.226ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.674ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=7.002ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=44.220ms (took 0.6s)
[36/50] Running N=59948, theta_crit=0.7... mean=268.061ms (took 3.0s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.157ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=1.150ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.790ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=5.532ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=29.669ms (took 0.4s)
[46/50] Running N=59948, theta_crit=0.9... mean=164.409ms (took 2.0s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass                            [tree][rank=0]
Running DTT performance benchmarks for scan_multipass ordered=True...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=7.451ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=10.746ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=12.040ms (took 0.2s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=20.062ms (took 0.3s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=7.885ms (took 0.1s)
[12/50] Running N=  359, theta_crit=0.3... mean=10.518ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=11.464ms (took 0.2s)
[14/50] Running N= 4641, theta_crit=0.3... mean=16.970ms (took 0.2s)
[15/50] Skipping N=16681, theta_crit=0.3
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=7.022ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=9.089ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=12.290ms (took 0.2s)
[24/50] Running N= 4641, theta_crit=0.5... mean=17.686ms (took 0.3s)
[25/50] Skipping N=16681, theta_crit=0.5
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=6.441ms (took 0.1s)
[32/50] Running N=  359, theta_crit=0.7... mean=9.332ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=12.129ms (took 0.2s)
[34/50] Running N= 4641, theta_crit=0.7... mean=15.942ms (took 0.2s)
[35/50] Running N=16681, theta_crit=0.7... mean=29.906ms (took 0.5s)
[36/50] Skipping N=59948, theta_crit=0.7
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=6.707ms (took 0.1s)
[42/50] Running N=  359, theta_crit=0.9... mean=9.572ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=12.241ms (took 0.2s)
[44/50] Running N= 4641, theta_crit=0.9... mean=15.889ms (took 0.2s)
[45/50] Running N=16681, theta_crit=0.9... mean=24.745ms (took 0.5s)
[46/50] Skipping N=59948, theta_crit=0.9
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : reference                                 [tree][rank=0]
Running DTT performance benchmarks for reference ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.711ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.757ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.969ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=3.476ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=20.105ms (took 0.3s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=0.934ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=0.927ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.076ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=2.568ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=20.237ms (took 0.3s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=1.051ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=0.936ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.113ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=2.618ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=11.900ms (took 0.2s)
[26/50] Running N=59948, theta_crit=0.5... mean=68.015ms (took 1.0s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=1.038ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=1.041ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.219ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=2.199ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=7.123ms (took 0.2s)
[36/50] Running N=59948, theta_crit=0.7... mean=32.289ms (took 0.7s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.015ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=1.092ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.113ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=1.987ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=4.980ms (took 0.2s)
[46/50] Running N=59948, theta_crit=0.9... mean=19.283ms (took 0.5s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : parallel_select                           [tree][rank=0]
Running DTT performance benchmarks for parallel_select ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=1.259ms (took 0.0s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=1.697ms (took 0.0s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=1.717ms (took 0.0s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=8.387ms (took 0.1s)
[ 5/50] Running N=16681, theta_crit=0.1... mean=91.531ms (took 1.0s)
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=1.256ms (took 0.0s)
[12/50] Running N=  359, theta_crit=0.3... mean=1.237ms (took 0.0s)
[13/50] Running N= 1291, theta_crit=0.3... mean=1.786ms (took 0.0s)
[14/50] Running N= 4641, theta_crit=0.3... mean=8.457ms (took 0.1s)
[15/50] Running N=16681, theta_crit=0.3... mean=91.832ms (took 1.0s)
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=1.126ms (took 0.0s)
[22/50] Running N=  359, theta_crit=0.5... mean=1.377ms (took 0.0s)
[23/50] Running N= 1291, theta_crit=0.5... mean=1.724ms (took 0.0s)
[24/50] Running N= 4641, theta_crit=0.5... mean=8.867ms (took 0.1s)
[25/50] Running N=16681, theta_crit=0.5... mean=67.417ms (took 0.8s)
[26/50] Running N=59948, theta_crit=0.5... mean=506.486ms (took 5.4s)
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=1.129ms (took 0.0s)
[32/50] Running N=  359, theta_crit=0.7... mean=1.151ms (took 0.0s)
[33/50] Running N= 1291, theta_crit=0.7... mean=1.715ms (took 0.0s)
[34/50] Running N= 4641, theta_crit=0.7... mean=7.174ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=44.306ms (took 0.6s)
[36/50] Running N=59948, theta_crit=0.7... mean=270.903ms (took 3.1s)
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=1.118ms (took 0.0s)
[42/50] Running N=  359, theta_crit=0.9... mean=1.212ms (took 0.0s)
[43/50] Running N= 1291, theta_crit=0.9... mean=1.815ms (took 0.0s)
[44/50] Running N= 4641, theta_crit=0.9... mean=5.900ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=30.747ms (took 0.4s)
[46/50] Running N=59948, theta_crit=0.9... mean=165.754ms (took 2.0s)
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9
Info: setting dtt implementation to impl : scan_multipass                            [tree][rank=0]
Running DTT performance benchmarks for scan_multipass ordered=False...
Particle counts: [100, 359, 1291, 4641, 16681, 59948, 215443, 774263, 2782559, 10000000]
Theta_crit values: [0.1, 0.3, 0.5, 0.7, 0.9]
Compression level: 4
[ 1/50] Running N=  100, theta_crit=0.1... mean=3.876ms (took 0.1s)
[ 2/50] Running N=  359, theta_crit=0.1... mean=5.823ms (took 0.1s)
[ 3/50] Running N= 1291, theta_crit=0.1... mean=7.615ms (took 0.1s)
[ 4/50] Running N= 4641, theta_crit=0.1... mean=11.487ms (took 0.2s)
[ 5/50] Skipping N=16681, theta_crit=0.1
[ 6/50] Skipping N=59948, theta_crit=0.1
[ 7/50] Skipping N=215443, theta_crit=0.1
[ 8/50] Skipping N=774263, theta_crit=0.1
[ 9/50] Skipping N=2782559, theta_crit=0.1
[10/50] Skipping N=10000000, theta_crit=0.1
[11/50] Running N=  100, theta_crit=0.3... mean=4.164ms (took 0.1s)
[12/50] Running N=  359, theta_crit=0.3... mean=5.817ms (took 0.1s)
[13/50] Running N= 1291, theta_crit=0.3... mean=7.348ms (took 0.1s)
[14/50] Running N= 4641, theta_crit=0.3... mean=10.429ms (took 0.1s)
[15/50] Skipping N=16681, theta_crit=0.3
[16/50] Skipping N=59948, theta_crit=0.3
[17/50] Skipping N=215443, theta_crit=0.3
[18/50] Skipping N=774263, theta_crit=0.3
[19/50] Skipping N=2782559, theta_crit=0.3
[20/50] Skipping N=10000000, theta_crit=0.3
[21/50] Running N=  100, theta_crit=0.5... mean=4.062ms (took 0.1s)
[22/50] Running N=  359, theta_crit=0.5... mean=5.822ms (took 0.1s)
[23/50] Running N= 1291, theta_crit=0.5... mean=7.607ms (took 0.1s)
[24/50] Running N= 4641, theta_crit=0.5... mean=10.795ms (took 0.2s)
[25/50] Skipping N=16681, theta_crit=0.5
[26/50] Skipping N=59948, theta_crit=0.5
[27/50] Skipping N=215443, theta_crit=0.5
[28/50] Skipping N=774263, theta_crit=0.5
[29/50] Skipping N=2782559, theta_crit=0.5
[30/50] Skipping N=10000000, theta_crit=0.5
[31/50] Running N=  100, theta_crit=0.7... mean=4.056ms (took 0.1s)
[32/50] Running N=  359, theta_crit=0.7... mean=5.557ms (took 0.1s)
[33/50] Running N= 1291, theta_crit=0.7... mean=7.164ms (took 0.1s)
[34/50] Running N= 4641, theta_crit=0.7... mean=9.930ms (took 0.1s)
[35/50] Running N=16681, theta_crit=0.7... mean=15.348ms (took 0.3s)
[36/50] Skipping N=59948, theta_crit=0.7
[37/50] Skipping N=215443, theta_crit=0.7
[38/50] Skipping N=774263, theta_crit=0.7
[39/50] Skipping N=2782559, theta_crit=0.7
[40/50] Skipping N=10000000, theta_crit=0.7
[41/50] Running N=  100, theta_crit=0.9... mean=3.768ms (took 0.1s)
[42/50] Running N=  359, theta_crit=0.9... mean=5.386ms (took 0.1s)
[43/50] Running N= 1291, theta_crit=0.9... mean=7.344ms (took 0.1s)
[44/50] Running N= 4641, theta_crit=0.9... mean=10.153ms (took 0.1s)
[45/50] Running N=16681, theta_crit=0.9... mean=14.276ms (took 0.3s)
[46/50] Skipping N=59948, theta_crit=0.9
[47/50] Skipping N=215443, theta_crit=0.9
[48/50] Skipping N=774263, theta_crit=0.9
[49/50] Skipping N=2782559, theta_crit=0.9
[50/50] Skipping N=10000000, theta_crit=0.9

Plot the performance benchmarks for all implementations

 dump_folder = "_to_trash"

 import os

 # Create the dump directory if it does not exist (done by rank 0 only)
 if shamrock.sys.world_rank() == 0:
     os.makedirs(dump_folder, exist_ok=True)

 # Every plot is colored relative to the minimum timings of this run
 ref_key = "reference ordered=False"
 reference_min = results[ref_key]["results_min"]
 largest_refalg_value = np.nanmax(reference_min)

 # One figure per benchmarked configuration, numbered in insertion order of `results`
 for i, v in enumerate(results.values()):

     # Create the plot for this algorithm, using its minimum timings
     fig, ax = create_checkerboard_plot(
         v["particle_counts"],
         v["theta_crits"],
         v["results_min"],
         compression_level,
         v["name"],
         largest_refalg_value,
         reference_min,
         v["results_max_mem_delta"],
     )

     # Save through the figure handle rather than relying on the implicit current figure
     fig.savefig(f"{dump_folder}/benchmark-dtt-performance-{i}.pdf")

 plt.show()