59 sycl::buffer<T> scan_inclusive(sycl::queue &q, sycl::buffer<T> &buf1,
u32 len);
62 void scan_exclusive_in_place(sycl::queue &q, sycl::buffer<T> &buf,
u32 len);
65 void scan_inclusive_in_place(sycl::queue &q, sycl::buffer<T> &buf,
u32 len);
76 sycl::queue &q, sycl::buffer<u32> &buf_flags,
u32 len);
130 template<
class Tret,
class T>
132 const sham::DeviceScheduler_ptr &sched,
140 const sham::DeviceScheduler_ptr &sched,
145 return device_histogram<u64, T>(sched, bin_edges, nbins, values, len);
150 const sham::DeviceScheduler_ptr &sched,
155 return device_histogram<u32, T>(sched, bin_edges, nbins, values, len);
175 const sham::DeviceScheduler_ptr &sched,
197 const T *__restrict bin_edges,
198 T *__restrict bins_center,
199 T *__restrict bins_width) {
200 bins_center[i] = (bin_edges[i] + bin_edges[i + 1]) / 2;
201 bins_width[i] = bin_edges[i + 1] - bin_edges[i];
204 return {std::move(counts), std::move(bins_center), std::move(bins_width)};
252 const sham::DeviceScheduler_ptr &sched,
295 template<
class Tret,
class T,
class Fct>
297 const sham::DeviceScheduler_ptr &sched,
306 auto &valid_values = bin_compute_in.valid_values;
307 auto &offsets_bins = bin_compute_in.offsets_bins;
320 const T *__restrict valid_values,
321 const u32 *__restrict offsets_bins,
322 Tret *__restrict bin_averages) {
323 u32 bin_start = offsets_bins[i];
324 u32 bin_end = offsets_bins[i + 1];
325 u32 bin_count = bin_end - bin_start;
327 auto for_each_values = [&](
auto func) {
328 for (
u32 j = bin_start; j < bin_end; j++) {
329 func(valid_values[j]);
333 bin_averages[i] = fct(for_each_values, bin_count);
367 const sham::DeviceScheduler_ptr &sched,
374 return binned_compute<T, T>(
375 sched, bin_edges, nbins, values, keys, len, [](
auto for_each_values,
u32 bin_count) {
377 for_each_values([&](T val) {
412 const sham::DeviceScheduler_ptr &sched,
419 return binned_compute<T, T>(
426 [](
auto for_each_values,
u32 bin_count) -> T {
428 for_each_values([&](T val) {
431 if (bin_count == 0) {
434 return sum / bin_count;
474 template<
class Tret,
class T>
476 const sham::DeviceScheduler_ptr &sched,
482 auto local_counts = device_histogram<Tret, T>(sched, bin_edges, nbins, values, len);
485 shamalgs::collective::reduce_buffer_in_place_sum(local_counts, MPI_COMM_WORLD);
522 const sham::DeviceScheduler_ptr &sched,
527 return device_histogram_mpi<u64, T>(sched, bin_edges, nbins, values, len);
562 const sham::DeviceScheduler_ptr &sched,
567 return device_histogram_mpi<u32, T>(sched, bin_edges, nbins, values, len);
607 const sham::DeviceScheduler_ptr &sched,
614 auto local_result =
binned_sum(sched, bin_edges, nbins, values, keys, len);
617 shamalgs::collective::reduce_buffer_in_place_sum(local_result, MPI_COMM_WORLD);
660 const sham::DeviceScheduler_ptr &sched,
668 auto bin_sums =
binned_sum_mpi(sched, bin_edges, nbins, values, keys, len);
675 [](
u32 i,
const u32 *__restrict bin_counts, T *__restrict bin_sums) {
676 u32 bin_count = bin_counts[i];
677 if (bin_count == 0) {
680 bin_sums[i] /= bin_count;
724 const sham::DeviceScheduler_ptr &sched,
754 const sham::DeviceScheduler_ptr &sched,
777 const T *__restrict bin_edges,
778 T *__restrict bins_center,
779 T *__restrict bins_width) {
780 bins_center[i] = (bin_edges[i] + bin_edges[i + 1]) / 2;
781 bins_width[i] = bin_edges[i + 1] - bin_edges[i];
784 return {std::move(counts), std::move(bins_center), std::move(bins_width)};
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
Shamrock assertion utility.
#define SHAM_ASSERT(x)
Shorthand for SHAM_ASSERT_NAMED without a message.
A buffer allocated in USM (Unified Shared Memory)
size_t get_size() const
Gets the number of elements in the buffer.
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
namespace containing the numeric algorithms of shamalgs
BinnedCompute< T > binned_init_compute(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, const sham::DeviceBuffer< T > &keys, u32 len)
Prepare binned data for per-bin computation.
sham::DeviceBuffer< u32 > device_histogram_u32_mpi(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, u32 len)
Compute the u32 histogram of values between bin_edges across all MPI ranks.
sham::DeviceBuffer< Tret > device_histogram(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, u32 len)
Compute the histogram of values between bin_edges.
sham::DeviceBuffer< Tret > device_histogram_mpi(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, u32 len)
Compute the histogram of values between bin_edges across all MPI ranks.
sham::DeviceBuffer< T > binned_sum(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, const sham::DeviceBuffer< T > &keys, u32 len)
Compute the sum of values in each bin.
sham::DeviceBuffer< T > binned_sum_mpi(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, const sham::DeviceBuffer< T > &keys, u32 len)
Compute the sum of values in each bin across all MPI ranks.
sham::DeviceBuffer< T > binned_average_mpi(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, const sham::DeviceBuffer< T > &keys, u32 len, const sham::DeviceBuffer< u32 > &bin_counts_global)
Compute the average of values in each bin across all MPI ranks (with pre-computed global counts).
sycl::buffer< T > scan_exclusive(sycl::queue &q, sycl::buffer< T > &buf1, u32 len)
Computes the exclusive sum of elements in a SYCL buffer.
std::tuple< std::optional< sycl::buffer< u32 > >, u32 > stream_compact(sycl::queue &q, sycl::buffer< u32 > &buf_flags, u32 len)
Stream compaction algorithm.
sham::DeviceBuffer< T > binned_average(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, const sham::DeviceBuffer< T > &keys, u32 len)
Compute the average of values in each bin.
histogram_result< T > device_histogram_full_mpi(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, u32 len)
Compute the histogram and bin properties (center, width) for a set of values and bin edges across all MPI ranks.
sham::DeviceBuffer< u64 > device_histogram_u64_mpi(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, u32 len)
Compute the u64 histogram of values between bin_edges across all MPI ranks.
sham::DeviceBuffer< Tret > binned_compute(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, const sham::DeviceBuffer< T > &keys, u32 len, Fct &&fct)
Perform a custom reduction or computation over values in each bin.
histogram_result< T > device_histogram_full(const sham::DeviceScheduler_ptr &sched, const sham::DeviceBuffer< T > &bin_edges, u64 nbins, const sham::DeviceBuffer< T > &values, u32 len)
Compute the histogram and bin properties (center, width) for a set of values and bin edges.
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_error...
A class that references multiple buffers or similar objects.
Structure holding the result of binning values for further computation.
sham::DeviceBuffer< u32 > offsets_bins
Offsets for each bin (size nbins+1).
sham::DeviceBuffer< T > valid_values
Values that are within the bin range, sorted by bin.