enum class histo_impl { reference, naive_gpu, gpu_team_fetching, gpu_oversubscribe };

bool was_init   = false;
histo_impl impl = histo_impl::reference;
virtual std::string impl_get_alg_name() const { return "compute_histogram"; }
virtual bool impl_was_configured(const sham::DeviceScheduler_ptr &) const {
    return was_init;
}
virtual std::string impl_get_config(const sham::DeviceScheduler_ptr &) const {
    switch (impl) {
    case histo_impl::reference: return "reference";
    case histo_impl::naive_gpu: return "naive_gpu";
    case histo_impl::gpu_team_fetching: return "gpu_team_fetching";
    case histo_impl::gpu_oversubscribe: return "gpu_oversubscribe";
    }
    // ...
}
virtual std::string impl_get_default_config(const sham::DeviceScheduler_ptr &dev_sched) const {
    if (dev_sched->ctx->device->prop.type == sham::DeviceType::GPU) {
        return "gpu_oversubscribe";
    }
    // ... (non-GPU default elided)
}
virtual void impl_set_config(const sham::DeviceScheduler_ptr &, const std::string &config) {
    if (config == "reference") {
        impl = histo_impl::reference;
    } else if (config == "naive_gpu") {
        impl = histo_impl::naive_gpu;
    } else if (config == "gpu_team_fetching") {
        impl = histo_impl::gpu_team_fetching;
    } else if (config == "gpu_oversubscribe") {
        impl = histo_impl::gpu_oversubscribe;
    }
    // ... (unknown config handling elided)
}
virtual std::vector<std::string> impl_get_avail_configs(const sham::DeviceScheduler_ptr &) {
    return {"reference", "naive_gpu", "gpu_team_fetching", "gpu_oversubscribe"};
}
histo_impl get_impl(const sham::DeviceScheduler_ptr &dev_sched) {
    this->ensure_init(dev_sched);
    return impl;
}
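// A minimal usage sketch of the selector interface above, assuming the
// enclosing object is exposed as `histo_selector` (hypothetical name, not
// taken from this file):
//
//     histo_selector.impl_set_config(dev_sched, "gpu_team_fetching");
//     histo_impl chosen = histo_selector.get_impl(dev_sched);
//     // dispatch on `chosen` to one of the compute_histogram_* functions below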
template<class T, class Tbins, class... Targs, class Tfunctor>
inline void compute_histogram_reference(
    /* ... */
    size_t element_count,
    /* ... */) {

    // ...

    auto cpu_basic_impl = [&](const std::vector<Tbins> &bin_edge_inf,
                              const std::vector<Tbins> &bin_edge_sup,
                              const std::vector<Targs> &...in_data,
                              std::vector<T> &result) {
        for (size_t ibin = 0; ibin < nbins; ibin++) {
            Tbins edge_inf = bin_edge_inf[ibin];
            Tbins edge_sup = bin_edge_sup[ibin];
            // ... (per-bin accumulator initialisation elided)
            for (size_t i = 0; i < element_count; i++) {
                bool has_value = false;
                auto tmp = functor(edge_inf, edge_sup, in_data[i]..., has_value);
                // ... (accumulation elided)
            }
            result[ibin] = accumulator;
        }
    };

    // run the host implementation on std::vector copies of the device buffers
    cpu_basic_impl(
        bin_edge_inf.copy_to_stdvec(),
        bin_edge_sup.copy_to_stdvec(),
        // ... (remaining arguments elided)
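// A sketch of the functor contract used above: the functor receives the two
// bin edges and one element of each input array, sets `has_value` when the
// element contributes to the bin, and returns the value to accumulate. For a
// plain counting histogram over a single double array (hypothetical example,
// assuming half-open bins [edge_inf, edge_sup)):
//
//     auto count_in_bin =
//         [](double edge_inf, double edge_sup, double x, bool &has_value) -> u64 {
//             has_value = (x >= edge_inf) && (x < edge_sup);
//             return has_value ? u64{1} : u64{0};
//         };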
template<class T, class Tbins, class... Targs, class Tfunctor>
inline void compute_histogram_naive_gpu(
    const sham::DeviceScheduler_ptr &dev_sched,
    /* ... */
    size_t element_count,
    /* ... */) {

    // ... (kernel launch call elided)
        dev_sched->get_queue(),
        // ...
        [element_count, functor](
            // ... (bin index parameter `ibin` elided)
            const Tbins *__restrict bin_edge_inf,
            const Tbins *__restrict bin_edge_sup,
            const Targs *__restrict... in_data,
            T *__restrict result) {
            Tbins edge_inf = bin_edge_inf[ibin];
            Tbins edge_sup = bin_edge_sup[ibin];
            // ... (accumulator initialisation elided)
            for (size_t i = 0; i < element_count; i++) {
                bool has_value = false;
                T tmp = functor(edge_inf, edge_sup, in_data[i]..., has_value);
                // ... (accumulation elided)
            }
            result[ibin] = accumulator;
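// Mapping used by this naive variant, as far as the visible lines go: each
// work-item owns one bin (the elided lambda parameter supplies `ibin`) and
// scans the full element range on its own, so the input arrays are re-read
// once per bin. Simple, but bandwidth-heavy for large bin counts; the
// variants below amortize those reads.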
template<class T, class Tbins, class... Targs, class Tfunctor>
inline void compute_histogram_gpu_team_fetching(
    const sham::DeviceScheduler_ptr &dev_sched,
    /* ... */
    size_t element_count,
    /* ... */) {

    // ...

    sham::kernel_call_hndl(
        dev_sched->get_queue(),
        // ...
        [element_count, functor](
            // ...
            const Tbins *__restrict bin_edge_inf,
            const Tbins *__restrict bin_edge_sup,
            const Targs *__restrict... in_data,
            T *__restrict result) {
            return [=, in_data = std::tuple{in_data...}](sycl::handler &cgh) {
                u32 group_size = 128;
                u32 group_cnt  = shambase::group_count(nbins, group_size);
                // ...
                group_cnt = (group_cnt + 3) / 4 * 4; // round the group count up to a multiple of 4
                u32 corrected_len = group_cnt * group_size;
                // ... (local tile shared by the work-group)
                    = sycl::local_accessor<std::tuple<Targs...>, 1>(group_size, cgh);
                // ...
                    sycl::nd_range<1>{corrected_len, group_size},
                    [=](sycl::nd_item<1> item) {
                        u32 local_id      = item.get_local_id(0);
                        u32 group_tile_id = item.get_group_linear_id();
                        u32 ibin          = group_tile_id * group_size + local_id;
                        // ...
                        bool is_valid_point = (ibin < nbins);
                        Tbins edge_inf      = is_valid_point ? bin_edge_inf[ibin] : Tbins{};
                        Tbins edge_sup      = is_valid_point ? bin_edge_sup[ibin] : Tbins{};
                        // ...
                        for (size_t i = 0; i < element_count; i += group_size) {
                            // ...
                            item.barrier(sycl::access::fence_space::local_space);
                            // ...
                            if (i + local_id < element_count) {
                                // ...
                                [&](auto &...in_data) {
                                    // ...
                                        = std::tuple{in_data[i + local_id]...};
                                // ...
                            item.barrier(sycl::access::fence_space::local_space);
                            // ...
                            if (is_valid_point) {
                                for (size_t lane = 0; lane < group_size; lane++) {
                                    if (i + lane >= element_count) {
                                        // ...
                                    bool has_value = false;
                                    // ...
                                    [&](auto &...local_accs) {
                                        // ...
                            item.barrier(sycl::access::fence_space::local_space);
                            // ...
                        }
                        // ...
                        if (is_valid_point) {
                            result[ibin] = local_sum;
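// How the team-fetching variant walks the data: each work-group covers
// group_size consecutive bins (one per work-item) and consumes the input in
// tiles of group_size elements. For every tile, each work-item stages one
// element into the sycl::local_accessor, a barrier publishes the tile, then
// every valid bin-thread iterates over the staged lanes (skipping lanes past
// element_count) and feeds them to the functor. The trailing barrier keeps
// fast threads from overwriting the tile while slower ones are still reading
// it, and only valid bins write back result[ibin] at the end.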
template<class T, class Tbins, class... Targs, class Tfunctor>
inline void compute_histogram_gpu_oversubscribe(
    const sham::DeviceScheduler_ptr &dev_sched,
    /* ... */
    size_t element_count,
    /* ... */) {

    // ...

    sham::kernel_call_hndl(
        dev_sched->get_queue(),
        // ...
        [element_count, functor, group_size, nbins](
            u32 nbins_oversubscribed,
            const Tbins *__restrict bin_edge_inf,
            const Tbins *__restrict bin_edge_sup,
            const Targs *__restrict... in_data,
            T *__restrict result) {
            return [=, in_data = std::tuple{in_data...}](sycl::handler &cgh) {
                u32 group_cnt = shambase::group_count(nbins_oversubscribed, group_size);
                // ...
                group_cnt = (group_cnt + 3) / 4 * 4; // round the group count up to a multiple of 4
                // ...
                u32 corrected_len = group_cnt * group_size;
                // ...
                    sycl::nd_range<1>{corrected_len, group_size},
                    [=](sycl::nd_item<1> item) {
                        u32 local_id = item.get_local_id(0);
                        u32 ibin     = item.get_group_linear_id();
                        // ...
                        bool is_valid_point = (ibin < nbins);
                        Tbins edge_inf      = is_valid_point ? bin_edge_inf[ibin] : Tbins{};
                        Tbins edge_sup      = is_valid_point ? bin_edge_sup[ibin] : Tbins{};
                        // ...
                        for (size_t i = 0; i < element_count; i += group_size) {
                            // ...
                            if (i + local_id < element_count) {
                                // ...
                                bool has_value = false;
                                // ...
                                [&](auto &...in_data) {
                                    // ...
                                        in_data[i + local_id]...,
                                    // ...
                        }
                        // ...
                        auto group_sum = sycl::reduce_over_group(
                            item.get_group(), local_sum, sycl::plus<T>{});
                        // ...
                        if (is_valid_point && local_id == 0) {
                            result[ibin] = group_sum;
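// Structure visible in this oversubscribed variant: one work-group per bin
// (ibin is the group linear id, guarded by ibin < nbins), with the group_size
// work-items striding through the input together (i += group_size) and keeping
// private partial sums. The partials are combined with sycl::reduce_over_group
// and only local_id == 0 of a valid group stores result[ibin]. The launch is
// sized from nbins_oversubscribed rather than nbins, which suggests extra
// groups are created to keep the device busy; groups past nbins simply skip
// the final store.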