21namespace impl::copy_to_host {
27 T *init(
const std::unique_ptr<sycl::buffer<T>> &buf,
u32 comm_sz) {
29 using namespace shamsys::instance;
33 shamlog_debug_sycl_ln(
34 "PatchDataField MPI Comm",
38 reinterpret_cast<void *
>(comm_ptr));
41 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"copy buffer -> USM");
46 const T *src = &(acc[0]);
49 std::memcpy(dest, src,
sizeof(T) * comm_sz);
53 shamlog_debug_sycl_ln(
54 "PatchDataField MPI Comm",
"copy buffer -> USM (skipped size=0)");
60#define X(_t) template _t *init<_t>(const std::unique_ptr<sycl::buffer<_t>> &buf, u32 comm_sz);
61 XMAC_SYCLMPI_TYPE_ENABLED
65 void finalize(T *comm_ptr) {
67 using namespace shamsys::instance;
69 shamlog_debug_sycl_ln(
70 "PatchDataField MPI Comm",
"sycl::free",
reinterpret_cast<void *
>(comm_ptr));
75#define X(_t) template void finalize(_t *comm_ptr);
76 XMAC_SYCLMPI_TYPE_ENABLED
82 T *init(
u32 comm_sz) {
84 using namespace shamsys::instance;
86 T *comm_ptr = sycl::malloc_host<T>(comm_sz, shamsys::instance::get_compute_queue());
88 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"sycl::malloc_host", comm_sz);
93#define X(_t) template _t *init(u32 comm_sz);
94 XMAC_SYCLMPI_TYPE_ENABLED
97 void finalize(
const std::unique_ptr<sycl::buffer<T>> &buf, T *comm_ptr,
u32 comm_sz) {
100 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"copy USM -> buffer");
103 sycl::host_accessor acc{
106 const T *src = comm_ptr;
109 std::memcpy(dest, src,
sizeof(T) * comm_sz);
113 shamlog_debug_sycl_ln(
114 "PatchDataField MPI Comm",
"copy USM -> buffer (skipped size=0)");
117 shamlog_debug_sycl_ln(
118 "PatchDataField MPI Comm",
"sycl::free",
reinterpret_cast<void *
>(comm_ptr));
120 sycl::free(comm_ptr, shamsys::instance::get_compute_queue());
124 template void finalize(const std::unique_ptr<sycl::buffer<_t>> &buf, _t *comm_ptr, u32 comm_sz);
125 XMAC_SYCLMPI_TYPE_ENABLED
131namespace impl::directgpu {
133 using namespace mpi_sycl_interop;
137 T *init(
const std::unique_ptr<sycl::buffer<T>> &buf,
u32 comm_sz) {
139 T *comm_ptr = sycl::malloc_device<T>(comm_sz, shamsys::instance::get_compute_queue());
140 shamlog_debug_sycl_ln(
141 "PatchDataField MPI Comm",
"sycl::malloc_device", comm_sz,
"->", comm_ptr);
144 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"copy buffer -> USM");
148 sycl::accessor acc{*buf, cgh, sycl::read_only};
152 cgh.parallel_for(sycl::range<1>{comm_sz}, [=](sycl::item<1> item) {
153 ptr[item.get_linear_id()] = acc[item];
159 shamlog_debug_sycl_ln(
160 "PatchDataField MPI Comm",
"copy buffer -> USM (skipped size=0)");
166#define X(_t) template _t *init<_t>(const std::unique_ptr<sycl::buffer<_t>> &buf, u32 comm_sz);
167 XMAC_SYCLMPI_TYPE_ENABLED
171 void finalize(T *comm_ptr) {
172 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"sycl::free", comm_ptr);
174 sycl::free(comm_ptr, shamsys::instance::get_compute_queue());
177#define X(_t) template void finalize(_t *comm_ptr);
178 XMAC_SYCLMPI_TYPE_ENABLED
184 T *init(
u32 comm_sz) {
185 T *comm_ptr = sycl::malloc_device<T>(comm_sz, shamsys::instance::get_compute_queue());
187 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"sycl::malloc_device", comm_sz);
192#define X(_t) template _t *init(u32 comm_sz);
193 XMAC_SYCLMPI_TYPE_ENABLED
196 void finalize(
const std::unique_ptr<sycl::buffer<T>> &buf, T *comm_ptr,
u32 comm_sz) {
199 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"copy USM -> buffer");
203 sycl::accessor acc{*buf, cgh, sycl::write_only};
207 cgh.parallel_for(sycl::range<1>{comm_sz}, [=](sycl::item<1> item) {
208 acc[item] = ptr[item.get_linear_id()];
214 shamlog_debug_sycl_ln(
215 "PatchDataField MPI Comm",
"copy USM -> buffer (skipped size=0)");
218 shamlog_debug_sycl_ln(
"PatchDataField MPI Comm",
"sycl::free", comm_ptr);
220 sycl::free(comm_ptr, shamsys::instance::get_compute_queue());
223 template void finalize(const std::unique_ptr<sycl::buffer<_t>> &buf, _t *comm_ptr, u32 comm_sz);
224 XMAC_SYCLMPI_TYPE_ENABLED
231namespace mpi_sycl_interop {
237 std::unique_ptr<sycl::buffer<T>> &sycl_buf,
241 : comm_mode(comm_mode), comm_op(comm_op), comm_sz(comm_sz), sycl_buf(sycl_buf) {
243 shamlog_debug_mpi_ln(
244 "PatchDataField MPI Comm",
245 "starting mpi sycl comm ",
250 if (comm_mode == CopyToHost && comm_op == Send) {
252 comm_ptr = impl::copy_to_host::send::init<T>(sycl_buf, comm_sz);
254 }
else if (comm_mode == CopyToHost && comm_op == Recv_Probe) {
256 comm_ptr = impl::copy_to_host::recv::init<T>(comm_sz);
258 }
else if (comm_mode == DirectGPU && comm_op == Send) {
260 comm_ptr = impl::directgpu::send::init<T>(sycl_buf, comm_sz);
262 }
else if (comm_mode == DirectGPU && comm_op == Recv_Probe) {
264 comm_ptr = impl::directgpu::recv::init<T>(comm_sz);
268 "PatchDataField MPI Comm",
269 "communication mode & op combination not implemented :",
278 shamlog_debug_mpi_ln(
279 "PatchDataField MPI Comm",
280 "finalizing mpi sycl comm ",
285 sycl_buf = std::make_unique<sycl::buffer<T>>(comm_sz);
287 if (comm_mode == CopyToHost && comm_op == Send) {
289 impl::copy_to_host::send::finalize<T>(comm_ptr);
291 }
else if (comm_mode == CopyToHost && comm_op == Recv_Probe) {
293 impl::copy_to_host::recv::finalize<T>(sycl_buf, comm_ptr, comm_sz);
295 }
else if (comm_mode == DirectGPU && comm_op == Send) {
297 impl::directgpu::send::finalize<T>(comm_ptr);
299 }
else if (comm_mode == DirectGPU && comm_op == Recv_Probe) {
301 impl::directgpu::recv::finalize<T>(sycl_buf, comm_ptr, comm_sz);
305 "PatchDataField MPI Comm",
306 "communication mode & op combination not implemented :",
312#define X(a) template struct BufferMpiRequest<a>;
313 XMAC_SYCLMPI_TYPE_ENABLED
sycl::queue & get_compute_queue(u32 id=0)
std::uint32_t u32
32 bit unsigned integer
T & get_check_ref(const std::unique_ptr< T > &ptr, SourceLocation loc=SourceLocation())
Takes a std::unique_ptr and returns a reference to the object it holds. It throws a std::runtime_erro...
@ CopyToHost
copy data to the host and then perform the call
header file to manage sycl