Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
gpu_core_timeline.hpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
10#pragma once
11
19
26#include "shamcomm/logs.hpp"
27#include <shambackends/sycl.hpp>
28#include <unordered_map>
29#include <vector>
30
31#if __has_include(<nlohmann/json.hpp>)
32 #include "nlohmann/json.hpp"
33#endif
34
35namespace sham {
36
39 unsigned long long start;
40 unsigned long long first_end;
41 unsigned long long last_end;
42 uint lane;
43 uint color;
44 };
45
46} // namespace sham
47
48#if __has_include(<nlohmann/json.hpp>)
49
50NLOHMANN_JSON_NAMESPACE_BEGIN
51template<>
52struct adl_serializer<sham::TimelineEvent> {
53 static void to_json(json &j, const sham::TimelineEvent &e) {
54 j
55 = {{"start", e.start},
56 {"first_end", e.first_end},
57 {"last_end", e.last_end},
58 {"color", e.color},
59 {"lane", e.lane}};
60 }
61};
62NLOHMANN_JSON_NAMESPACE_END
63#endif
64
65namespace sham {
66
78 sham::DeviceScheduler_ptr dev_sched;
79 sham::DeviceBuffer<u64> frame_start_clock;
80
82 sham::DeviceBuffer<u64> event_count;
83
84 public:
87 const sham::DeviceScheduler_ptr &dev_sched, u32 max_event_count)
88 : dev_sched(dev_sched), frame_start_clock(1, dev_sched),
89 events(max_event_count, dev_sched), event_count(1, dev_sched) {
90 event_count.set_val_at_idx(0, 0);
92 }
93
106
107 static std::unordered_map<DeviceScheduler *, bool> cache;
108 auto it = cache.find(dev_sched.get());
109 if (it == cache.end()) {
110
111 sham::DeviceBuffer<u64> tmp(1, dev_sched);
112
114 dev_sched->get_queue(),
116 sham::MultiRef{tmp},
117 1,
118 [](u32 i, u64 *out) {
119#if defined(SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE) \
120 && defined(SHAMROCK_INTRISICS_GET_SMID_AVAILABLE)
121 *out = 1;
122#else
123 *out = 0;
124#endif
125 });
126
127 cache[dev_sched.get()] = tmp.get_val_at_idx(0);
128
129 if (!cache[dev_sched.get()]) {
131 "Backend", "gpu_core_timeline_profilier is not available on the device");
132 }
133 }
134
135 return cache[dev_sched.get()];
136 }
137
138 // base clock val
139
143 inline void setFrameStartClock() {
145 dev_sched->get_queue(),
147 sham::MultiRef{frame_start_clock},
148 1,
149 [](u32 i, u64 *clock) {
150#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
151 *clock = sham::get_device_clock();
152#else
153 *clock = 0;
154#endif
155 });
156 }
157
158 inline u64 get_base_clock_value() { return frame_start_clock.get_val_at_idx(0); }
159
160 struct local_access_t {
161 sycl::local_accessor<uint> _index;
162 sycl::local_accessor<bool> _valid;
163
164 local_access_t(sycl::handler &cgh) : _index(1, cgh), _valid(1, cgh) {}
165 };
166
167 // Kernel access section
168 struct acc {
169 TimelineEvent *events;
170 u64 *event_count;
171 u64 max_event_count;
172
190 sycl::nd_item<1> item, const local_access_t &acc) const {
191 if (item.get_local_id(0) == 0) {
192 sycl::atomic_ref<
193 u64,
194 sycl::memory_order_relaxed,
195 sycl::memory_scope_device,
196 sycl::access::address_space::global_space>
197 ev_cnt_ref(event_count[0]);
198
199 acc._index[0] = ev_cnt_ref.fetch_add(1_u64);
200 acc._valid[0] = acc._index[0] < max_event_count;
201
202 if (acc._valid[0]) {
203#ifdef SHAMROCK_INTRISICS_GET_SMID_AVAILABLE
204 events[acc._index[0]]
205 = {.start = u64_max,
206 .first_end = u64_max,
207 .last_end = 0,
208 .lane = sham::get_sm_id(),
209 .color = 0};
210#else
211 events[acc._index[0]] = {u64_max, u64_max, 0, 0, 0};
212#endif
213 }
214 }
215 item.barrier(); // equivalent to __syncthreads
216 }
217
226 inline void start_timeline_event(const local_access_t &acc) const {
227 if (acc._valid[0]) {
228
229 sycl::atomic_ref<
230 unsigned long long,
231 sycl::memory_order_relaxed,
232 sycl::memory_scope_device,
233 sycl::access::address_space::global_space>
234 start_val(events[acc._index[0]].start);
235
236 using ull = unsigned long long;
237
238#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
239 ull clock = sham::get_device_clock();
240#else
241 ull clock = 0;
242#endif
243
244 start_val.fetch_min(clock);
245 }
246 }
247
256 inline void end_timeline_event(const local_access_t &acc) const {
257 if (acc._valid[0]) {
258 sycl::atomic_ref<
259 unsigned long long,
260 sycl::memory_order_relaxed,
261 sycl::memory_scope_device,
262 sycl::access::address_space::global_space>
263 first_end(events[acc._index[0]].first_end);
264
265 sycl::atomic_ref<
266 unsigned long long,
267 sycl::memory_order_relaxed,
268 sycl::memory_scope_device,
269 sycl::access::address_space::global_space>
270 last_end(events[acc._index[0]].last_end);
271
272 using ull = unsigned long long;
273
274#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
275 ull clock = sham::get_device_clock();
276#else
277 ull clock = 0;
278#endif
279
280 first_end.fetch_min(clock);
281 last_end.fetch_max(clock);
282 }
283 }
284 };
285
294 return {
295 .events = events.get_write_access(deps),
296 .event_count = event_count.get_write_access(deps),
297 .max_event_count = events.get_size()};
298 }
299
307 inline void complete_event_state(sycl::event e) {
308 events.complete_event_state(e);
309 event_count.complete_event_state(e);
310 }
311
312#if __has_include(<nlohmann/json.hpp>)
333 inline void dump_to_file(const std::string &filename) {
334
335 u32 sz = event_count.get_val_at_idx(0);
336
337 std::cout << "dumping to " << filename << " size = " << sz << std::endl;
338
339 std::vector<TimelineEvent> events = this->events.copy_to_stdvec_idx_range(0, sz);
340
341 u64 base_clock = get_base_clock_value();
342
343 for (auto &t : events) {
344 t.start -= base_clock;
345 t.first_end -= base_clock;
346 t.last_end -= base_clock;
347 }
348
349 std::ofstream file(filename);
350 file << nlohmann::json(events).dump(4) << std::endl;
351 }
352#endif
353
354 // inline void open_file(const std::string &filename) {
355 // std::string cmd = "python3 ../buildbot/gpu_core_timeline_read.py ";
356 // cmd += filename + " -b 4";
357 // std::system(cmd.c_str());
358 // }
359 };
360
361} // namespace sham
void to_json(nlohmann::json &j, const PatchSchedulerConfig &p)
Converts a PatchSchedulerConfig object to a JSON object.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
A buffer allocated in USM (Unified Shared Memory).
T get_val_at_idx(size_t idx) const
Get the value at a given index in the buffer.
Class to manage a list of SYCL events.
Definition EventList.hpp:31
void setFrameStartClock()
Recover the current device time in the frame_start_clock buffer.
gpu_core_timeline_profilier(const sham::DeviceScheduler_ptr &dev_sched, u32 max_event_count)
CTOR.
acc get_write_access(sham::EventList &deps)
Get a write access to the timeline events and the event count.
bool is_available_on_device()
Check if gpu_core_timeline_profilier is available on the device.
This file implement the GPU core timeline tool from A. Richermoz, F. Neyret 2024.
namespace for backends this one is named only sham since shambackends is too long to write
u32 get_sm_id()
Return the SM (Streaming Multiprocessor) ID of the calling thread, or equivalent if implemented.
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
u64 get_device_clock()
Return the number of clock cycles elapsed since an arbitrary starting point on the device.
constexpr u64 u64_max
u64 max value
void warn_ln(std::string module_name, Types... var2)
Prints a log message with multiple arguments followed by a newline.
Definition logs.hpp:133
A class that references multiple buffers or similar objects.
A timeline event for the gpu core timeline.
void start_timeline_event(const local_access_t &acc) const
Start a timeline event.
void init_timeline_event(sycl::nd_item< 1 > item, const local_access_t &acc) const
Initialize a timeline event.
void end_timeline_event(const local_access_t &acc) const
Finish a timeline event.