Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
gpu_core_timeline.hpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
10#pragma once
11
#include "shamcomm/logs.hpp"
#include <shambackends/sycl.hpp>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
30
31#if __has_include(<nlohmann/json.hpp>)
32 #include "nlohmann/json.hpp"
33#endif
34
namespace sham {

    /// A timeline event for the gpu core timeline.
    ///
    /// Clock fields hold raw device clock counts; `dump_to_file` rebases them
    /// relative to the frame start clock before serialization.
    struct TimelineEvent {
        unsigned long long start;     ///< earliest start clock observed (min over work-items)
        unsigned long long first_end; ///< earliest end clock observed (min over work-items)
        unsigned long long last_end;  ///< latest end clock observed (max over work-items)
        uint lane;                    ///< lane of the event (SM id when the intrinsic is available)
        uint color;                   ///< color tag for the timeline visualizer
    };

} // namespace sham
47
#if __has_include(<nlohmann/json.hpp>)

NLOHMANN_JSON_NAMESPACE_BEGIN
/// nlohmann::json serializer specialization for sham::TimelineEvent.
/// Only to_json is provided: events are dumped to file, never parsed back.
template<>
struct adl_serializer<sham::TimelineEvent> {
    /// Serialize a timeline event as a JSON object with keys
    /// "start", "first_end", "last_end", "color", "lane".
    static void to_json(json &j, const sham::TimelineEvent &e) {
        j
            = {{"start", e.start},
               {"first_end", e.first_end},
               {"last_end", e.last_end},
               {"color", e.color},
               {"lane", e.lane}};
    }
};
NLOHMANN_JSON_NAMESPACE_END
#endif
64
65namespace sham {
66
78 sham::DeviceScheduler_ptr dev_sched;
79 sham::DeviceBuffer<u64> frame_start_clock;
80
82 sham::DeviceBuffer<u64> event_count;
83
84 public:
87 const sham::DeviceScheduler_ptr &dev_sched, u32 max_event_count)
88 : dev_sched(dev_sched), frame_start_clock(1, dev_sched),
89 events(max_event_count, dev_sched), event_count(1, dev_sched) {
90 event_count.set_val_at_idx(0, 0);
92 }
93
106
107 static std::unordered_map<DeviceScheduler *, bool> cache;
108 auto it = cache.find(dev_sched.get());
109 if (it == cache.end()) {
110
111 sham::DeviceBuffer<u64> tmp(1, dev_sched);
112
114 dev_sched->get_queue(),
116 sham::MultiRef{tmp},
117 1,
118 [](u32 i, u64 *out) {
119#if defined(SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE) \
120 && defined(SHAMROCK_INTRISICS_GET_SMID_AVAILABLE)
121 *out = 1;
122#else
123 *out = 0;
124#endif
125 });
126
127 cache[dev_sched.get()] = tmp.get_val_at_idx(0);
128
129 if (!cache[dev_sched.get()]) {
130 logger::warn_ln(
131 "Backend", "gpu_core_timeline_profilier is not available on the device");
132 }
133 }
134
135 return cache[dev_sched.get()];
136 }
137
138 // base clock val
139
143 inline void setFrameStartClock() {
145 dev_sched->get_queue(),
147 sham::MultiRef{frame_start_clock},
148 1,
149 [](u32 i, u64 *clock) {
150#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
151 *clock = sham::get_device_clock();
152#else
153 *clock = 0;
154#endif
155 });
156 }
157
158 inline u64 get_base_clock_value() { return frame_start_clock.get_val_at_idx(0); }
159
161 sycl::local_accessor<uint> _index;
162 sycl::local_accessor<bool> _valid;
163
164 local_access_t(sycl::handler &cgh) : _index(1, cgh), _valid(1, cgh) {}
165 };
166
167 // Kernel access section
168 struct acc {
169 TimelineEvent *events;
170 u64 *event_count;
171 u64 max_event_count;
172
190 sycl::nd_item<1> item, const local_access_t &acc) const {
191 if (item.get_local_id(0) == 0) {
192 sycl::atomic_ref<
193 u64,
194 sycl::memory_order_relaxed,
195 sycl::memory_scope_device,
196 sycl::access::address_space::global_space>
197 ev_cnt_ref(event_count[0]);
198
199 acc._index[0] = ev_cnt_ref.fetch_add(1_u64);
200 acc._valid[0] = acc._index[0] < max_event_count;
201
202 if (acc._valid[0]) {
203#ifdef SHAMROCK_INTRISICS_GET_SMID_AVAILABLE
204 events[acc._index[0]] = {u64_max, u64_max, 0, sham::get_sm_id(), 0};
205#else
206 events[acc._index[0]] = {u64_max, u64_max, 0, 0, 0};
207#endif
208 }
209 }
210 item.barrier(); // equivalent to __syncthreads
211 }
212
221 inline void start_timeline_event(const local_access_t &acc) const {
222 if (acc._valid[0]) {
223
224 sycl::atomic_ref<
225 unsigned long long,
226 sycl::memory_order_relaxed,
227 sycl::memory_scope_device,
228 sycl::access::address_space::global_space>
229 start_val(events[acc._index[0]].start);
230
231 using ull = unsigned long long;
232
233#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
234 ull clock = sham::get_device_clock();
235#else
236 ull clock = 0;
237#endif
238
239 start_val.fetch_min(clock);
240 }
241 }
242
251 inline void end_timeline_event(const local_access_t &acc) const {
252 if (acc._valid[0]) {
253 sycl::atomic_ref<
254 unsigned long long,
255 sycl::memory_order_relaxed,
256 sycl::memory_scope_device,
257 sycl::access::address_space::global_space>
258 first_end(events[acc._index[0]].first_end);
259
260 sycl::atomic_ref<
261 unsigned long long,
262 sycl::memory_order_relaxed,
263 sycl::memory_scope_device,
264 sycl::access::address_space::global_space>
265 last_end(events[acc._index[0]].last_end);
266
267 using ull = unsigned long long;
268
269#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
270 ull clock = sham::get_device_clock();
271#else
272 ull clock = 0;
273#endif
274
275 first_end.fetch_min(clock);
276 last_end.fetch_max(clock);
277 }
278 }
279 };
280
289 return {
290 events.get_write_access(deps),
291 event_count.get_write_access(deps),
292 events.get_size()};
293 }
294
        /// Complete the event state of both internal buffers.
        ///
        /// Must be called with the SYCL event returned by the kernel that
        /// used the pointers obtained from get_write_access(), so the
        /// buffers know when the kernel is done with them.
        /// @param e the SYCL event to register on both buffers
        inline void complete_event_state(sycl::event e) {
            events.complete_event_state(e);
            event_count.complete_event_state(e);
        }
306
#if __has_include(<nlohmann/json.hpp>)
        /// Dump the recorded timeline events to a JSON file.
        ///
        /// Clock values are rebased so that the frame start clock (see
        /// setFrameStartClock) corresponds to t = 0.
        /// @param filename path of the JSON file to write
        inline void dump_to_file(const std::string &filename) {

            u64 sz = event_count.get_val_at_idx(0);

            // The event counter keeps incrementing even once the buffer is
            // full (init_timeline_event always does fetch_add), so clamp to
            // the actual capacity before reading the buffer back; otherwise
            // copy_to_stdvec_idx_range would read out of range.
            if (sz > events.get_size()) {
                sz = events.get_size();
            }

            std::cout << "dumping to " << filename << " size = " << sz << std::endl;

            // Local renamed from `events` to avoid shadowing the member.
            std::vector<TimelineEvent> recorded = this->events.copy_to_stdvec_idx_range(0, sz);

            u64 base_clock = get_base_clock_value();

            // Rebase every clock relative to the frame start.
            for (auto &t : recorded) {
                t.start -= base_clock;
                t.first_end -= base_clock;
                t.last_end -= base_clock;
            }

            std::ofstream file(filename);
            file << nlohmann::json(recorded).dump(4) << std::endl;
        }
#endif
348
349 // inline void open_file(const std::string &filename) {
350 // std::string cmd = "python3 ../buildbot/gpu_core_timeline_read.py ";
351 // cmd += filename + " -b 4";
352 // std::system(cmd.c_str());
353 // }
354 };
355
356} // namespace sham
void to_json(nlohmann::json &j, const PatchSchedulerConfig &p)
Converts a PatchSchedulerConfig object to a JSON object.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
A buffer allocated in USM (Unified Shared Memory)
void complete_event_state(sycl::event e) const
Complete the event state of the buffer.
T * get_write_access(sham::EventList &depends_list, SourceLocation src_loc=SourceLocation{})
Get a read-write pointer to the buffer's data.
T get_val_at_idx(size_t idx) const
Get the value at a given index in the buffer.
size_t get_size() const
Gets the number of elements in the buffer.
std::vector< T > copy_to_stdvec_idx_range(size_t begin, size_t end) const
Copies a specified range of elements from the buffer to a std::vector.
Class to manage a list of SYCL events.
Definition EventList.hpp:31
This class implements the GPU core timeline tool, based on the original algorithm of A. Richermoz,...
void setFrameStartClock()
Sample the current device clock and store it in the frame_start_clock buffer.
gpu_core_timeline_profilier(const sham::DeviceScheduler_ptr &dev_sched, u32 max_event_count)
CTOR.
acc get_write_access(sham::EventList &deps)
Get a write access to the timeline events and the event count.
bool is_available_on_device()
Check if gpu_core_timeline_profilier is available on the device.
This file implements the GPU core timeline tool from A. Richermoz & F. Neyret (2024).
namespace for backends this one is named only sham since shambackends is too long to write
u32 get_sm_id()
Return the SM (Streaming Multiprocessor) ID of the calling thread, or equivalent if implemented.
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
u64 get_device_clock()
Return the number of clock cycles elapsed since an arbitrary starting point on the device.
constexpr u64 u64_max
u64 max value
A class that references multiple buffers or similar objects.
A timeline event for the gpu core timeline.
void start_timeline_event(const local_access_t &acc) const
Start a timeline event.
void init_timeline_event(sycl::nd_item< 1 > item, const local_access_t &acc) const
Initialize a timeline event.
void end_timeline_event(const local_access_t &acc) const
Finish a timeline event.