28#include <unordered_map>
31#if __has_include(<nlohmann/json.hpp>)
32 #include "nlohmann/json.hpp"
39 unsigned long long start;
40 unsigned long long first_end;
41 unsigned long long last_end;
48#if __has_include(<nlohmann/json.hpp>)
50NLOHMANN_JSON_NAMESPACE_BEGIN
52struct adl_serializer<
sham::TimelineEvent> {
55 = {{
"start", e.start},
56 {
"first_end", e.first_end},
57 {
"last_end", e.last_end},
62NLOHMANN_JSON_NAMESPACE_END
78 sham::DeviceScheduler_ptr dev_sched;
87 const sham::DeviceScheduler_ptr &dev_sched,
u32 max_event_count)
88 : dev_sched(dev_sched), frame_start_clock(1, dev_sched),
89 events(max_event_count, dev_sched), event_count(1, dev_sched) {
90 event_count.set_val_at_idx(0, 0);
107 static std::unordered_map<DeviceScheduler *, bool> cache;
108 auto it = cache.find(dev_sched.get());
109 if (it == cache.end()) {
114 dev_sched->get_queue(),
119#if defined(SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE) \
120 && defined(SHAMROCK_INTRISICS_GET_SMID_AVAILABLE)
129 if (!cache[dev_sched.get()]) {
131 "Backend",
"gpu_core_timeline_profilier is not available on the device");
135 return cache[dev_sched.get()];
145 dev_sched->get_queue(),
150#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
151 *clock = sham::get_device_clock();
/// @brief Return the value stored at index 0 of the frame_start_clock buffer.
///
/// frame_start_clock is the buffer that setFrameStartClock() is documented to fill with the
/// current device time. dump_to_file() subtracts the value returned here from the start,
/// first_end and last_end fields of every event before writing them out.
///
/// @return The u64 read from frame_start_clock at index 0.
158 inline u64 get_base_clock_value() {
return frame_start_clock.
get_val_at_idx(0); }
160 struct local_access_t {
161 sycl::local_accessor<uint> _index;
162 sycl::local_accessor<bool> _valid;
164 local_access_t(sycl::handler &cgh) : _index(1, cgh), _valid(1, cgh) {}
191 if (item.get_local_id(0) == 0) {
194 sycl::memory_order_relaxed,
195 sycl::memory_scope_device,
196 sycl::access::address_space::global_space>
197 ev_cnt_ref(event_count[0]);
199 acc._index[0] = ev_cnt_ref.fetch_add(1_u64);
200 acc._valid[0] =
acc._index[0] < max_event_count;
203#ifdef SHAMROCK_INTRISICS_GET_SMID_AVAILABLE
204 events[
acc._index[0]]
231 sycl::memory_order_relaxed,
232 sycl::memory_scope_device,
233 sycl::access::address_space::global_space>
234 start_val(events[
acc._index[0]].start);
236 using ull =
unsigned long long;
238#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
244 start_val.fetch_min(clock);
260 sycl::memory_order_relaxed,
261 sycl::memory_scope_device,
262 sycl::access::address_space::global_space>
263 first_end(events[
acc._index[0]].first_end);
267 sycl::memory_order_relaxed,
268 sycl::memory_scope_device,
269 sycl::access::address_space::global_space>
270 last_end(events[
acc._index[0]].last_end);
272 using ull =
unsigned long long;
274#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
280 first_end.fetch_min(clock);
281 last_end.fetch_max(clock);
295 .events = events.get_write_access(deps),
296 .event_count = event_count.get_write_access(deps),
297 .max_event_count = events.get_size()};
308 events.complete_event_state(e);
309 event_count.complete_event_state(e);
312#if __has_include(<nlohmann/json.hpp>)
333 inline void dump_to_file(
const std::string &filename) {
337 std::cout <<
"dumping to " << filename <<
" size = " << sz << std::endl;
339 std::vector<TimelineEvent> events = this->events.copy_to_stdvec_idx_range(0, sz);
341 u64 base_clock = get_base_clock_value();
343 for (
auto &t : events) {
344 t.start -= base_clock;
345 t.first_end -= base_clock;
346 t.last_end -= base_clock;
349 std::ofstream file(filename);
350 file << nlohmann::json(events).dump(4) << std::endl;
void to_json(nlohmann::json &j, const PatchSchedulerConfig &p)
Converts a PatchSchedulerConfig object to a JSON object.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
A buffer allocated in USM (Unified Shared Memory).
T get_val_at_idx(size_t idx) const
Get the value at a given index in the buffer.
Class to manage a list of SYCL events.
void complete_event_state(sycl::event e)
void setFrameStartClock()
Recover the current device time in the frame_start_clock buffer.
gpu_core_timeline_profilier(const sham::DeviceScheduler_ptr &dev_sched, u32 max_event_count)
Constructor.
acc get_write_access(sham::EventList &deps)
Get a write access to the timeline events and the event count.
bool is_available_on_device()
Check if gpu_core_timeline_profilier is available on the device.
This file implements the GPU core timeline tool from A. Richermoz and F. Neyret (2024).
Namespace for the backends; it is named only sham since shambackends is too long to write.
u32 get_sm_id()
Return the SM (Streaming Multiprocessor) ID of the calling thread, or equivalent if implemented.
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
u64 get_device_clock()
Return the number of clock cycles elapsed since an arbitrary starting point on the device.
constexpr u64 u64_max
u64 max value
void warn_ln(std::string module_name, Types... var2)
Prints a log message with multiple arguments followed by a newline.
A class that references multiple buffers or similar objects.
A timeline event for the gpu core timeline.
void start_timeline_event(const local_access_t &acc) const
Start a timeline event.
void init_timeline_event(sycl::nd_item< 1 > item, const local_access_t &acc) const
Initialize a timeline event.
void end_timeline_event(const local_access_t &acc) const
Finish a timeline event.