28#include <unordered_map>
31#if __has_include(<nlohmann/json.hpp>)
32 #include "nlohmann/json.hpp"
39 unsigned long long start;
40 unsigned long long first_end;
41 unsigned long long last_end;
48#if __has_include(<nlohmann/json.hpp>)
50NLOHMANN_JSON_NAMESPACE_BEGIN
52struct adl_serializer<
sham::TimelineEvent> {
55 = {{
"start", e.start},
56 {
"first_end", e.first_end},
57 {
"last_end", e.last_end},
62NLOHMANN_JSON_NAMESPACE_END
78 sham::DeviceScheduler_ptr dev_sched;
87 const sham::DeviceScheduler_ptr &dev_sched,
u32 max_event_count)
88 : dev_sched(dev_sched), frame_start_clock(1, dev_sched),
89 events(max_event_count, dev_sched), event_count(1, dev_sched) {
90 event_count.set_val_at_idx(0, 0);
107 static std::unordered_map<DeviceScheduler *, bool> cache;
108 auto it = cache.find(dev_sched.get());
109 if (it == cache.end()) {
114 dev_sched->get_queue(),
119#if defined(SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE) \
120 && defined(SHAMROCK_INTRISICS_GET_SMID_AVAILABLE)
129 if (!cache[dev_sched.get()]) {
131 "Backend",
"gpu_core_timeline_profilier is not available on the device");
135 return cache[dev_sched.get()];
145 dev_sched->get_queue(),
150#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
151 *clock = sham::get_device_clock();
/// Return the value stored at index 0 of the frame_start_clock buffer.
/// dump_to_file() subtracts this value from the start, first_end and last_end
/// fields of every event before writing them out.
158 inline u64 get_base_clock_value() {
return frame_start_clock.
get_val_at_idx(0); }
161 sycl::local_accessor<uint> _index;
162 sycl::local_accessor<bool> _valid;
/// Construct both local accessors with a single element each, registering
/// them on the command group handler cgh. In the kernel code visible in this
/// listing, _index[0] receives the value returned by the event counter's
/// fetch_add and _valid[0] records whether that value is < max_event_count.
164 local_access_t(sycl::handler &cgh) : _index(1, cgh), _valid(1, cgh) {}
191 if (item.get_local_id(0) == 0) {
194 sycl::memory_order_relaxed,
195 sycl::memory_scope_device,
196 sycl::access::address_space::global_space>
197 ev_cnt_ref(event_count[0]);
199 acc._index[0] = ev_cnt_ref.fetch_add(1_u64);
200 acc._valid[0] =
acc._index[0] < max_event_count;
203#ifdef SHAMROCK_INTRISICS_GET_SMID_AVAILABLE
226 sycl::memory_order_relaxed,
227 sycl::memory_scope_device,
228 sycl::access::address_space::global_space>
229 start_val(events[
acc._index[0]].start);
231 using ull =
unsigned long long;
233#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
239 start_val.fetch_min(clock);
255 sycl::memory_order_relaxed,
256 sycl::memory_scope_device,
257 sycl::access::address_space::global_space>
258 first_end(events[
acc._index[0]].first_end);
262 sycl::memory_order_relaxed,
263 sycl::memory_scope_device,
264 sycl::access::address_space::global_space>
265 last_end(events[
acc._index[0]].last_end);
267 using ull =
unsigned long long;
269#ifdef SHAMROCK_INTRISICS_GET_DEVICE_CLOCK_AVAILABLE
275 first_end.fetch_min(clock);
276 last_end.fetch_max(clock);
307#if __has_include(<nlohmann/json.hpp>)
328 inline void dump_to_file(
const std::string &filename) {
332 std::cout <<
"dumping to " << filename <<
" size = " << sz << std::endl;
336 u64 base_clock = get_base_clock_value();
338 for (
auto &t : events) {
339 t.start -= base_clock;
340 t.first_end -= base_clock;
341 t.last_end -= base_clock;
344 std::ofstream file(filename);
345 file << nlohmann::json(events).dump(4) << std::endl;
void to_json(nlohmann::json &j, const PatchSchedulerConfig &p)
Converts a PatchSchedulerConfig object to a JSON object.
std::uint32_t u32
32 bit unsigned integer
std::uint64_t u64
64 bit unsigned integer
A buffer allocated in USM (Unified Shared Memory)
void complete_event_state(sycl::event e) const
Complete the event state of the buffer.
T * get_write_access(sham::EventList &depends_list, SourceLocation src_loc=SourceLocation{})
Get a read-write pointer to the buffer's data.
T get_val_at_idx(size_t idx) const
Get the value at a given index in the buffer.
size_t get_size() const
Gets the number of elements in the buffer.
std::vector< T > copy_to_stdvec_idx_range(size_t begin, size_t end) const
Copies a specified range of elements from the buffer to a std::vector.
Class to manage a list of SYCL events.
This class implements the GPU core timeline tool from the original algorithm of A. Richermoz,...
void complete_event_state(sycl::event e)
void setFrameStartClock()
Recover the current device time in the frame_start_clock buffer.
gpu_core_timeline_profilier(const sham::DeviceScheduler_ptr &dev_sched, u32 max_event_count)
Constructor.
acc get_write_access(sham::EventList &deps)
Get a write access to the timeline events and the event count.
bool is_available_on_device()
Check if gpu_core_timeline_profilier is available on the device.
This file implements the GPU core timeline tool from A. Richermoz, F. Neyret 2024.
Namespace for the backends; it is named simply sham because shambackends is too long to write.
u32 get_sm_id()
Return the SM (Streaming Multiprocessor) ID of the calling thread, or equivalent if implemented.
void kernel_call(sham::DeviceQueue &q, RefIn in, RefOut in_out, u32 n, Functor &&func, SourceLocation &&callsite=SourceLocation{})
Submit a kernel to a SYCL queue.
u64 get_device_clock()
Return the number of clock cycles elapsed since an arbitrary starting point on the device.
constexpr u64 u64_max
u64 max value
A class that references multiple buffers or similar objects.
A timeline event for the gpu core timeline.
void start_timeline_event(const local_access_t &acc) const
Start a timeline event.
void init_timeline_event(sycl::nd_item< 1 > item, const local_access_t &acc) const
Initialize a timeline event.
void end_timeline_event(const local_access_t &acc) const
Finish a timeline event.