commit b8166afbbd33f8f277e919560981668a327e39db
Author: John Alanbrook
Date:   Fri Jan 9 11:11:12 2026 -0600

    initial add

diff --git a/cell.toml b/cell.toml
new file mode 100644
index 0000000..d12cae0
--- /dev/null
+++ b/cell.toml
@@ -0,0 +1,3 @@
+[compilation]
+LDFLAGS = "-ltracy"
+CFLAGS = "-I/opt/homebrew/include/tracy"
diff --git a/tracy.c b/tracy.c
new file mode 100644
index 0000000..d7844cc
--- /dev/null
+++ b/tracy.c
@@ -0,0 +1,135 @@
+#include "cell.h"
+
+#define TRACY_ENABLE
+#include "TracyC.h"
+
+#define MAX_ZONE_DEPTH 256
+
+typedef struct {
+  TracyCZoneCtx zone_stack[MAX_ZONE_DEPTH];
+  int zone_depth;
+} tracy_context;
+
+static void tracy_hook(JSContext *js, int type, struct js_debug *dbg, void *user)
+{
+  tracy_context *ctx = (tracy_context *)user;
+  if (!ctx) return;
+
+  if (type == JS_HOOK_CALL) {
+    if (ctx->zone_depth >= MAX_ZONE_DEPTH) return;
+
+    uint64_t srcloc = ___tracy_alloc_srcloc(
+      dbg->line,
+      dbg->filename ? dbg->filename : "<unknown>",
+      dbg->filename ? strlen(dbg->filename) : strlen("<unknown>"),
+      dbg->name ? dbg->name : "<anonymous>",
+      dbg->name ? strlen(dbg->name) : strlen("<anonymous>"),
+      0
+    );
+
+    ctx->zone_stack[ctx->zone_depth] = ___tracy_emit_zone_begin_alloc(srcloc, 1);
+    ___tracy_emit_zone_color(ctx->zone_stack[ctx->zone_depth], dbg->unique);
+    ctx->zone_depth++;
+  }
+  else if (type == JS_HOOK_RET) {
+    if (ctx->zone_depth <= 0) return;
+
+    ctx->zone_depth--;
+    ___tracy_emit_zone_end(ctx->zone_stack[ctx->zone_depth]);
+  }
+}
+
+static void tracy_cell_hook(const char *name, int type)
+{
+  printf("%s\n", name);
+#ifdef TRACY_FIBERS
+  if (type == CELL_HOOK_ENTER)
+    ___tracy_fiber_enter(name);
+  else if (type == CELL_HOOK_EXIT)
+    ___tracy_fiber_leave();
+#endif
+}
+
+static JSValue js_tracy_init(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+  tracy_context *ctx = malloc(sizeof(tracy_context));
+  if (!ctx) return JS_EXCEPTION;
+
+  ctx->zone_depth = 0;
+  memset(ctx->zone_stack, 0, sizeof(ctx->zone_stack));
+
+  js_debug_sethook(js, tracy_hook, JS_HOOK_CALL | JS_HOOK_RET, ctx);
+  cell_trace_sethook(tracy_cell_hook);
+  return JS_NULL;
+}
+
+static JSValue js_tracy_frame(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+  ___tracy_emit_frame_mark(NULL);
+  return JS_NULL;
+}
+
+static JSValue js_tracy_message(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+  if (argc < 1) return JS_NULL;
+
+  const char *msg = JS_ToCString(js, argv[0]);
+  if (!msg) return JS_EXCEPTION;
+
+  ___tracy_emit_message(msg, strlen(msg), 0); /* sized variant copies msg; the L variant keeps the pointer, but msg is freed below */
+  JS_FreeCString(js, msg);
+  return JS_NULL;
+}
+
+static JSValue js_tracy_image(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+  if (argc < 3) return JS_NULL;
+
+  size_t size;
+  void *data = js_get_blob_data(js, &size, argv[0]);
+  if (!data) return JS_EXCEPTION;
+
+  uint16_t w = (uint16_t)js2number(js, argv[1]);
+  uint16_t h = (uint16_t)js2number(js, argv[2]);
+  uint8_t offset = argc > 3 ? (uint8_t)js2number(js, argv[3]) : 0;
+  int32_t flip = argc > 4 ? js2bool(js, argv[4]) : 0;
+
+  ___tracy_emit_frame_image(data, w, h, offset, flip);
+  return JS_NULL;
+}
+static JSValue js_tracy_appinfo(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+  if (argc < 1) return JS_NULL;
+
+  const char *txt = JS_ToCString(js, argv[0]);
+  if (!txt) return JS_EXCEPTION;
+
+  ___tracy_emit_message_appinfo(txt, strlen(txt));
+  JS_FreeCString(js, txt);
+  return JS_NULL;
+}
+
+static JSValue js_tracy_begin_sampling(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+// int result = ___tracy_begin_sampling_profiler();
+// return JS_NewInt32(js, result);
+  return JS_NULL;
+}
+
+static JSValue js_tracy_end_sampling(JSContext *js, JSValue self, int argc, JSValue *argv)
+{
+// ___tracy_end_sampling_profiler();
+  return JS_NULL;
+}
+
+static const JSCFunctionListEntry js_tracy_funcs[] = {
+  JS_CFUNC_DEF("init", 0, js_tracy_init),
+  JS_CFUNC_DEF("frame", 0, js_tracy_frame),
+  JS_CFUNC_DEF("message", 1, js_tracy_message),
+  JS_CFUNC_DEF("image", 1, js_tracy_image),
+  JS_CFUNC_DEF("appinfo", 1, js_tracy_appinfo),
+  JS_CFUNC_DEF("beginSampling", 0, js_tracy_begin_sampling),
+  JS_CFUNC_DEF("endSampling", 0, js_tracy_end_sampling),
+};
+
+CELL_USE_FUNCS(js_tracy_funcs)
diff --git a/tracy/Tracy.hpp b/tracy/Tracy.hpp
new file mode 100644
index 0000000..8481396
--- /dev/null
+++ b/tracy/Tracy.hpp
@@ -0,0 +1,278 @@
+#ifndef __TRACY_HPP__
+#define __TRACY_HPP__
+
+#include "../common/TracyColor.hpp"
+#include "../common/TracySystem.hpp"
+
+#ifndef TracyFunction
+# define TracyFunction __FUNCTION__
+#endif
+
+#ifndef TracyFile
+# define TracyFile __FILE__
+#endif
+
+#ifndef TracyLine
+# define TracyLine TracyConcat(__LINE__,U) // MSVC Edit and continue __LINE__ is non-constant. See https://developercommunity.visualstudio.com/t/-line-cannot-be-used-as-an-argument-for-constexpr/195665
+#endif
+
+#ifndef TRACY_ENABLE
+
+#define TracyNoop
+
+#define ZoneNamed(x,y)
+#define ZoneNamedN(x,y,z)
+#define ZoneNamedC(x,y,z)
+#define ZoneNamedNC(x,y,z,w)
+
+#define ZoneTransient(x,y)
+#define ZoneTransientN(x,y,z)
+
+#define ZoneScoped
+#define ZoneScopedN(x)
+#define ZoneScopedC(x)
+#define ZoneScopedNC(x,y)
+
+#define ZoneText(x,y)
+#define ZoneTextV(x,y,z)
+#define ZoneTextF(x,...)
+#define ZoneTextVF(x,y,...)
+#define ZoneName(x,y)
+#define ZoneNameV(x,y,z)
+#define ZoneNameF(x,...)
+#define ZoneNameVF(x,y,...)
+#define ZoneColor(x) +#define ZoneColorV(x,y) +#define ZoneValue(x) +#define ZoneValueV(x,y) +#define ZoneIsActive false +#define ZoneIsActiveV(x) false + +#define FrameMark +#define FrameMarkNamed(x) +#define FrameMarkStart(x) +#define FrameMarkEnd(x) + +#define FrameImage(x,y,z,w,a) + +#define TracyLockable( type, varname ) type varname +#define TracyLockableN( type, varname, desc ) type varname +#define TracySharedLockable( type, varname ) type varname +#define TracySharedLockableN( type, varname, desc ) type varname +#define LockableBase( type ) type +#define SharedLockableBase( type ) type +#define LockMark(x) (void)x +#define LockableName(x,y,z) + +#define TracyPlot(x,y) +#define TracyPlotConfig(x,y,z,w,a) + +#define TracyMessage(x,y) +#define TracyMessageL(x) +#define TracyMessageC(x,y,z) +#define TracyMessageLC(x,y) +#define TracyAppInfo(x,y) + +#define TracyAlloc(x,y) +#define TracyFree(x) +#define TracyMemoryDiscard(x) +#define TracySecureAlloc(x,y) +#define TracySecureFree(x) +#define TracySecureMemoryDiscard(x) + +#define TracyAllocN(x,y,z) +#define TracyFreeN(x,y) +#define TracySecureAllocN(x,y,z) +#define TracySecureFreeN(x,y) + +#define ZoneNamedS(x,y,z) +#define ZoneNamedNS(x,y,z,w) +#define ZoneNamedCS(x,y,z,w) +#define ZoneNamedNCS(x,y,z,w,a) + +#define ZoneTransientS(x,y,z) +#define ZoneTransientNS(x,y,z,w) + +#define ZoneScopedS(x) +#define ZoneScopedNS(x,y) +#define ZoneScopedCS(x,y) +#define ZoneScopedNCS(x,y,z) + +#define TracyAllocS(x,y,z) +#define TracyFreeS(x,y) +#define TracyMemoryDiscardS(x,y) +#define TracySecureAllocS(x,y,z) +#define TracySecureFreeS(x,y) +#define TracySecureMemoryDiscardS(x,y) + +#define TracyAllocNS(x,y,z,w) +#define TracyFreeNS(x,y,z) +#define TracySecureAllocNS(x,y,z,w) +#define TracySecureFreeNS(x,y,z) + +#define TracyMessageS(x,y,z) +#define TracyMessageLS(x,y) +#define TracyMessageCS(x,y,z,w) +#define TracyMessageLCS(x,y,z) + +#define TracySourceCallbackRegister(x,y) +#define TracyParameterRegister(x,y) +#define TracyParameterSetup(x,y,z,w) +#define TracyIsConnected false +#define TracyIsStarted false +#define TracySetProgramName(x) + +#define TracyFiberEnter(x) +#define TracyFiberEnterHint(x,y) +#define TracyFiberLeave + +#else + +#include + +#include "../client/TracyLock.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyScoped.hpp" + +#ifndef TRACY_CALLSTACK +#define TRACY_CALLSTACK 0 +#endif + +#define TracyNoop tracy::ProfilerAvailable() + +#define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 
color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) + +#define ZoneTransient( varname, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, TRACY_CALLSTACK, active ) +#define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ) +#define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, TRACY_CALLSTACK, active ) + +#if defined(TRACY_ALLOW_SHADOW_WARNING) + #define SuppressVarShadowWarning(Expr) Expr +#elif defined(__clang__) + #define SuppressVarShadowWarning(Expr) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wshadow\"") \ + Expr; \ + _Pragma("clang diagnostic pop") +#elif defined(__GNUC__) + #define SuppressVarShadowWarning(Expr) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wshadow\"") \ + Expr; \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define SuppressVarShadowWarning(Expr) \ + _Pragma("warning(push)") \ + _Pragma("warning(disable : 4456)") \ + Expr; \ + _Pragma("warning(pop)") +#else + #define SuppressVarShadowWarning(Expr) Expr +#endif + +#define ZoneScoped SuppressVarShadowWarning( ZoneNamed( ___tracy_scoped_zone, true ) ) +#define ZoneScopedN( name ) SuppressVarShadowWarning( ZoneNamedN( ___tracy_scoped_zone, name, true ) ) +#define ZoneScopedC( color ) SuppressVarShadowWarning( ZoneNamedC( ___tracy_scoped_zone, color, true ) ) +#define ZoneScopedNC( name, color ) SuppressVarShadowWarning( ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) ) + +#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ) +#define ZoneTextV( varname, txt, size ) varname.Text( txt, size ) +#define ZoneTextF( fmt, ... ) ___tracy_scoped_zone.TextFmt( fmt, ##__VA_ARGS__ ) +#define ZoneTextVF( varname, fmt, ... ) varname.TextFmt( fmt, ##__VA_ARGS__ ) +#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ) +#define ZoneNameV( varname, txt, size ) varname.Name( txt, size ) +#define ZoneNameF( fmt, ... ) ___tracy_scoped_zone.NameFmt( fmt, ##__VA_ARGS__ ) +#define ZoneNameVF( varname, fmt, ... 
) varname.NameFmt( fmt, ##__VA_ARGS__ ) +#define ZoneColor( color ) ___tracy_scoped_zone.Color( color ) +#define ZoneColorV( varname, color ) varname.Color( color ) +#define ZoneValue( value ) ___tracy_scoped_zone.Value( value ) +#define ZoneValueV( varname, value ) varname.Value( value ) +#define ZoneIsActive ___tracy_scoped_zone.IsActive() +#define ZoneIsActiveV( varname ) varname.IsActive() + +#define FrameMark tracy::Profiler::SendFrameMark( nullptr ) +#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ) +#define FrameMarkStart( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ) +#define FrameMarkEnd( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ) + +#define FrameImage( image, width, height, offset, flip ) tracy::Profiler::SendFrameImage( image, width, height, offset, flip ) + +#define TracyLockable( type, varname ) tracy::Lockable varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, TracyFile, TracyLine, 0 }; return &srcloc; }() } +#define TracyLockableN( type, varname, desc ) tracy::Lockable varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, TracyFile, TracyLine, 0 }; return &srcloc; }() } +#define TracySharedLockable( type, varname ) tracy::SharedLockable varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, TracyFile, TracyLine, 0 }; return &srcloc; }() } +#define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, TracyFile, TracyLine, 0 }; return &srcloc; }() } +#define LockableBase( type ) tracy::Lockable +#define SharedLockableBase( type ) tracy::SharedLockable +#define LockMark( varname ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_lock_location_,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; varname.Mark( &TracyConcat(__tracy_lock_location_,TracyLine) ) +#define LockableName( varname, txt, size ) varname.CustomName( txt, size ) + +#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ) +#define TracyPlotConfig( name, type, step, fill, color ) tracy::Profiler::ConfigurePlot( name, type, step, fill, color ) + +#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ) + +#define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ) +#define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ) +#define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ) +#define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ) + +#define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ) +#define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ) +#define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ) +#define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ) + +#define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ) +#define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, 
false, name ) +#define TracyMemoryDiscard( name ) tracy::Profiler::MemDiscardCallstack( name, false, TRACY_CALLSTACK ) +#define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ) +#define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ) +#define TracySecureMemoryDiscard( name ) tracy::Profiler::MemDiscardCallstack( name, true, TRACY_CALLSTACK ) + +#define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +#define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +#define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +#define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) + +#define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, depth, active ) +#define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ) + +#define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) +#define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) +#define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) +#define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) + +#define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ) +#define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ) +#define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ) +#define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ) + +#define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ) +#define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ) +#define TracyMemoryDiscardS( name, depth ) tracy::Profiler::MemDiscardCallstack( name, false, depth ) +#define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ) +#define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ) +#define TracySecureMemoryDiscardS( name, depth ) tracy::Profiler::MemDiscardCallstack( name, true, depth ) + +#define TracyMessageS( txt, size, 
depth ) tracy::Profiler::Message( txt, size, depth ) +#define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ) +#define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ) +#define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ) + +#define TracySourceCallbackRegister( cb, data ) tracy::Profiler::SourceCallbackRegister( cb, data ) +#define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) +#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ) +#define TracyIsConnected tracy::GetProfiler().IsConnected() +#define TracySetProgramName( name ) tracy::GetProfiler().SetProgramName( name ); + +#ifdef TRACY_FIBERS +# define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber, 0 ) +# define TracyFiberEnterHint( fiber, groupHint ) tracy::Profiler::EnterFiber( fiber, groupHint ) +# define TracyFiberLeave tracy::Profiler::LeaveFiber() +#endif + +#endif + +#endif diff --git a/tracy/TracyC.h b/tracy/TracyC.h new file mode 100644 index 0000000..e77c01f --- /dev/null +++ b/tracy/TracyC.h @@ -0,0 +1,393 @@ +#ifndef __TRACYC_HPP__ +#define __TRACYC_HPP__ + +#include +#include + +#include "../common/TracyApi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum TracyPlotFormatEnum +{ + TracyPlotFormatNumber, + TracyPlotFormatMemory, + TracyPlotFormatPercentage, + TracyPlotFormatWatt +}; + +TRACY_API void ___tracy_set_thread_name( const char* name ); + +#define TracyCSetThreadName( name ) ___tracy_set_thread_name( name ); + +#ifndef TracyFunction +# define TracyFunction __FUNCTION__ +#endif + +#ifndef TracyFile +# define TracyFile __FILE__ +#endif + +#ifndef TracyLine +# define TracyLine __LINE__ +#endif + +#ifndef TRACY_ENABLE + +typedef const void* TracyCZoneCtx; + +typedef const void* TracyCLockCtx; + +#define TracyCZone(c,x) +#define TracyCZoneN(c,x,y) +#define TracyCZoneC(c,x,y) +#define TracyCZoneNC(c,x,y,z) +#define TracyCZoneEnd(c) +#define TracyCZoneText(c,x,y) +#define TracyCZoneName(c,x,y) +#define TracyCZoneColor(c,x) +#define TracyCZoneValue(c,x) + +#define TracyCAlloc(x,y) +#define TracyCFree(x) +#define TracyCMemoryDiscard(x) +#define TracyCSecureAlloc(x,y) +#define TracyCSecureFree(x) +#define TracyCSecureMemoryDiscard(x) + +#define TracyCAllocN(x,y,z) +#define TracyCFreeN(x,y) +#define TracyCSecureAllocN(x,y,z) +#define TracyCSecureFreeN(x,y) + +#define TracyCFrameMark +#define TracyCFrameMarkNamed(x) +#define TracyCFrameMarkStart(x) +#define TracyCFrameMarkEnd(x) +#define TracyCFrameImage(x,y,z,w,a) + +#define TracyCPlot(x,y) +#define TracyCPlotF(x,y) +#define TracyCPlotI(x,y) +#define TracyCPlotConfig(x,y,z,w,a) + +#define TracyCMessage(x,y) +#define TracyCMessageL(x) +#define TracyCMessageC(x,y,z) +#define TracyCMessageLC(x,y) +#define TracyCAppInfo(x,y) + +#define TracyCZoneS(x,y,z) +#define TracyCZoneNS(x,y,z,w) +#define TracyCZoneCS(x,y,z,w) +#define TracyCZoneNCS(x,y,z,w,a) + +#define TracyCAllocS(x,y,z) +#define TracyCFreeS(x,y) +#define TracyCMemoryDiscardS(x,y) +#define TracyCSecureAllocS(x,y,z) +#define TracyCSecureFreeS(x,y) +#define TracyCSecureMemoryDiscardS(x,y) + +#define TracyCAllocNS(x,y,z,w) +#define TracyCFreeNS(x,y,z) +#define TracyCSecureAllocNS(x,y,z,w) +#define TracyCSecureFreeNS(x,y,z) + +#define TracyCMessageS(x,y,z) +#define TracyCMessageLS(x,y) +#define TracyCMessageCS(x,y,z,w) +#define TracyCMessageLCS(x,y,z) + +#define TracyCLockCtx(l) +#define 
TracyCLockAnnounce(l) +#define TracyCLockTerminate(l) +#define TracyCLockBeforeLock(l) +#define TracyCLockAfterLock(l) +#define TracyCLockAfterUnlock(l) +#define TracyCLockAfterTryLock(l,x) +#define TracyCLockMark(l) +#define TracyCLockCustomName(l,x,y) + +#define TracyCIsConnected 0 +#define TracyCIsStarted 0 + +#define TracyCBeginSamplingProfiling() 0 +#define TracyCEndSamplingProfiling() + +#ifdef TRACY_FIBERS +# define TracyCFiberEnter(fiber) +# define TracyCFiberLeave +#endif + +#else + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +struct ___tracy_source_location_data +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +struct ___tracy_c_zone_context +{ + uint32_t id; + int32_t active; +}; + +struct ___tracy_gpu_time_data +{ + int64_t gpuTime; + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_zone_begin_data { + uint64_t srcloc; + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_zone_begin_callstack_data { + uint64_t srcloc; + int32_t depth; + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_zone_end_data { + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_new_context_data { + int64_t gpuTime; + float period; + uint8_t context; + uint8_t flags; + uint8_t type; +}; + +struct ___tracy_gpu_context_name_data { + uint8_t context; + const char* name; + uint16_t len; +}; + +struct ___tracy_gpu_calibration_data { + int64_t gpuTime; + int64_t cpuDelta; + uint8_t context; +}; + +struct ___tracy_gpu_time_sync_data { + int64_t gpuTime; + uint8_t context; +}; + +struct __tracy_lockable_context_data; + +// Some containers don't support storing const types. +// This struct, as visible to user, is immutable, so treat it as if const was declared here. 
+typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx; + +typedef struct __tracy_lockable_context_data* TracyCLockCtx; + +#ifdef TRACY_MANUAL_LIFETIME +TRACY_API void ___tracy_startup_profiler(void); +TRACY_API void ___tracy_shutdown_profiler(void); +TRACY_API int32_t ___tracy_profiler_started(void); + +# define TracyCIsStarted ___tracy_profiler_started() +#else +# define TracyCIsStarted 1 +#endif + +TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, uint32_t color ); +TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, uint32_t color ); + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int32_t active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int32_t depth, int32_t active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int32_t active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int32_t depth, int32_t active ); +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ); +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ); +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ); +TRACY_API void ___tracy_emit_zone_color( TracyCZoneCtx ctx, uint32_t color ); +TRACY_API void ___tracy_emit_zone_value( TracyCZoneCtx ctx, uint64_t value ); + +TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_end( const struct ___tracy_gpu_zone_end_data data ); +TRACY_API void ___tracy_emit_gpu_time( const struct ___tracy_gpu_time_data ); +TRACY_API void ___tracy_emit_gpu_new_context( const struct ___tracy_gpu_new_context_data ); +TRACY_API void ___tracy_emit_gpu_context_name( const struct ___tracy_gpu_context_name_data ); +TRACY_API void ___tracy_emit_gpu_calibration( const struct ___tracy_gpu_calibration_data ); +TRACY_API void ___tracy_emit_gpu_time_sync( const struct ___tracy_gpu_time_sync_data ); + +TRACY_API void ___tracy_emit_gpu_zone_begin_serial( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_end_serial( const struct ___tracy_gpu_zone_end_data data ); +TRACY_API void ___tracy_emit_gpu_time_serial( const struct ___tracy_gpu_time_data ); +TRACY_API void ___tracy_emit_gpu_new_context_serial( const struct ___tracy_gpu_new_context_data ); +TRACY_API void ___tracy_emit_gpu_context_name_serial( const struct ___tracy_gpu_context_name_data ); +TRACY_API void ___tracy_emit_gpu_calibration_serial( const struct ___tracy_gpu_calibration_data ); +TRACY_API void 
___tracy_emit_gpu_time_sync_serial( const struct ___tracy_gpu_time_sync_data ); + +TRACY_API int32_t ___tracy_connected(void); + +#ifndef TRACY_CALLSTACK +#define TRACY_CALLSTACK 0 +#endif + +#define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); + +#define TracyCZoneEnd( ctx ) ___tracy_emit_zone_end( ctx ); + +#define TracyCZoneText( ctx, txt, size ) ___tracy_emit_zone_text( ctx, txt, size ); +#define TracyCZoneName( ctx, txt, size ) ___tracy_emit_zone_name( ctx, txt, size ); +#define TracyCZoneColor( ctx, color ) ___tracy_emit_zone_color( ctx, color ); +#define TracyCZoneValue( ctx, value ) ___tracy_emit_zone_value( ctx, value ); + + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int32_t secure ); +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int32_t depth, int32_t secure ); +TRACY_API void ___tracy_emit_memory_free( const void* ptr, int32_t secure ); +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int32_t depth, int32_t secure ); +TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int32_t depth, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int32_t depth, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_discard( const char* name, int32_t secure ); +TRACY_API void ___tracy_emit_memory_discard_callstack( const char* name, int32_t secure, int32_t depth ); + +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int32_t callstack_depth ); +TRACY_API void ___tracy_emit_messageL( const char* txt, int32_t callstack_depth ); +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int32_t callstack_depth ); +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int32_t callstack_depth ); + +#define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 0 ) +#define TracyCFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, 
TRACY_CALLSTACK, 0 ) +#define TracyCMemoryDiscard( name ) ___tracy_emit_memory_discard_callstack( name, 0, TRACY_CALLSTACK ); +#define TracyCSecureAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 1 ) +#define TracyCSecureFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 1 ) +#define TracyCSecureMemoryDiscard( name ) ___tracy_emit_memory_discard_callstack( name, 1, TRACY_CALLSTACK ); + +#define TracyCAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 0, name ) +#define TracyCFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 0, name ) +#define TracyCSecureAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 1, name ) +#define TracyCSecureFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 1, name ) + +#define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, TRACY_CALLSTACK ); +#define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, TRACY_CALLSTACK ); +#define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, TRACY_CALLSTACK ); +#define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, TRACY_CALLSTACK ); + + +TRACY_API void ___tracy_emit_frame_mark( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ); +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int32_t flip ); + +#define TracyCFrameMark ___tracy_emit_frame_mark( 0 ); +#define TracyCFrameMarkNamed( name ) ___tracy_emit_frame_mark( name ); +#define TracyCFrameMarkStart( name ) ___tracy_emit_frame_mark_start( name ); +#define TracyCFrameMarkEnd( name ) ___tracy_emit_frame_mark_end( name ); +#define TracyCFrameImage( image, width, height, offset, flip ) ___tracy_emit_frame_image( image, width, height, offset, flip ); + + +TRACY_API void ___tracy_emit_plot( const char* name, double val ); +TRACY_API void ___tracy_emit_plot_float( const char* name, float val ); +TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ); +TRACY_API void ___tracy_emit_plot_config( const char* name, int32_t type, int32_t step, int32_t fill, uint32_t color ); +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); + +#define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); +#define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val ); +#define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val ); +#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color ); +#define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size ); + + +#define TracyCZoneS( ctx, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneNS( ctx, name, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneCS( ctx, color, depth, active ) 
static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneNCS( ctx, name, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); + +#define TracyCAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 0 ) +#define TracyCFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 0 ) +#define TracyCMemoryDiscardS( name, depth ) ___tracy_emit_memory_discard_callstack( name, 0, depth ) +#define TracyCSecureAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 1 ) +#define TracyCSecureFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 1 ) +#define TracyCSecureMemoryDiscardS( name, depth ) ___tracy_emit_memory_discard_callstack( name, 1, depth ) + +#define TracyCAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 0, name ) +#define TracyCFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 0, name ) +#define TracyCSecureAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 1, name ) +#define TracyCSecureFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 1, name ) + +#define TracyCMessageS( txt, size, depth ) ___tracy_emit_message( txt, size, depth ); +#define TracyCMessageLS( txt, depth ) ___tracy_emit_messageL( txt, depth ); +#define TracyCMessageCS( txt, size, color, depth ) ___tracy_emit_messageC( txt, size, color, depth ); +#define TracyCMessageLCS( txt, color, depth ) ___tracy_emit_messageLC( txt, color, depth ); + + +TRACY_API struct __tracy_lockable_context_data* ___tracy_announce_lockable_ctx( const struct ___tracy_source_location_data* srcloc ); +TRACY_API void ___tracy_terminate_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API int32_t ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API void ___tracy_after_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API void ___tracy_after_unlock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API void ___tracy_after_try_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata, int32_t acquired ); +TRACY_API void ___tracy_mark_lockable_ctx( struct __tracy_lockable_context_data* lockdata, const struct ___tracy_source_location_data* srcloc ); +TRACY_API void ___tracy_custom_name_lockable_ctx( struct __tracy_lockable_context_data* lockdata, const char* name, size_t nameSz ); + +#define TracyCLockAnnounce( lock ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; lock = ___tracy_announce_lockable_ctx( &TracyConcat(__tracy_source_location,TracyLine) ); +#define TracyCLockTerminate( lock ) ___tracy_terminate_lockable_ctx( lock ); +#define TracyCLockBeforeLock( lock ) ___tracy_before_lock_lockable_ctx( lock ); +#define TracyCLockAfterLock( lock ) ___tracy_after_lock_lockable_ctx( lock ); +#define TracyCLockAfterUnlock( 
lock ) ___tracy_after_unlock_lockable_ctx( lock ); +#define TracyCLockAfterTryLock( lock, acquired ) ___tracy_after_try_lock_lockable_ctx( lock, acquired ); +#define TracyCLockMark( lock ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; ___tracy_mark_lockable_ctx( lock, &TracyConcat(__tracy_source_location,TracyLine) ); +#define TracyCLockCustomName( lock, name, nameSz ) ___tracy_custom_name_lockable_ctx( lock, name, nameSz ); + +#define TracyCIsConnected ___tracy_connected() + +TRACY_API int ___tracy_begin_sampling_profiler( void ); +TRACY_API void ___tracy_end_sampling_profiler( void ); + +#define TracyCBeginSamplingProfiling() ___tracy_begin_sampling_profiling() +#define TracyCEndSamplingProfiling() ___tracy_end_sampling_profiling() + +#ifdef TRACY_FIBERS +TRACY_API void ___tracy_fiber_enter( const char* fiber ); +TRACY_API void ___tracy_fiber_leave( void ); + +# define TracyCFiberEnter( fiber ) ___tracy_fiber_enter( fiber ); +# define TracyCFiberLeave ___tracy_fiber_leave(); +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tracy/TracyCUDA.hpp b/tracy/TracyCUDA.hpp new file mode 100644 index 0000000..40ff55d --- /dev/null +++ b/tracy/TracyCUDA.hpp @@ -0,0 +1,1325 @@ +#ifndef __TRACYCUDA_HPP__ +#define __TRACYCUDA_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyCUDAContext() nullptr +#define TracyCUDAContextDestroy(ctx) +#define TracyCUDAContextName(ctx, name, size) + +#define TracyCUDAStartProfiling(ctx) +#define TracyCUDAStopProfiling(ctx) + +#define TracyCUDACollect(ctx) + +#else +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _MSC_VER +#include +#endif + +#include + +#ifndef UNREFERENCED +#define UNREFERENCED(x) (void)x +#endif//UNREFERENCED + +#ifndef TRACY_CUDA_CALIBRATED_CONTEXT +#define TRACY_CUDA_CALIBRATED_CONTEXT (1) +#endif//TRACY_CUDA_CALIBRATED_CONTEXT + +#ifndef TRACY_CUDA_ENABLE_COLLECTOR_THREAD +#define TRACY_CUDA_ENABLE_COLLECTOR_THREAD (1) +#endif//TRACY_CUDA_ENABLE_COLLECTOR_THREAD + +#ifndef TRACY_CUDA_ENABLE_CUDA_CALL_STATS +#define TRACY_CUDA_ENABLE_CUDA_CALL_STATS (0) +#endif//TRACY_CUDA_ENABLE_CUDA_CALL_STATS + +namespace { + +// TODO(marcos): wrap these in structs for better type safety +using CUptiTimestamp = uint64_t; +using TracyTimestamp = int64_t; + +struct IncrementalRegression { + using float_t = double; + struct Parameters { + float_t slope, intercept; + }; + + int n = 0; + float_t x_mean = 0; + float_t y_mean = 0; + float_t x_svar = 0; + float_t y_svar = 0; + float_t xy_scov = 0; + + auto parameters() const { + float_t slope = xy_scov / x_svar; + float_t intercept = y_mean - slope * x_mean; + return Parameters{ slope, intercept }; + } + + auto orthogonal() const { + // NOTE(marcos): orthogonal regression is Deming regression with delta = 1 + float_t delta = float_t(1); // delta = 1 -> orthogonal regression + float_t k = y_svar - delta * x_svar; + float_t slope = (k + sqrt(k * k + 4 * delta * xy_scov * xy_scov)) / (2 * xy_scov); + float_t intercept = y_mean - slope * x_mean; + return Parameters{ slope, intercept }; + } + + void addSample(float_t x, float_t y) { + ++n; + float_t x_mean_prev = x_mean; + float_t y_mean_prev = y_mean; + x_mean += (x - x_mean) / n; + y_mean += (y - y_mean) / n; + x_svar += (x - x_mean_prev) * (x - x_mean); + y_svar += (y - y_mean_prev) * (y - y_mean); + xy_scov += (x - x_mean_prev) * (y - y_mean); + } +}; + 
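+// Editorial note on the CUPTI -> Tracy timestamp mapping implemented below (restating
+// what the surrounding code does; no new behavior is assumed): Recalibrate() samples
+// both clocks and feeds each (CUPTI, Tracy) pair into IncrementalRegression, whose
+// addSample() keeps Welford-style running means and (co)variance sums, so every sample
+// is absorbed in O(1) without storing the series. orthogonal() then solves Deming
+// regression with delta = 1,
+//   slope = (k + sqrt(k^2 + 4*s_xy^2)) / (2*s_xy),  where k = s_yy - s_xx,
+// which treats both the CUPTI (x) and Tracy (y) timestamps as noisy, unlike
+// parameters(), which is ordinary least squares and assumes x is exact. The cached
+// slope/intercept pair is what tracyFromCUpti() uses to estimate
+//   y_hat = slope * x + intercept.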
+tracy_force_inline TracyTimestamp tracyGetTimestamp() { + return tracy::Profiler::GetTime(); +} + +auto& getCachedRegressionParameters() { + // WARN(marcos): in theory, these linear regression parameters would be loaded/stored atomically; + // in practice, however, it should not matter so long as the loads/stores are not "sliced" + static IncrementalRegression::Parameters cached; + return cached; +} + +TracyTimestamp tracyFromCUpti(CUptiTimestamp cuptiTime) { + // NOTE(marcos): linear regression estimate + // y_hat = slope * x + intercept | X: CUptiTimestamp, Y: TracyTimestamp + auto [slope, intercept] = getCachedRegressionParameters(); + double y_hat = slope * cuptiTime + intercept; + TracyTimestamp tracyTime = TracyTimestamp(y_hat); + assert(tracyTime >= 0); + return tracyTime; +} + +template +tracy_force_inline void tracyMemWrite(T& where,U what) { + static_assert(std::is_same_v, "tracy::MemWrite: type mismatch."); + tracy::MemWrite(&where, what); +} + +void* tracyMalloc(size_t bytes) { + return tracy::tracy_malloc(bytes); +} + +void tracyFree(void* ptr) { + tracy::tracy_free(ptr); +} + +void tracyZoneBegin(TracyTimestamp time, tracy::SourceLocationData* srcLoc) { + using namespace tracy; + TracyQueuePrepare(QueueType::ZoneBegin); + tracyMemWrite(item->zoneBegin.time, time); + tracyMemWrite(item->zoneBegin.srcloc, (uint64_t)srcLoc); + TracyQueueCommit(zoneBeginThread); +} + +void tracyZoneEnd(TracyTimestamp time) { + using namespace tracy; + TracyQueuePrepare(QueueType::ZoneEnd); + tracyMemWrite(item->zoneEnd.time, time); + TracyQueueCommit(zoneEndThread); +} + +void tracyPlot(const char* name, float value, TracyTimestamp time) { + using namespace tracy; + TracyLfqPrepare(QueueType::PlotDataFloat); + tracyMemWrite(item->plotDataFloat.name, (uint64_t)name); + tracyMemWrite(item->plotDataFloat.time, time); + tracyMemWrite(item->plotDataFloat.val, value); + TracyLfqCommit; +} + +void tracyPlot(const char* name, float value, CUptiTimestamp time) { + tracyPlot(name, value, tracyFromCUpti(time)); +} + +void tracyPlotActivity(const char* name, TracyTimestamp start, TracyTimestamp end, float value = 1.0f, float baseline = 0.0f) { + tracyPlot(name, baseline, start); + tracyPlot(name, value, start + 3); + tracyPlot(name, value, end - 3); + tracyPlot(name, baseline, end); +} + +void tracyPlotActivity(const char* name, CUptiTimestamp start, CUptiTimestamp end, float value = 1.0f, float baseline = 0.0f) { + tracyPlotActivity(name, tracyFromCUpti(start), tracyFromCUpti(end), value, baseline); +} + +void tracyPlotBlip(const char* name, TracyTimestamp time, float value = 1.0f, float baseline = 0.0f) { + tracyPlot(name, baseline, time - 3); + tracyPlot(name, value, time); + tracyPlot(name, baseline, time + 3); +} + +void tracyPlotBlip(const char* name, CUptiTimestamp time, float value = 1.0f, float baseline = 0.0f) { + tracyPlotBlip(name, tracyFromCUpti(time), value, baseline); +} + +void tracyEmitMemAlloc(const char* name, const void* ptr, size_t size, TracyTimestamp time) { + using namespace tracy; + const auto thread = GetThreadHandle(); + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemNamePayload); + tracyMemWrite(item->memName.name, (uint64_t)name); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemAllocNamed); + tracyMemWrite(item->memAlloc.time, time); + tracyMemWrite(item->memAlloc.thread, thread); + tracyMemWrite(item->memAlloc.ptr, (uint64_t)ptr); + + if (compile_time_condition::value) + { + 
memcpy(&item->memAlloc.size, &size, 4); + memset(&item->memAlloc.size + 4, 0, 2); + } + else + { + assert(sizeof(size) == 8); + memcpy(&item->memAlloc.size, &size, 4); + memcpy(((char *)&item->memAlloc.size) + 4, ((char *)&size) + 4, 2); + } + Profiler::QueueSerialFinish(); +} + +void tracyEmitMemFree(const char* name, const void* ptr, TracyTimestamp time) { + using namespace tracy; + const auto thread = GetThreadHandle(); + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemNamePayload); + tracyMemWrite(item->memName.name, (uint64_t)name); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemFreeNamed); + tracyMemWrite(item->memFree.time, time); + tracyMemWrite(item->memFree.thread, thread); + tracyMemWrite(item->memFree.ptr, (uint64_t)ptr); + Profiler::QueueSerialFinish(); +} + +void tracyEmitMemAlloc(const char* name, const void* ptr, size_t size, CUptiTimestamp cuptiTime) { + tracyEmitMemAlloc(name, ptr, size, tracyFromCUpti(cuptiTime)); +} + +void tracyEmitMemFree(const char* name, const void* ptr, CUptiTimestamp cuptiTime) { + tracyEmitMemFree(name, ptr, tracyFromCUpti(cuptiTime)); +} + +void tracyAnnounceGpuTimestamp(TracyTimestamp apiStart, TracyTimestamp apiEnd, + uint16_t queryId, uint8_t gpuContextId, + const tracy::SourceLocationData* sourceLocation, uint32_t threadId) { + using namespace tracy; + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuZoneBeginSerial); + tracyMemWrite(item->gpuZoneBegin.cpuTime, apiStart); + tracyMemWrite(item->gpuZoneBegin.srcloc, (uint64_t)sourceLocation); + tracyMemWrite(item->gpuZoneBegin.thread, threadId); + tracyMemWrite(item->gpuZoneBegin.queryId, uint16_t(queryId+0)); + tracyMemWrite(item->gpuZoneBegin.context, gpuContextId); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuZoneEndSerial); + tracyMemWrite(item->gpuZoneEnd.cpuTime, apiEnd); + tracyMemWrite(item->gpuZoneEnd.thread, threadId); + tracyMemWrite(item->gpuZoneEnd.queryId, uint16_t(queryId+1)); + tracyMemWrite(item->gpuZoneEnd.context, gpuContextId); + Profiler::QueueSerialFinish(); +} + +void tracySubmitGpuTimestamp(CUptiTimestamp gpuStart, CUptiTimestamp gpuEnd, + uint16_t queryId, uint8_t gpuContextId) { + using namespace tracy; + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuTime); + tracyMemWrite(item->gpuTime.gpuTime, (int64_t)gpuStart); + tracyMemWrite(item->gpuTime.queryId, uint16_t(queryId+0)); + tracyMemWrite(item->gpuTime.context, gpuContextId); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuTime); + tracyMemWrite(item->gpuTime.gpuTime, (int64_t)gpuEnd); + tracyMemWrite(item->gpuTime.queryId, uint16_t(queryId+1)); + tracyMemWrite(item->gpuTime.context, gpuContextId); + Profiler::QueueSerialFinish(); +} + +#define CUPTI_API_CALL(call) CUptiCallChecked(call, #call, __FILE__, __LINE__) + +#define DRIVER_API_CALL(call) cudaDriverCallChecked(call, #call, __FILE__, __LINE__) + +CUptiResult CUptiCallChecked(CUptiResult result, const char* call, const char* file, int line) noexcept { + if (result == CUPTI_SUCCESS) + return result; + const char* resultMsg = ""; + CUPTI_API_CALL(cuptiGetResultString(result, &resultMsg)); // maybe not a good idea to recurse here... 
+ fprintf(stderr, "ERROR:\t%s:%d:\n\tfunction '%s' failed with error '%s'.\n", file, line, call, resultMsg); + //assert(result == CUPTI_SUCCESS); + return result; +} + +CUresult cudaDriverCallChecked(CUresult result, const char* call, const char* file, int line) noexcept { + if (result == CUDA_SUCCESS) + return result; + const char* resultMsg = ""; + DRIVER_API_CALL(cuGetErrorString(result, &resultMsg)); // maybe not a good idea to recurse here... + fprintf(stderr, "ERROR:\t%s:%d:\n\tfunction '%s' failed with error '%s'.\n", file, line, call, resultMsg); + //assert(result == CUDA_SUCCESS); + return result; +} + +template +struct ConcurrentHashMap { + static constexpr bool instrument = false; + auto acquire_read_lock() { + if (m.try_lock_shared()) + return std::shared_lock(m, std::adopt_lock); + ZoneNamedC(rwlock, tracy::Color::Tomato, instrument); + return std::shared_lock(m); + } + auto acquire_write_lock() { + if (m.try_lock()) + return std::unique_lock(m, std::adopt_lock); + ZoneNamedC(wxlock, tracy::Color::Tomato, instrument); + return std::unique_lock(m); + } + std::unordered_map mapping; + std::shared_mutex m; + auto& operator[](TKey key) { + { + auto lock = acquire_read_lock(); + auto it = mapping.find(key); + if (it != mapping.end()) { + return it->second; + } + } + return emplace(key, TValue{}).first->second; + } + auto find(TKey key) { + ZoneNamed(find, instrument); + auto lock = acquire_read_lock(); + return mapping.find(key); + } + auto fetch(TKey key, TValue& value) { + ZoneNamed(fetch, instrument); + auto it = mapping.find(key); + if (it != mapping.end()) { + value = it->second; + return true; + } + return false; + } + auto end() { + ZoneNamed(end, instrument); + auto lock = acquire_read_lock(); + return mapping.end(); + } + template + auto emplace(TKey key, Args&&... 
args) { + ZoneNamed(emplace, instrument); + auto lock = acquire_write_lock(); + return mapping.emplace(std::forward(key), std::forward(args)...); + } + auto erase(TKey key) { + ZoneNamed(erase, instrument); + auto lock = acquire_write_lock(); + return mapping.erase(key); + } +}; + +#if TRACY_CUDA_ENABLE_CUDA_CALL_STATS +struct ProfilerStats { + static constexpr bool instrument = false; + + ConcurrentHashMap> apiCallCount; + + void update(CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { + ZoneNamed(update, instrument); + uint32_t key = (domain << 24) | (cbid & 0x00'FFFFFF); + auto it = apiCallCount.find(key); + if (it == apiCallCount.end()) { + it = apiCallCount.emplace(key, 0).first; + } + it->second.fetch_add(1, std::memory_order::memory_order_relaxed); + } +}; +#endif + +// StringTable: string memoization/interning +struct StringTable { + static constexpr bool instrument = false; + + // TODO(marcos): this could be just a "ConcurrentHashSet" + ConcurrentHashMap table; + + ~StringTable() { /* TODO(marcos): free string copy */ } + + std::string_view operator[](std::string_view str) { + ZoneNamedN(lookup, "StringTable::lookup", instrument); + std::string_view memoized; + if (!table.fetch(str, memoized)) { + ZoneNamedN(lookup, "StringTable::insert", instrument); + char* copy = (char*)tracyMalloc(str.size() + 1); + strncpy(copy, str.data(), str.size()); + copy[str.size()] = '\0'; + std::string_view value (copy, str.size()); + auto [it, inserted] = table.emplace(value, value); + if (!inserted) { + // another thread inserted it while we were trying to: cleanup + tracyFree(copy); + } + memoized = it->second; + } + assert(str == memoized); + return memoized; + } +}; + +struct SourceLocationMap { + static constexpr bool instrument = false; + + // NOTE(marcos): the address of an unordered_map value may become invalid + // later on (e.g., during a rehash), so mapping to a pointer is necessary + ConcurrentHashMap locations; + + ~SourceLocationMap() { /* TODO(marcos): free SourceLocationData* entries */ } + + tracy::SourceLocationData* retrieve(std::string_view function) { + ZoneNamed(retrieve, instrument); + tracy::SourceLocationData* pSrcLoc = nullptr; + locations.fetch(function, pSrcLoc); + return pSrcLoc; + } + + tracy::SourceLocationData* add(std::string_view function, std::string_view file, int line, uint32_t color=0) { + ZoneNamed(emplace, instrument); + assert(*function.end() == '\0'); + assert(*file.end() == '\0'); + void* bytes = tracyMalloc(sizeof(tracy::SourceLocationData)); + auto pSrcLoc = new(bytes)tracy::SourceLocationData{ function.data(), TracyFunction, file.data(), (uint32_t)line, color }; + auto [it, inserted] = locations.emplace(function, pSrcLoc); + if (!inserted) { + // another thread inserted it while we were trying to: cleanup + tracyFree(pSrcLoc); // POD: no destructor to call + } + assert(it->second != nullptr); + return it->second; + } +}; + +struct SourceLocationLUT { + static constexpr bool instrument = false; + + ~SourceLocationLUT() { /* no action needed: no dynamic allocation */ } + + tracy::SourceLocationData runtime [CUpti_runtime_api_trace_cbid::CUPTI_RUNTIME_TRACE_CBID_SIZE] = {}; + tracy::SourceLocationData driver [CUpti_driver_api_trace_cbid::CUPTI_DRIVER_TRACE_CBID_SIZE] = {}; + + tracy::SourceLocationData* retrieve(CUpti_CallbackDomain domain, CUpti_CallbackId cbid, CUpti_CallbackData* apiInfo) { + ZoneNamed(retrieve, instrument); + tracy::SourceLocationData* pSrcLoc = nullptr; + switch (domain) { + case CUPTI_CB_DOMAIN_RUNTIME_API : + if ((cbid > 0) && 
(cbid < CUPTI_RUNTIME_TRACE_CBID_SIZE)) { + pSrcLoc = &runtime[cbid]; + } + break; + case CUPTI_CB_DOMAIN_DRIVER_API : + if ((cbid > 0) && (cbid < CUPTI_DRIVER_TRACE_CBID_SIZE)) { + pSrcLoc = &driver[cbid]; + } + break; + default: + break; + } + if (pSrcLoc->name == nullptr) { + const char* function = apiInfo->functionName ? apiInfo->functionName : "cuda???"; + // cuptiGetCallbackName includes the "version suffix" of the function/cbid + //CUPTI_API_CALL(cuptiGetCallbackName(domain, cbid, &function)); + *pSrcLoc = tracy::SourceLocationData{ function, TracyFunction, TracyFile, TracyLine, 0 }; + } + return pSrcLoc; + } +}; + +uint32_t tracyTimelineId(uint32_t contextId, uint32_t streamId) { + // 0xA7C5 = 42,949 => 42,949 * 100,000 = 4,294,900,000 + // 4,294,900,000 + 65,535 = 4,294,965,535 < 4,294,967,295 (max uint32) + assert(contextId <= 0xA7C5); + assert((streamId == CUPTI_INVALID_STREAM_ID) || (streamId < 0xFFFF)); + uint32_t packed = (contextId * 100'000) + (streamId & 0x0000'FFFF); + return packed; +} + +} // unnamed/anonymous namespace + +namespace tracy +{ + class CUDACtx + { + public: + static CUDACtx* Create() { + auto& s = Singleton::Get(); + std::unique_lock lock (s.m); + if (s.ref_count == 0) { + assert(s.ctx == nullptr); + s.ctx = new CUDACtx(s.ctx_id); + s.ref_count += 1; + s.ctx_id = s.ctx->m_tracyGpuContext; + } + return s.ctx; + } + + static void Destroy(CUDACtx* ctx) { + auto& s = Singleton::Get(); + std::unique_lock lock(s.m); + assert(ctx == s.ctx); + s.ref_count -= 1; + if (s.ref_count == 0) { + delete s.ctx; + s.ctx = nullptr; + } + } + + void Collect() + { + ZoneScoped; + CUPTI::FlushActivity(); + } + + void printStats() + { + #if TRACY_CUDA_ENABLE_CUDA_CALL_STATS + fprintf(stdout, "\nCUDA API stats:\n"); + { + struct Stats { CUpti_CallbackDomain domain; CUpti_CallbackId cbid; int count; }; + std::vector sorted; + for (auto&& api : stats.apiCallCount.mapping) { + auto domain = CUpti_CallbackDomain(api.first >> 24); + auto cbid = CUpti_CallbackId(api.first & 0x00'FFFFFF); + int count = api.second; + sorted.emplace_back(Stats{ domain, cbid, count }); + } + std::sort(sorted.begin(), sorted.end(), [](const Stats& x, const Stats& y) { return x.count > y.count; }); + for (auto&& api : sorted) { + const char* function = ""; + CUPTI_API_CALL(cuptiGetCallbackName(api.domain, api.cbid, &function)); + printf("- %s : %d\n", function, api.count); + } + } + #endif + } + + void StartProfiling() + { + ZoneScoped; + CUPTI::BeginInstrumentation(this); + } + + void StopProfiling() + { + ZoneScoped; + CUPTI::EndInstrumentation(); + printStats(); + } + + void Name(const char *name, uint16_t len) + { + auto ptr = (char*)tracyMalloc(len); + memcpy(ptr, name, len); + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuContextName); + tracyMemWrite(item->gpuContextNameFat.context, m_tracyGpuContext); + tracyMemWrite(item->gpuContextNameFat.ptr, (uint64_t)ptr); + tracyMemWrite(item->gpuContextNameFat.size, len); + SubmitQueueItem(item); + } + + tracy_force_inline void SubmitQueueItem(tracy::QueueItem *item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + static void QueryTimestamps(TracyTimestamp& tTracy, CUptiTimestamp& tCUpti) { + TracyTimestamp tTracy1 = tracyGetTimestamp(); + CUPTI_API_CALL(cuptiGetTimestamp(&tCUpti)); + TracyTimestamp tTracy2 = tracyGetTimestamp(); + // NOTE(marcos): giving more weight to 'tTracy2' + tTracy = (3*tTracy1 + 5*tTracy2) / 8; + } + + // NOTE(marcos): recalibration is 
'static' since Tracy and CUPTI timestamps + // are "global" across all contexts; that said, each Tracy GPU context needs + // its own GpuCalibration message, but for now there's just a singleton context. + void Recalibrate() { + ZoneScoped; + // NOTE(marcos): only one thread should do the calibration, but there's + // no good reason to block threads that also trying to do the same + static std::mutex m; + if (!m.try_lock()) + return; + std::unique_lock lock (m, std::adopt_lock); + ZoneNamedNC(zone, "tracy::CUDACtx::Recalibrate[effective]", tracy::Color::Goldenrod, true); + TracyTimestamp tTracy; + CUptiTimestamp tCUpti; + QueryTimestamps(tTracy, tCUpti); + #if TRACY_CUDA_CALIBRATED_CONTEXT + static CUptiTimestamp prevCUptiTime = tCUpti; + int64_t deltaTicksCUpti = tCUpti - prevCUptiTime; + if (deltaTicksCUpti > 0) { + prevCUptiTime = tCUpti; + auto* item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuCalibration); + tracyMemWrite(item->gpuCalibration.gpuTime, (int64_t)tCUpti); + tracyMemWrite(item->gpuCalibration.cpuTime, tTracy); + tracyMemWrite(item->gpuCalibration.cpuDelta, deltaTicksCUpti); + tracyMemWrite(item->gpuCalibration.context, m_tracyGpuContext); + Profiler::QueueSerialFinish(); + } + #endif + // NOTE(marcos): update linear regression incrementally, which will refine + // the estimation of Tracy timestamps (Y) from CUpti timestamps (X) + static IncrementalRegression model; + model.addSample(double(tCUpti), double(tTracy)); + // NOTE(marcos): using orthogonal regression because the independet variable + // (X: CUpti timestamps) measurements are also imprecise + getCachedRegressionParameters() = model.orthogonal(); + } + + protected: + void EmitGpuZone(TracyTimestamp apiStart, TracyTimestamp apiEnd, + CUptiTimestamp gpuStart, CUptiTimestamp gpuEnd, + const tracy::SourceLocationData* pSrcLoc, + uint32_t cudaContextId, uint32_t cudaStreamId) { + //uint32_t timelineId = tracy::GetThreadHandle(); + uint32_t timelineId = tracyTimelineId(cudaContextId, cudaStreamId); + uint16_t queryId = m_queryIdGen.fetch_add(2); + tracyAnnounceGpuTimestamp(apiStart, apiEnd, queryId, m_tracyGpuContext, pSrcLoc, timelineId); + tracySubmitGpuTimestamp(gpuStart, gpuEnd, queryId, m_tracyGpuContext); + } + + void OnEventsProcessed() { + Recalibrate(); + } + + struct CUPTI { + static void CUPTIAPI OnBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) + { + ZoneScoped; + // TODO(marcos): avoid malloc and instead suballocate from a large circular buffer; + // according to the CUPTI documentation: "To minimize profiling overhead the client + // should return as quickly as possible from these callbacks." + *size = 1 * 1024*1024; // 1MB + *buffer = (uint8_t*)tracyMalloc(*size); + assert(*buffer != nullptr); + FlushActivityAsync(); + } + + static void CUPTIAPI OnBufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, size_t validSize) + { + // CUDA 6.0 onwards: all buffers from this callback are "global" buffers + // (i.e. 
there is no context/stream specific buffer; ctx is always NULL) + ZoneScoped; + tracy::SetThreadName("NVIDIA CUPTI Worker"); + CUptiResult status; + CUpti_Activity* record = nullptr; + while ((status = cuptiActivityGetNextRecord(buffer, validSize, &record)) == CUPTI_SUCCESS) { + DoProcessDeviceEvent(record); + } + if (status != CUPTI_ERROR_MAX_LIMIT_REACHED) { + CUptiCallChecked(status, "cuptiActivityGetNextRecord", TracyFile, TracyLine); + } + size_t dropped = 0; + CUPTI_API_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); + assert(dropped == 0); + tracyFree(buffer); + PersistentState::Get().profilerHost->OnEventsProcessed(); + } + + // correlationID -> [CPU start time, CPU end time, CUPTI start time] + using CorrelationID = uint32_t; + struct APICallInfo { TracyTimestamp start = 0, end = 0; CUptiTimestamp cupti = CUPTI_TIMESTAMP_UNKNOWN; CUDACtx* host = nullptr; }; + + static void CUPTIAPI OnCallbackAPI( + void* userdata, + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const void* cbdata) + { + static constexpr bool instrument = false; + + TracyTimestamp apiCallStartTime = tracyGetTimestamp(); + CUDACtx* profilerHost = (CUDACtx*)userdata; + + switch (domain) { + case CUPTI_CB_DOMAIN_RUNTIME_API: + case CUPTI_CB_DOMAIN_DRIVER_API: + break; + case CUPTI_CB_DOMAIN_RESOURCE: { + // match 'callbackId' with CUpti_CallbackIdResource + // interpret 'cbdata' as CUpti_ResourceData, + // or as CUpti_ModuleResourceData, + // or as CUpti_GraphData, + // or as CUpti_StreamAttrData, + // or as ... (what else?) + return; + } + case CUPTI_CB_DOMAIN_SYNCHRONIZE: { + // match 'callbackId' with CUpti_CallbackIdSync + // interpret 'cbdata' as CUpti_SynchronizeData + return; + } + case CUPTI_CB_DOMAIN_STATE: { + // match 'callbackId' with CUpti_CallbackIdState + // interpret 'cbdata' as CUpti_StateData + return; + } + case CUPTI_CB_DOMAIN_NVTX: { + // match 'callbackId' with CUpti_nvtx_api_trace_cbid + // interpret 'cbdata' as CUpti_NvtxData + return; + } + case CUPTI_CB_DOMAIN_FORCE_INT: + // NOTE(marcos): the "FORCE_INT" values in CUPTI enums exist only to + // force the enum to have a specific representation (signed 32bits) + case CUPTI_CB_DOMAIN_INVALID: + default: + // TODO(marcos): unexpected error! + return; + } + + // if we reached this point, then we are in the (runtime or driver) API domain + CUpti_CallbackData* apiInfo = (CUpti_CallbackData*)cbdata; + + // Emit the Tracy 'ZoneBegin' message upon entering the API call + // TODO(marcos): a RAII object could be useful here... 
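+            // Overview of the two callback sites handled below: on CUPTI_API_ENTER a
+            // CPU-side Tracy zone is begun and, for calls that may generate device work,
+            // the correlation id is recorded together with CPU/CUPTI timestamps; on
+            // CUPTI_API_EXIT that zone is ended. Asynchronous device activity records
+            // are matched back to the recorded entry later, in DoProcessDeviceEvent(),
+            // via the same correlation id.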
+ if (apiInfo->callbackSite == CUPTI_API_ENTER) { + #if TRACY_CUDA_ENABLE_CUDA_CALL_STATS + ctx->stats.update(domain, cbid); + #endif + + auto& cudaCallSourceLocation = PersistentState::Get().cudaCallSourceLocation; + auto pSrcLoc = cudaCallSourceLocation.retrieve(domain, cbid, apiInfo); + + // HACK(marcos): the SourceLocationLUT::retrieve zone (above) should + // not be emitted before its enclosing zone (below) actually begins, + // so we delay the beginning of the enclosing zone to "unstack" them + if (SourceLocationLUT::instrument) + apiCallStartTime = tracyGetTimestamp(); + tracyZoneBegin(apiCallStartTime, pSrcLoc); + } + + if (apiInfo->callbackSite == CUPTI_API_ENTER) { + ZoneNamedN(enter, "tracy::CUDACtx::OnCUptiCallback[enter]", instrument); + // Track API calls that generate device activity: + bool trackDeviceActivity = false; + CUstream hStream = nullptr; + if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) { + #define GET_STREAM_FUNC(Params, field) [](CUpti_CallbackData* api) { return ((Params*)api->functionParams)->field; } + #define NON_STREAM_FUNC() [](CUpti_CallbackData*) { return cudaStream_t(nullptr); } + static std::unordered_map cbidRuntimeTrackers = { + // Runtime: Kernel + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, GET_STREAM_FUNC(cudaLaunchKernel_v7000_params, stream) }, + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000, GET_STREAM_FUNC(cudaLaunchKernel_ptsz_v7000_params, stream) }, + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, GET_STREAM_FUNC(cudaLaunchKernelExC_v11060_params, config->stream) }, + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060, GET_STREAM_FUNC(cudaLaunchKernelExC_ptsz_v11060_params, config->stream) }, + // Runtime: Memory + { CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020, NON_STREAM_FUNC() }, + // Runtime: Memcpy + { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, GET_STREAM_FUNC(cudaMemcpyAsync_v3020_params, stream) }, + // Runtime: Memset + { CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, GET_STREAM_FUNC(cudaMemsetAsync_v3020_params, stream) }, + // Runtime: Synchronization + { CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020, NON_STREAM_FUNC() }, + }; + #undef NON_STREAM_FUNC + #undef GET_STREAM_FUNC + auto it = cbidRuntimeTrackers.find(CUpti_runtime_api_trace_cbid(cbid)); + if (it != cbidRuntimeTrackers.end()) { + trackDeviceActivity = true; + hStream = (CUstream)it->second(apiInfo); + } + } + if (domain == CUPTI_CB_DOMAIN_DRIVER_API) { + #define GET_STREAM_FUNC(Params, field) [](CUpti_CallbackData* api) { return ((Params*)api->functionParams)->field; } + #define NON_STREAM_FUNC() [](CUpti_CallbackData*) { return CUstream(nullptr); } + static std::unordered_map cbidDriverTrackers = { + // Driver: Kernel + { CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, GET_STREAM_FUNC(cuLaunchKernel_params, hStream) }, + { CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz, GET_STREAM_FUNC(cuLaunchKernel_ptsz_params, hStream)} , + { CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx, GET_STREAM_FUNC(cuLaunchKernelEx_params, config->hStream) }, + { 
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz, GET_STREAM_FUNC(cuLaunchKernelEx_params, config->hStream) }, + }; + #undef NON_STREAM_FUNC + #undef GET_STREAM_FUNC + auto it = cbidDriverTrackers.find(CUpti_driver_api_trace_cbid(cbid)); + if (it != cbidDriverTrackers.end()) { + trackDeviceActivity = true; + hStream = it->second(apiInfo); + } + } + if (trackDeviceActivity) { + // NOTE(marcos): we should NOT track if the stream is being captured + CUstreamCaptureStatus status = {}; + DRIVER_API_CALL(cuStreamIsCapturing(hStream, &status)); + trackDeviceActivity = !(status == CU_STREAM_CAPTURE_STATUS_ACTIVE); + } + if (trackDeviceActivity) { + CUptiTimestamp tgpu; + // TODO(marcos): do a "reverse-estimate" to obtain CUpti time from Tracy time instead? + CUPTI_API_CALL(cuptiGetTimestamp(&tgpu)); + auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo; + cudaCallSiteInfo.emplace(apiInfo->correlationId, APICallInfo{ apiCallStartTime, apiCallStartTime, tgpu, profilerHost }); + } + auto& entryFlags = *apiInfo->correlationData; + assert(entryFlags == 0); + entryFlags |= trackDeviceActivity ? 0x8000 : 0; + } + + if (apiInfo->callbackSite == CUPTI_API_EXIT) { + APICallInfo* pApiInterval = [](CUpti_CallbackData* apiInfo) { + ZoneNamedN(exit, "tracy::CUDACtx::OnCUptiCallback[exit]", instrument); + auto entryFlags = *apiInfo->correlationData; + bool trackDeviceActivity = (entryFlags & 0x8000) != 0; + if (trackDeviceActivity) { + auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo; + auto it = cudaCallSiteInfo.find(apiInfo->correlationId); + if (it != cudaCallSiteInfo.end()) { + // WARN(marcos): leaking the address of a hash-map value could spell trouble + return &it->second; + } + } + // NOTE(marcos): this can happen if the GPU activity completes + // before the CUDA function that enqueued it returns (e.g., sync) + static APICallInfo sentinel; + return &sentinel; + }(apiInfo); + pApiInterval->end = tracyGetTimestamp(); + tracyZoneEnd(pApiInterval->end); + } + } + + static bool matchActivityToAPICall(uint32_t correlationId, APICallInfo& apiCallInfo) { + static constexpr bool instrument = false; + ZoneNamed(match, instrument); + auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo; + if (!cudaCallSiteInfo.fetch(correlationId, apiCallInfo)) { + return false; + } + cudaCallSiteInfo.erase(correlationId); + assert(apiCallInfo.host != nullptr); + return true; + } + + static void matchError(uint32_t correlationId, const char* kind) { + char msg [128]; + snprintf(msg, sizeof(msg), "ERROR: device activity '%s' has no matching CUDA API call (id=%u).", kind, correlationId); + TracyMessageC(msg, strlen(msg), tracy::Color::Tomato); + } + + static std::string extractActualName(char** name){ + //If name does not start with number, return empty string + if (!isdigit(**name)) + { + return std::string(); + } + // Assuming name starts with number followed by actual name + std::string actualName; + char* currStr = *name; + int num = 0; + while (*currStr >= '0' && *currStr <= '9') + { + num = num * 10 + (*currStr - '0'); + currStr++; + } + + // Return the string start at currStr ends at num + actualName = std::string(currStr, num); + // check if actualName starts with _GLOBAL__N__ + if (actualName.rfind("_GLOBAL__N__", 0) == 0) + { + // _GLOBAL__N__ with an id stands for anonymous namespace + actualName = std::string("(anonymous_namespace)"); + } + + *name = currStr + num; + return actualName; + } + + static std::string extractActualNameNested(const char* demangledName) + { + 
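+        // Illustrative example (hypothetical input): a mangled name such as
+        // "_ZN3foo6kernelEv" is reduced to "foo::kernel" by the length-prefixed
+        // parsing below; names that do not start with "_Z" are returned unchanged.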
ZoneNamedN(demangle, "demangle_kernel", false); + //If name does not start with _Z, return a new std::string with original name + if (demangledName[0] != '_' || demangledName[1] != 'Z') + { + return std::string(demangledName); + } + std::string actualName; + char* currStr = (char*)demangledName + 2; + + if (*currStr == 'N') + { + currStr++; + // extract actual name from nested name + std::string nestedName = extractActualName(&currStr); + actualName += nestedName; + while (1) + { + //Loop until nested name is empty + nestedName = extractActualName(&currStr); + if (nestedName.empty()) + { + break; + } + actualName += "::" + nestedName; + } + } else + { + actualName = extractActualName(&currStr); + } + return actualName; + } + + static tracy::SourceLocationData* getKernelSourceLocation(const char* kernelName) + { + auto& kernelSrcLoc = PersistentState::Get().kernelSrcLoc; + std::string_view demangledName; + #ifndef _MSC_VER + // TODO(marcos): extractActualNameNested is the main bottleneck right now; + // we need a specialized StringTable mapping from "peristent" kernel names + // (const char*/uintptr_t) to memoized, lazily initialized demangled names + auto& demangledNameTable = PersistentState::Get().demangledNameTable; + std::string demangled = extractActualNameNested(kernelName); + demangledName = demangledNameTable[demangled]; + #else + demangledName = kernelName; + #endif + auto pSrcLoc = kernelSrcLoc.retrieve(demangledName); + if (pSrcLoc == nullptr) { + pSrcLoc = kernelSrcLoc.add(demangledName, TracyFile, TracyLine); + } + return pSrcLoc; + } + + static void DoProcessDeviceEvent(CUpti_Activity *record) + { + static constexpr bool instrument = false; + ZoneNamed(activity, instrument); + + switch (record->kind) + { + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[kernel]", instrument); + CUpti_ActivityKernel9* kernel9 = (CUpti_ActivityKernel9*) record; + APICallInfo apiCall; + if (!matchActivityToAPICall(kernel9->correlationId, apiCall)) { + return matchError(kernel9->correlationId, "KERNEL"); + } + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, kernel9->start, kernel9->end, getKernelSourceLocation(kernel9->name), kernel9->contextId, kernel9->streamId); + auto latency_ms = (kernel9->start - apiCall.cupti) / 1'000'000.0; + tracyPlotBlip("Kernel Latency (ms)", kernel9->start, latency_ms); + break; + } + + case CUPTI_ACTIVITY_KIND_MEMCPY: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[memcpy]", instrument); + CUpti_ActivityMemcpy5* memcpy5 = (CUpti_ActivityMemcpy5*) record; + APICallInfo apiCall; + if (!matchActivityToAPICall(memcpy5->correlationId, apiCall)) { + return matchError(memcpy5->correlationId, "MEMCPY"); + } + static constexpr tracy::SourceLocationData TracyCUPTISrcLocDeviceMemcpy { "CUDA::memcpy", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Blue }; + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, memcpy5->start, memcpy5->end, &TracyCUPTISrcLocDeviceMemcpy, memcpy5->contextId, memcpy5->streamId); + static constexpr const char* graph_name = "CUDA Memory Copy"; + tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)memcpy5->correlationId, memcpy5->bytes, memcpy5->start); + tracyEmitMemFree (graph_name, (void*)(uintptr_t)memcpy5->correlationId, memcpy5->end); + break; + } + + case CUPTI_ACTIVITY_KIND_MEMSET: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[memset]", instrument); + CUpti_ActivityMemset4* memset4 = (CUpti_ActivityMemset4*) record; + APICallInfo apiCall; 
+ if (!matchActivityToAPICall(memset4->correlationId, apiCall)) { + return matchError(memset4->correlationId, "MEMSET"); + } + static constexpr tracy::SourceLocationData TracyCUPTISrcLocDeviceMemset { "CUDA::memset", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Blue }; + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, memset4->start, memset4->end, &TracyCUPTISrcLocDeviceMemset, memset4->contextId, memset4->streamId); + static constexpr const char* graph_name = "CUDA Memory Set"; + tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)memset4->correlationId, memset4->bytes, memset4->start); + tracyEmitMemFree (graph_name, (void*)(uintptr_t)memset4->correlationId, memset4->end); + break; + } + + case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[sync]", instrument); + CUpti_ActivitySynchronization* synchronization = (CUpti_ActivitySynchronization*) record; + APICallInfo apiCall; + if (!matchActivityToAPICall(synchronization->correlationId, apiCall)) { + return matchError(synchronization->correlationId, "SYNCHRONIZATION"); + } + // NOTE(marcos): synchronization can happen at different levels/objects: + // a. on the entire context : cuCtxSynchronize() -> timeline(ctx,0) + // b. on a specific stream : cuStreamSynchronize() -> timeline(ctx,stream) + // c. on a specific event : cuEventSynchronize() -> timeline(ctx,0xffff) + static constexpr tracy::SourceLocationData TracyCUPTISrcLocContextSynchronization { "CUDA::Context::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta }; + auto* pSrcLoc = &TracyCUPTISrcLocContextSynchronization; + uint32_t cudaContextId = synchronization->contextId; + uint32_t cudaStreamId = 0; + if (synchronization->streamId != CUPTI_SYNCHRONIZATION_INVALID_VALUE) { + static constexpr tracy::SourceLocationData TracyCUPTISrcLocStreamSynchronization{ "CUDA::Stream::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta3 }; + pSrcLoc = &TracyCUPTISrcLocStreamSynchronization; + cudaStreamId = synchronization->streamId; + } + if (synchronization->cudaEventId != CUPTI_SYNCHRONIZATION_INVALID_VALUE) { + static constexpr tracy::SourceLocationData TracyCUPTISrcLocEventSynchronization{ "CUDA::Event::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta4 }; + pSrcLoc = &TracyCUPTISrcLocEventSynchronization; + cudaStreamId = 0xFFFFFFFF; + // TODO(marcos): CUpti_ActivitySynchronization2 introduces a new + // field 'cudaEventSyncId' which complements 'cudaEventId' + } + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, synchronization->start, synchronization->end, pSrcLoc, cudaContextId, cudaStreamId); + static constexpr const char* graph_name = "CUDA Synchronization"; + tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)synchronization->correlationId, 1, synchronization->start); + tracyEmitMemFree (graph_name, (void*)(uintptr_t)synchronization->correlationId, synchronization->end); + break; + } + case CUPTI_ACTIVITY_KIND_MEMORY2: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[malloc/free]", instrument); + CUpti_ActivityMemory3* memory3 = (CUpti_ActivityMemory3*)record; + APICallInfo apiCall; + if (!matchActivityToAPICall(memory3->correlationId, apiCall)) { + return matchError(memory3->correlationId, "MEMORY"); + } + static constexpr const char* graph_name = "CUDA Memory Allocation"; + if (memory3->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION){ + auto& memAllocAddress = PersistentState::Get().memAllocAddress; + 
memAllocAddress[memory3->address] = 1; + tracyEmitMemAlloc(graph_name, (void*)memory3->address, memory3->bytes, memory3->timestamp); + } + else if (memory3->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE){ + auto& memAllocAddress = PersistentState::Get().memAllocAddress; + int dontCare; + if (!memAllocAddress.fetch(memory3->address, dontCare)){ + // Note(Frank): This is a hack to handle the case where the memory allocation + // corresponds to the memory release is not found. + // This can happen when the memory is allocated when profiling is not enabled. + matchError(memory3->correlationId, "MEMORY/RELEASE"); + tracyEmitMemAlloc(graph_name, (void*)memory3->address, memory3->bytes, memory3->timestamp); + } else { + memAllocAddress.erase(memory3->address); + } + tracyEmitMemFree(graph_name, (void*)memory3->address, memory3->timestamp); + } + break; + } + case CUPTI_ACTIVITY_KIND_CUDA_EVENT : + { + // NOTE(marcos): a byproduct of CUPTI_ACTIVITY_KIND_SYNCHRONIZATION + // (I think this is related to cudaEvent*() API calls) + CUpti_ActivityCudaEvent2* event = (CUpti_ActivityCudaEvent2*)record; + UNREFERENCED(event); + break; + } + default: + { + char buffer[64]; + snprintf(buffer, sizeof(buffer), "Unknown activity record (kind is %d)", record->kind); + TracyMessageC(buffer, strlen(buffer), tracy::Color::Crimson); + break; + } + } + } + + static constexpr CUpti_CallbackDomain domains[] = { + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_CB_DOMAIN_DRIVER_API, + //CUPTI_CB_DOMAIN_RESOURCE, + //CUPTI_CB_DOMAIN_SYNCHRONIZE, + //CUPTI_CB_DOMAIN_NVTX, + //CUPTI_CB_DOMAIN_STATE + }; + + static constexpr CUpti_ActivityKind activities[] = { + //CUPTI_ACTIVITY_KIND_KERNEL, // mutually exclusive with CONCURRENT_KERNEL + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, + CUPTI_ACTIVITY_KIND_MEMCPY, + CUPTI_ACTIVITY_KIND_MEMSET, + CUPTI_ACTIVITY_KIND_SYNCHRONIZATION, + CUPTI_ACTIVITY_KIND_MEMORY2, + //CUPTI_ACTIVITY_KIND_MEMCPY2, + //CUPTI_ACTIVITY_KIND_OVERHEAD, + //CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API, + //CUPTI_ACTIVITY_KIND_RUNTIME, + //CUPTI_ACTIVITY_KIND_DRIVER, + }; + + static void BeginInstrumentation(CUDACtx* profilerHost) { + auto& currentProfilerHost = PersistentState::Get().profilerHost; + if (currentProfilerHost != nullptr) { + return; + } + currentProfilerHost = profilerHost; + + // NOTE(frank): full-stop synchronization to ensure we only handle + // CUDA API calls and device activities that happens past this point + cudaDeviceSynchronize(); + + auto& subscriber = PersistentState::Get().subscriber; + CUPTI_API_CALL(cuptiSubscribe(&subscriber, CUPTI::OnCallbackAPI, profilerHost)); + CUPTI_API_CALL(cuptiActivityRegisterCallbacks(CUPTI::OnBufferRequested, CUPTI::OnBufferCompleted)); + for (auto domain : domains) { + CUPTI_API_CALL(cuptiEnableDomain(uint32_t(true), subscriber, domain)); + } + for (auto activity : activities) { + CUPTI_API_CALL(cuptiActivityEnable(activity)); + } + + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + auto& collector = PersistentState::Get().collector; + collector.period = 160; + collector.signal.notify_one(); + #endif + } + + static void EndInstrumentation() { + auto& currentProfilerHost = PersistentState::Get().profilerHost; + if (currentProfilerHost == nullptr) { + return; + } + + // NOTE(frank): full-stop synchronization to ensure we catch + // and drain all the activities that has been tracked up to now. 
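+            // Teardown order below: synchronize the device, flush any buffered CUPTI
+            // activity records, then disable the activity kinds and callback domains
+            // before unsubscribing; the optional collector thread is parked by raising
+            // its wake-up period to ~0.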
+ cudaDeviceSynchronize(); + + FlushActivity(); + + auto& subscriber = PersistentState::Get().subscriber; + for (auto activity : activities) { + CUPTI_API_CALL(cuptiActivityDisable(activity)); + } + for (auto domain : domains) { + CUPTI_API_CALL(cuptiEnableDomain(uint32_t(false), subscriber, domain)); + } + // TODO(marcos): is here a counterpart for 'cuptiActivityRegisterCallbacks()'? + CUPTI_API_CALL(cuptiUnsubscribe(subscriber)); + + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + auto& collector = PersistentState::Get().collector; + collector.period = ~uint32_t(0); + collector.signal.notify_one(); + #endif + + currentProfilerHost = nullptr; + } + + static void FlushActivity() + { + // NOTE(marcos): only one thread should do the collection at any given time, + // but there's no reason to block threads that are also trying to do the same + static std::mutex m; + if (!m.try_lock()) + return; + std::unique_lock lock (m, std::adopt_lock); + ZoneNamedNC(zone, "cuptiActivityFlushAll", tracy::Color::Red4, true); + CUPTI_API_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_NONE)); + } + + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + // WARN(marcos): technically, CUPTI already offers async flushing of + // activity records through cuptiActivityFlushPeriod(), but I haven't + // had much luck getting reliable, consistent delivery with it... + struct Collector { + std::atomic running = true; + volatile uint32_t period = ~uint32_t(0); + std::mutex mtx; + std::condition_variable signal; + std::thread thread = std::thread( + [this]() { + tracy::SetThreadName("Tracy CUDA Collector"); + atexit([]() { + auto& collector = CUPTI::PersistentState::Get().collector; + collector.running = false; + collector.signal.notify_one(); + collector.thread.join(); + }); + while (running) { + { + std::unique_lock lock(mtx); + signal.wait_for(lock, std::chrono::milliseconds(period)); + } + FlushActivity(); + } + } + ); + }; + #endif + + static void FlushActivityAsync() + { + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + ZoneScoped; + auto& collector = PersistentState::Get().collector; + collector.signal.notify_one(); + #endif + } + + struct PersistentState { + // NOTE(marcos): these objects must remain in memory past the application + // returning from main() because the Tracy client worker thread may still + // be responding to string/source-location requests from the server + SourceLocationMap kernelSrcLoc; + StringTable demangledNameTable; + SourceLocationLUT cudaCallSourceLocation; + + // NOTE(marcos): these objects do not need to persist, but their relative + // footprint is trivial enough that we don't care if we let them leak + ConcurrentHashMap cudaCallSiteInfo; + ConcurrentHashMap memAllocAddress; + CUpti_SubscriberHandle subscriber = {}; + CUDACtx* profilerHost = nullptr; + + Collector collector; + + static PersistentState& Get() { + static PersistentState& persistent = *(new PersistentState()); + return persistent; + } + }; + + }; + + CUDACtx(uint8_t gpuContextID = 255) + { + ZoneScoped; + + if (gpuContextID != 255) { + m_tracyGpuContext = gpuContextID; + return; + } + + m_tracyGpuContext = GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed); + assert(m_tracyGpuContext != 255); + + TracyTimestamp tTracy; + CUptiTimestamp tCUpti; + QueryTimestamps(tTracy, tCUpti); + + // Announce to Tracy about a new GPU context/timeline: + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuNewContext); + tracyMemWrite(item->gpuNewContext.cpuTime, tTracy); + tracyMemWrite(item->gpuNewContext.gpuTime, 
(int64_t)tCUpti); // TODO: Be more careful about this cast + tracyMemWrite(item->gpuNewContext.thread, (uint32_t)0); + tracyMemWrite(item->gpuNewContext.period, 1.0f); + tracyMemWrite(item->gpuNewContext.type, GpuContextType::CUDA); + tracyMemWrite(item->gpuNewContext.context, m_tracyGpuContext); + #if TRACY_CUDA_CALIBRATED_CONTEXT + tracyMemWrite(item->gpuNewContext.flags, GpuContextCalibration); + #else + tracyMemWrite(item->gpuNewContext.flags, tracy::GpuContextFlags(0)); + #endif + Profiler::QueueSerialFinish(); + + constexpr const char* tracyCtxName = "CUDA GPU/Device Activity"; + this->Name(tracyCtxName, uint16_t(strlen(tracyCtxName))); + + // NOTE(marcos): a few rounds of calibation amorthized over 1 second + // in order to get a meaningful linear regression estimator + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(400)); + Recalibrate(); + } + + ~CUDACtx() + { + ZoneScoped; + } + + struct Singleton { + CUDACtx* ctx = nullptr; + std::mutex m; + int ref_count = 0; + uint8_t ctx_id = 255; + static Singleton& Get() { + static Singleton singleton; + return singleton; + } + }; + + #if TRACY_CUDA_ENABLE_CUDA_CALL_STATS + ProfilerStats stats = {}; + #endif + + uint8_t m_tracyGpuContext = 255; + static constexpr size_t cacheline = 64; + alignas(cacheline) std::atomic m_queryIdGen = 0; + }; + +} + +#define TracyCUDAContext() tracy::CUDACtx::Create() +#define TracyCUDAContextDestroy(ctx) tracy::CUDACtx::Destroy(ctx) +#define TracyCUDAContextName(ctx, name, size) ctx->Name(name, size) + +#define TracyCUDAStartProfiling(ctx) ctx->StartProfiling() +#define TracyCUDAStopProfiling(ctx) ctx->StopProfiling() + +#define TracyCUDACollect(ctx) ctx->Collect() + +#endif + +#endif \ No newline at end of file diff --git a/tracy/TracyD3D11.hpp b/tracy/TracyD3D11.hpp new file mode 100644 index 0000000..acab383 --- /dev/null +++ b/tracy/TracyD3D11.hpp @@ -0,0 +1,466 @@ +#ifndef __TRACYD3D11_HPP__ +#define __TRACYD3D11_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyD3D11Context(device,queue) nullptr +#define TracyD3D11Destroy(ctx) +#define TracyD3D11ContextName(ctx, name, size) + +#define TracyD3D11NewFrame(ctx) + +#define TracyD3D11Zone(ctx, name) +#define TracyD3D11ZoneC(ctx, name, color) +#define TracyD3D11NamedZone(ctx, varname, name, active) +#define TracyD3D11NamedZoneC(ctx, varname, name, color, active) +#define TracyD3D11ZoneTransient(ctx, varname, name, active) + +#define TracyD3D11ZoneS(ctx, name, depth) +#define TracyD3D11ZoneCS(ctx, name, color, depth) +#define TracyD3D11NamedZoneS(ctx, varname, name, depth, active) +#define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active) +#define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) + +#define TracyD3D11Collect(ctx) + +namespace tracy +{ +class D3D11ZoneScope {}; +} + +using TracyD3D11Ctx = void*; + +#else + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyYield.hpp" + +#include + +#define TracyD3D11Panic(msg, ...) 
do { assert(false && "TracyD3D11: " msg); TracyMessageLC("TracyD3D11: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false); + +namespace tracy +{ + +class D3D11Ctx +{ + friend class D3D11ZoneScope; + + static constexpr uint32_t MaxQueries = 64 * 1024; + + enum CollectMode { POLL, BLOCK }; + +public: + D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx ) + { + // TODO: consider calling ID3D11Device::GetImmediateContext() instead of passing it as an argument + m_device = device; + device->AddRef(); + m_immediateDevCtx = devicectx; + devicectx->AddRef(); + + { + D3D11_QUERY_DESC desc = { }; + desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT; + if (FAILED(m_device->CreateQuery(&desc, &m_disjointQuery))) + { + TracyD3D11Panic("unable to create disjoint timestamp query.", return); + } + } + + for (ID3D11Query*& query : m_queries) + { + D3D11_QUERY_DESC desc = { }; + desc.Query = D3D11_QUERY_TIMESTAMP; + if (FAILED(m_device->CreateQuery(&desc, &query))) + { + TracyD3D11Panic("unable to create timestamp query.", return); + } + } + + // Calibrate CPU and GPU timestamps + int64_t tcpu = 0; + int64_t tgpu = 0; + for (int attempts = 0; attempts < 50; attempts++) + { + m_immediateDevCtx->Begin(m_disjointQuery); + m_immediateDevCtx->End(m_queries[0]); + m_immediateDevCtx->End(m_disjointQuery); + + int64_t tcpu0 = Profiler::GetTime(); + WaitForQuery(m_disjointQuery); + // NOTE: one would expect that by waiting for the enclosing disjoint query to finish, + // all timestamp queries within would also be readily available, but that does not + // seem to be the case here... See https://github.com/wolfpld/tracy/issues/947 + WaitForQuery(m_queries[0]); + int64_t tcpu1 = Profiler::GetTime(); + + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { }; + if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), 0) != S_OK) + { + TracyMessageLC("TracyD3D11: unable to query GPU timestamp; retrying...", tracy::Color::Tomato); + continue; + } + + if (disjoint.Disjoint) + continue; + + UINT64 timestamp = 0; + if (m_immediateDevCtx->GetData(m_queries[0], ×tamp, sizeof(timestamp), 0) != S_OK) + continue; // this should never happen (we waited for the query to finish above) + + tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2; + tgpu = timestamp * (1000000000 / disjoint.Frequency); + break; + } + + // ready to roll + m_contextId = GetGpuCtxCounter().fetch_add(1); + m_immediateDevCtx->Begin(m_disjointQuery); + m_previousCheckpoint = m_nextCheckpoint = 0; + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + MemWrite( &item->gpuNewContext.thread, uint32_t(0) ); // #TODO: why not GetThreadHandle()? 
+ MemWrite( &item->gpuNewContext.period, 1.0f ); + MemWrite( &item->gpuNewContext.context, m_contextId); + MemWrite( &item->gpuNewContext.flags, uint8_t(0) ); + MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + Profiler::QueueSerialFinish(); + } + + ~D3D11Ctx() + { + // collect all pending timestamps before destroying everything + do + { + Collect(BLOCK); + } while (m_previousCheckpoint != m_queryCounter); + + for (ID3D11Query* query : m_queries) + { + query->Release(); + } + m_immediateDevCtx->End(m_disjointQuery); + m_disjointQuery->Release(); + m_immediateDevCtx->Release(); + m_device->Release(); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect(CollectMode mode = POLL) + { + ZoneScopedC( Color::Red4 ); + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + m_previousCheckpoint = m_nextCheckpoint = m_queryCounter; + return; + } +#endif + + if (m_previousCheckpoint == m_nextCheckpoint) + { + uintptr_t nextCheckpoint = m_queryCounter; + if (nextCheckpoint == m_nextCheckpoint) + { + return; + } + m_nextCheckpoint = nextCheckpoint; + m_immediateDevCtx->End(m_disjointQuery); + } + + if (mode == CollectMode::BLOCK) + { + WaitForQuery(m_disjointQuery); + } + + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { }; + if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), D3D11_ASYNC_GETDATA_DONOTFLUSH) != S_OK) + { + return; + } + + if (disjoint.Disjoint == TRUE) + { + m_previousCheckpoint = m_nextCheckpoint; + TracyD3D11Panic("disjoint timestamps detected; dropping."); + return; + } + + auto begin = m_previousCheckpoint; + auto end = m_nextCheckpoint; + for (auto i = begin; i != end; ++i) + { + uint32_t k = RingIndex(i); + UINT64 timestamp = 0; + if (m_immediateDevCtx->GetData(m_queries[k], ×tamp, sizeof(timestamp), 0) != S_OK) + { + TracyD3D11Panic("timestamp expected to be ready, but it was not!"); + break; + } + timestamp *= (1000000000ull / disjoint.Frequency); + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast(timestamp)); + MemWrite(&item->gpuTime.queryId, static_cast(k)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + + // disjoint timestamp queries should only be invoked once per frame or less + // https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_query + m_immediateDevCtx->Begin(m_disjointQuery); + m_previousCheckpoint = m_nextCheckpoint; + } + +private: + tracy_force_inline uint32_t RingIndex(uintptr_t index) + { + index %= MaxQueries; + return static_cast(index); + } + + tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end) + { + // wrap-around safe: all unsigned + uintptr_t count = end - begin; + return static_cast(count); + } + + tracy_force_inline uint32_t NextQueryId() + { + auto id = m_queryCounter++; + if (RingCount(m_previousCheckpoint, id) >= MaxQueries) + { + TracyD3D11Panic("too many pending timestamp queries."); + // #TODO: return 
some sentinel value; ideally a "hidden" query index + } + return RingIndex(id); + } + + tracy_force_inline ID3D11Query* GetQueryObjectFromId(uint32_t id) + { + return m_queries[id]; + } + + tracy_force_inline void WaitForQuery(ID3D11Query* query) + { + m_immediateDevCtx->Flush(); + while (m_immediateDevCtx->GetData(query, nullptr, 0, 0) != S_OK) + YieldThread(); // busy-wait :-( attempt to reduce power usage with _mm_pause() & friends... + } + + tracy_force_inline uint8_t GetContextId() const + { + return m_contextId; + } + + ID3D11Device* m_device = nullptr; + ID3D11DeviceContext* m_immediateDevCtx = nullptr; + + ID3D11Query* m_queries[MaxQueries]; + ID3D11Query* m_disjointQuery = nullptr; + + uint8_t m_contextId = 255; // NOTE: apparently, 255 means invalid id; is this documented anywhere? + + uintptr_t m_queryCounter = 0; + + uintptr_t m_previousCheckpoint = 0; + uintptr_t m_nextCheckpoint = 0; +}; + +class D3D11ZoneScope +{ +public: + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool active ) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcloc)); + } + + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int32_t depth, bool active ) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + if( depth > 0 && has_callstack() ) + { + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast(srcloc)); + } + else + { + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcloc)); + } + } + + tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } + + tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool active) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + if ( depth > 0 && has_callstack() ) + { + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + } + else + { + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } + } + + tracy_force_inline ~D3D11ZoneScope() + { + if( !m_active ) return; + + const auto queryId = m_ctx->NextQueryId(); + m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId)); + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() ); + 
Profiler::QueueSerialFinish(); + } + +private: + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, bool active ) +#ifdef TRACY_ON_DEMAND + : m_active( active && GetProfiler().IsConnected() ) +#else + : m_active( active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + } + + void WriteQueueItem(tracy::QueueItem* item, tracy::QueueType queueItemType, uint64_t sourceLocation) + { + const auto queryId = m_ctx->NextQueryId(); + m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId)); + + MemWrite( &item->hdr.type, queueItemType); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, sourceLocation ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, m_ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + + D3D11Ctx* m_ctx; +}; + +static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx ) +{ + auto ctx = (D3D11Ctx*)tracy_malloc( sizeof( D3D11Ctx ) ); + new(ctx) D3D11Ctx( device, devicectx ); + return ctx; +} + +static inline void DestroyD3D11Context( D3D11Ctx* ctx ) +{ + ctx->~D3D11Ctx(); + tracy_free( ctx ); +} +} + +#undef TracyD3D11Panic + +using TracyD3D11Ctx = tracy::D3D11Ctx*; + +#define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx ); +#define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx); +#define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size); + +#define TracyD3D11UnnamedZone ___tracy_gpu_d3d11_zone +#define TracyD3D11SrcLocSymbol TracyConcat(__tracy_gpu_d3d11_source_location,TracyLine) +#define TracyD3D11SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D11SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, TRACY_CALLSTACK, true ) +# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, TRACY_CALLSTACK, true ) +# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active ); +# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active ); +# define TracyD3D11ZoneTransient(ctx, varname, name, active) TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) +#else +# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, TracyD3D11UnnamedZone, name, true ) +# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, TracyD3D11UnnamedZone, name, color, true ) +# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active ); +# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active ); +# define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define 
TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, depth, true ) +# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, depth, true ) +# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active ); +# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active ); +# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; +#else +# define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name ) +# define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color ) +# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active ) +# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active ) +# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D11ZoneTransient(ctx, varname, name, active) +#endif + +#define TracyD3D11Collect( ctx ) ctx->Collect(); + +#endif + +#endif diff --git a/tracy/TracyD3D12.hpp b/tracy/TracyD3D12.hpp new file mode 100644 index 0000000..d36253d --- /dev/null +++ b/tracy/TracyD3D12.hpp @@ -0,0 +1,500 @@ +#ifndef __TRACYD3D12_HPP__ +#define __TRACYD3D12_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyD3D12Context(device, queue) nullptr +#define TracyD3D12Destroy(ctx) +#define TracyD3D12ContextName(ctx, name, size) + +#define TracyD3D12NewFrame(ctx) + +#define TracyD3D12Zone(ctx, cmdList, name) +#define TracyD3D12ZoneC(ctx, cmdList, name, color) +#define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) +#define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) +#define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) + +#define TracyD3D12ZoneS(ctx, cmdList, name, depth) +#define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) +#define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) +#define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) +#define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) + +#define TracyD3D12Collect(ctx) + +namespace tracy +{ + class D3D12ZoneScope {}; +} + +using TracyD3D12Ctx = void*; + +#else + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" + +#include +#include +#include +#include +#include + +#define TracyD3D12Panic(msg, ...) do { assert(false && "TracyD3D12: " msg); TracyMessageLC("TracyD3D12: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false); + +namespace tracy +{ + + struct D3D12QueryPayload + { + uint32_t m_queryIdStart = 0; + uint32_t m_queryCount = 0; + }; + + // Command queue context. + class D3D12QueueCtx + { + friend class D3D12ZoneScope; + + ID3D12Device* m_device = nullptr; + ID3D12CommandQueue* m_queue = nullptr; + uint8_t m_contextId = 255; // TODO: apparently, 255 means "invalid id"; is this documented somewhere? + ID3D12QueryHeap* m_queryHeap = nullptr; + ID3D12Resource* m_readbackBuffer = nullptr; + + // In-progress payload. 
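+        // Each NewFrame() snapshots the number of queries issued since the previous
+        // frame into a D3D12QueryPayload and signals m_payloadFence; Collect() then
+        // reads back only those payloads whose fence value has already completed.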
+ uint32_t m_queryLimit = 0; + std::atomic m_queryCounter = 0; + uint32_t m_previousQueryCounter = 0; + + uint32_t m_activePayload = 0; + ID3D12Fence* m_payloadFence = nullptr; + std::queue m_payloadQueue; + + UINT64 m_prevCalibrationTicksCPU = 0; + + void RecalibrateClocks() + { + UINT64 cpuTimestamp; + UINT64 gpuTimestamp; + if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + TracyD3D12Panic("failed to obtain queue clock calibration counters.", return); + } + + int64_t cpuDeltaTicks = cpuTimestamp - m_prevCalibrationTicksCPU; + if (cpuDeltaTicks > 0) + { + static const int64_t nanosecodsPerTick = int64_t(1000000000) / GetFrequencyQpc(); + int64_t cpuDeltaNS = cpuDeltaTicks * nanosecodsPerTick; + // Save the device cpu timestamp, not the Tracy profiler timestamp: + m_prevCalibrationTicksCPU = cpuTimestamp; + + cpuTimestamp = Profiler::GetTime(); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuCalibration); + MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp); + MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp); + MemWrite(&item->gpuCalibration.cpuDelta, cpuDeltaNS); + MemWrite(&item->gpuCalibration.context, GetId()); + SubmitQueueItem(item); + } + } + + tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + public: + D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) + : m_device(device) + , m_queue(queue) + { + // Verify we support timestamp queries on this queue. + + if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) + { + D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; + + HRESULT hr = device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)); + if (FAILED(hr) || (featureData.CopyQueueTimestampQueriesSupported == FALSE)) + { + TracyD3D12Panic("Platform does not support profiling of copy queues.", return); + } + } + + static constexpr uint32_t MaxQueries = 64 * 1024; // Must be even, because queries are (begin, end) pairs + m_queryLimit = MaxQueries; + + D3D12_QUERY_HEAP_DESC heapDesc{}; + heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP; + heapDesc.Count = m_queryLimit; + heapDesc.NodeMask = 0; // #TODO: Support multiple adapters. + + while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) + { + m_queryLimit /= 2; + heapDesc.Count = m_queryLimit; + } + + // Create a readback buffer, which will be used as a destination for the query data. + + D3D12_RESOURCE_DESC readbackBufferDesc{}; + readbackBufferDesc.Alignment = 0; + readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); + readbackBufferDesc.Height = 1; + readbackBufferDesc.DepthOrArraySize = 1; + readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN; + readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major. 
+ readbackBufferDesc.MipLevels = 1; + readbackBufferDesc.SampleDesc.Count = 1; + readbackBufferDesc.SampleDesc.Quality = 0; + readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; + + D3D12_HEAP_PROPERTIES readbackHeapProps{}; + readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; + readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; + readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; + readbackHeapProps.CreationNodeMask = 0; + readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters. + + if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) + { + TracyD3D12Panic("Failed to create query readback buffer.", return); + } + + if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence)))) + { + TracyD3D12Panic("Failed to create payload fence.", return); + } + + float period = [queue]() + { + uint64_t timestampFrequency; + if (FAILED(queue->GetTimestampFrequency(×tampFrequency))) + { + return 0.0f; + } + return static_cast( 1E+09 / static_cast(timestampFrequency) ); + }(); + + if (period == 0.0f) + { + TracyD3D12Panic("Failed to get timestamp frequency.", return); + } + + uint64_t cpuTimestamp; + uint64_t gpuTimestamp; + if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + TracyD3D12Panic("Failed to get queue clock calibration.", return); + } + + // Save the device cpu timestamp, not the profiler's timestamp. + m_prevCalibrationTicksCPU = cpuTimestamp; + + cpuTimestamp = Profiler::GetTime(); + + // all checked: ready to roll + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); + MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); + MemWrite(&item->gpuNewContext.thread, decltype(item->gpuNewContext.thread)(0)); // #TODO: why 0 instead of GetThreadHandle()? + MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, GetId()); + MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); + SubmitQueueItem(item); + } + + ~D3D12QueueCtx() + { + ZoneScopedC(Color::Red4); + // collect all pending timestamps + while (m_payloadFence->GetCompletedValue() != m_activePayload) + /* busy-wait ... 
*/; + Collect(); + m_payloadFence->Release(); + m_readbackBuffer->Release(); + m_queryHeap->Release(); + } + + + void NewFrame() + { + uint32_t queryCounter = m_queryCounter.exchange(0); + m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter }); + m_previousQueryCounter += queryCounter; + + if (m_previousQueryCounter >= m_queryLimit) + { + m_previousQueryCounter -= m_queryLimit; + } + + m_queue->Signal(m_payloadFence, ++m_activePayload); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, GetId()); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); + SubmitQueueItem(item); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_queryCounter = 0; + + return; + } +#endif + + // Find out what payloads are available. + const auto newestReadyPayload = m_payloadFence->GetCompletedValue(); + const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload); + + if (!payloadCount) + { + return; // No payloads are available yet, exit out. + } + + D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; + + // Map the readback buffer so we can fetch the query data from the GPU. + void* readbackBufferMapping = nullptr; + + if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping))) + { + TracyD3D12Panic("Failed to map readback buffer.", return); + } + + auto* timestampData = static_cast(readbackBufferMapping); + + for (uint32_t i = 0; i < payloadCount; ++i) + { + const auto& payload = m_payloadQueue.front(); + + for (uint32_t j = 0; j < payload.m_queryCount; ++j) + { + const auto counter = (payload.m_queryIdStart + j) % m_queryLimit; + const auto timestamp = timestampData[counter]; + const auto queryId = counter; + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.queryId, static_cast(queryId)); + MemWrite(&item->gpuTime.context, GetId()); + + Profiler::QueueSerialFinish(); + } + + m_payloadQueue.pop(); + } + + m_readbackBuffer->Unmap(0, nullptr); + + // Recalibrate to account for drift. + RecalibrateClocks(); + } + + private: + tracy_force_inline uint32_t NextQueryId() + { + uint32_t queryCounter = m_queryCounter.fetch_add(2); + if (queryCounter >= m_queryLimit) + { + TracyD3D12Panic("Submitted too many GPU queries! Consider increasing MaxQueries."); + // #TODO: consider returning an invalid id or sentinel value here + } + + const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit; + + return id; + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + }; + + class D3D12ZoneScope + { + const bool m_active; + D3D12QueueCtx* m_ctx = nullptr; + ID3D12GraphicsCommandList* m_cmdList = nullptr; + uint32_t m_queryId = 0; // Used for tracking in nested zones. 
+ + tracy_force_inline void WriteQueueItem(QueueItem* item, QueueType type, uint64_t srcLocation) + { + MemWrite(&item->hdr.type, type); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcLocation); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, static_cast(m_queryId)); + MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, bool active) +#ifdef TRACY_ON_DEMAND + : m_active(active&& GetProfiler().IsConnected()) +#else + : m_active(active) +#endif + { + if (!m_active) return; + + m_ctx = ctx; + m_cmdList = cmdList; + + m_queryId = m_ctx->NextQueryId(); + m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); + } + + public: + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcLocation)); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int32_t depth, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast(srcLocation)); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int32_t depth, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + } + + tracy_force_inline ~D3D12ZoneScope() + { + if (!m_active) return; + + const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot. 
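+            // NextQueryId() advances the shared counter in steps of two, so each zone
+            // owns a pair of adjacent slots: the begin timestamp was written to
+            // m_queryId in the constructor, the end timestamp goes to m_queryId + 1
+            // here, and ResolveQueryData() below copies both slots into the readback
+            // buffer at offset m_queryId * sizeof(uint64_t).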
+ m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, queryId); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, static_cast(queryId)); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + + m_cmdList->ResolveQueryData(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer, m_queryId * sizeof(uint64_t)); + } + }; + + static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) + { + auto* ctx = static_cast(tracy_malloc(sizeof(D3D12QueueCtx))); + new (ctx) D3D12QueueCtx{ device, queue }; + + return ctx; + } + + static inline void DestroyD3D12Context(D3D12QueueCtx* ctx) + { + ctx->~D3D12QueueCtx(); + tracy_free(ctx); + } + +} + +#undef TracyD3D12Panic + +using TracyD3D12Ctx = tracy::D3D12QueueCtx*; + +#define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); +#define TracyD3D12Destroy(ctx) tracy::DestroyD3D12Context(ctx); +#define TracyD3D12ContextName(ctx, name, size) ctx->Name(name, size); + +#define TracyD3D12NewFrame(ctx) ctx->NewFrame(); + +#define TracyD3D12UnnamedZone ___tracy_gpu_d3d12_zone +#define TracyD3D12SrcLocSymbol TracyConcat(__tracy_d3d12_source_location,TracyLine) +#define TracyD3D12SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D12SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, TRACY_CALLSTACK, true) +# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, TRACY_CALLSTACK, true) +# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active }; +# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active }; +# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) +#else +# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, TracyD3D12UnnamedZone, cmdList, name, true) +# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, TracyD3D12UnnamedZone, cmdList, name, color, true) +# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active }; +# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active }; +# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active }; +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, depth, true) +# define 
TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, depth, true) +# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active }; +# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active }; +# define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active }; +#else +# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name) +# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12Zone(ctx, cmdList, name, color) +# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12NamedZone(ctx, varname, cmdList, name, active) +# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) +# define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) +#endif + +#define TracyD3D12Collect(ctx) ctx->Collect(); + +#endif + +#endif diff --git a/tracy/TracyLua.hpp b/tracy/TracyLua.hpp new file mode 100644 index 0000000..f0c5c40 --- /dev/null +++ b/tracy/TracyLua.hpp @@ -0,0 +1,486 @@ +#ifndef __TRACYLUA_HPP__ +#define __TRACYLUA_HPP__ + +// Include this file after you include lua headers. + +#ifndef TRACY_ENABLE + +#include + +namespace tracy +{ + +namespace detail +{ +static inline int noop( lua_State* L ) { return 0; } +} + +static inline void LuaRegister( lua_State* L ) +{ + lua_newtable( L ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBegin" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBeginN" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBeginS" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBeginNS" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneEnd" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneText" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneName" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "Message" ); + lua_setglobal( L, "tracy" ); +} + +static inline char* FindEnd( char* ptr ) +{ + unsigned int cnt = 1; + while( cnt != 0 ) + { + if( *ptr == '(' ) cnt++; + else if( *ptr == ')' ) cnt--; + ptr++; + } + return ptr; +} + +static inline void LuaRemove( char* script ) +{ + while( *script ) + { + if( strncmp( script, "tracy.", 6 ) == 0 ) + { + if( strncmp( script + 6, "Zone", 4 ) == 0 ) + { + if( strncmp( script + 10, "End()", 5 ) == 0 ) + { + memset( script, ' ', 15 ); + script += 15; + } + else if( strncmp( script + 10, "Begin()", 7 ) == 0 ) + { + memset( script, ' ', 17 ); + script += 17; + } + else if( strncmp( script + 10, "Text(", 5 ) == 0 ) + { + auto end = FindEnd( script + 15 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "Name(", 5 ) == 0 ) + { + auto end = FindEnd( script + 15 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "BeginN(", 7 ) == 0 ) + { + auto end = FindEnd( 
script + 17 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "BeginS(", 7 ) == 0 ) + { + auto end = FindEnd( script + 17 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "BeginNS(", 8 ) == 0 ) + { + auto end = FindEnd( script + 18 ); + memset( script, ' ', end - script ); + script = end; + } + else + { + script += 10; + } + } + else if( strncmp( script + 6, "Message(", 8 ) == 0 ) + { + auto end = FindEnd( script + 14 ); + memset( script, ' ', end - script ); + script = end; + } + else + { + script += 6; + } + } + else + { + script++; + } + } +} + +static inline void LuaHook( lua_State* L, lua_Debug* ar ) {} + +} + +#else + +#include +#include + +#include "../common/TracyColor.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyForceInline.hpp" +#include "../common/TracySystem.hpp" +#include "../client/TracyProfiler.hpp" + +namespace tracy +{ + +#ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState(); +#endif + +namespace detail +{ + +static inline void LuaShortenSrc( char* dst, const char* src ) +{ + size_t l = std::min( (size_t)255, strlen( src ) ); + memcpy( dst, src, l ); + dst[l] = 0; +} + +#ifdef TRACY_HAS_CALLSTACK +static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth ) +{ + assert( depth <= 64 ); + lua_Debug dbg[64]; + const char* func[64]; + uint32_t fsz[64]; + uint32_t ssz[64]; + + uint8_t cnt; + uint16_t spaceNeeded = sizeof( cnt ); + for( cnt=0; cnt::max)() ); + memcpy( dst, fsz+i, 2 ); dst += 2; + memcpy( dst, func[i], fsz[i] ); dst += fsz[i]; + assert( ssz[i] <= (std::numeric_limits::max)() ); + memcpy( dst, ssz+i, 2 ); dst += 2; + memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i]; + } + assert( dst - ptr == spaceNeeded + 2 ); + + TracyQueuePrepare( QueueType::CallstackAlloc ); + MemWrite( &item->callstackAllocFat.ptr, (uint64_t)ptr ); + MemWrite( &item->callstackAllocFat.nativePtr, (uint64_t)Callstack( depth ) ); + TracyQueueCommit( callstackAllocFatThread ); +} + +static inline int LuaZoneBeginS( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + +#ifdef TRACY_CALLSTACK + const uint32_t depth = TRACY_CALLSTACK; +#else + const auto depth = uint32_t( lua_tointeger( L, 1 ) ); +#endif + SendLuaCallstack( L, depth ); + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? 
dbg.name : dbg.short_src ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + + return 0; +} + +static inline int LuaZoneBeginNS( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + +#ifdef TRACY_CALLSTACK + const uint32_t depth = TRACY_CALLSTACK; +#else + const auto depth = uint32_t( lua_tointeger( L, 2 ) ); +#endif + SendLuaCallstack( L, depth ); + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + size_t nsz; + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto name = lua_tolstring( L, 1, &nsz ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? dbg.name : dbg.short_src, name, nsz ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + + return 0; +} +#endif + +static inline int LuaZoneBegin( lua_State* L ) +{ +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK + return LuaZoneBeginS( L ); +#else +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? dbg.name : dbg.short_src ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + return 0; +#endif +} + +static inline int LuaZoneBeginN( lua_State* L ) +{ +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK + return LuaZoneBeginNS( L ); +#else +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + size_t nsz; + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto name = lua_tolstring( L, 1, &nsz ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? 
dbg.name : dbg.short_src, name, nsz ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + return 0; +#endif +} + +static inline int LuaZoneEnd( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + assert( GetLuaZoneState().counter != 0 ); + GetLuaZoneState().counter--; + if( !GetLuaZoneState().active ) return 0; + if( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return 0; + } +#endif + + TracyQueuePrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + TracyQueueCommit( zoneEndThread ); + return 0; +} + +static inline int LuaZoneText( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetLuaZoneState().active ) return 0; + if( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return 0; + } +#endif + + auto txt = lua_tostring( L, 1 ); + const auto size = strlen( txt ); + assert( size < (std::numeric_limits::max)() ); + + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + + TracyQueuePrepare( QueueType::ZoneText ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommit( zoneTextFatThread ); + return 0; +} + +static inline int LuaZoneName( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetLuaZoneState().active ) return 0; + if( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return 0; + } +#endif + + auto txt = lua_tostring( L, 1 ); + const auto size = strlen( txt ); + assert( size < (std::numeric_limits::max)() ); + + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + + TracyQueuePrepare( QueueType::ZoneName ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommit( zoneTextFatThread ); + return 0; +} + +static inline int LuaMessage( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return 0; +#endif + + auto txt = lua_tostring( L, 1 ); + const auto size = strlen( txt ); + assert( size < (std::numeric_limits::max)() ); + + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + + TracyQueuePrepare( QueueType::Message ); + MemWrite( &item->messageFat.time, Profiler::GetTime() ); + MemWrite( &item->messageFat.text, (uint64_t)ptr ); + MemWrite( &item->messageFat.size, (uint16_t)size ); + TracyQueueCommit( messageFatThread ); + return 0; +} + +} + +static inline void LuaRegister( lua_State* L ) +{ + lua_newtable( L ); + lua_pushcfunction( L, detail::LuaZoneBegin ); + lua_setfield( L, -2, "ZoneBegin" ); + lua_pushcfunction( L, detail::LuaZoneBeginN ); + lua_setfield( L, -2, "ZoneBeginN" ); +#ifdef TRACY_HAS_CALLSTACK + lua_pushcfunction( L, detail::LuaZoneBeginS ); + lua_setfield( L, -2, "ZoneBeginS" ); + lua_pushcfunction( L, detail::LuaZoneBeginNS ); + lua_setfield( L, -2, "ZoneBeginNS" ); +#else + lua_pushcfunction( L, detail::LuaZoneBegin ); + lua_setfield( L, -2, "ZoneBeginS" ); + lua_pushcfunction( L, detail::LuaZoneBeginN ); + lua_setfield( L, -2, "ZoneBeginNS" ); +#endif + lua_pushcfunction( L, detail::LuaZoneEnd ); + lua_setfield( L, -2, "ZoneEnd" ); + lua_pushcfunction( L, detail::LuaZoneText ); + lua_setfield( L, -2, "ZoneText" ); + lua_pushcfunction( L, detail::LuaZoneName ); + lua_setfield( L, -2, "ZoneName" ); + lua_pushcfunction( L, detail::LuaMessage ); + lua_setfield( L, -2, "Message" ); + lua_setglobal( L, 
"tracy" ); +} + +static inline void LuaRemove( char* script ) {} + +static inline void LuaHook( lua_State* L, lua_Debug* ar ) +{ + if ( ar->event == LUA_HOOKCALL ) + { +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if ( zoneCnt != 0 && !GetLuaZoneState().active ) return; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if ( !GetLuaZoneState().active ) return; +#endif + lua_getinfo( L, "Snl", ar ); + + char src[256]; + detail::LuaShortenSrc( src, ar->short_src ); + + const auto srcloc = Profiler::AllocSourceLocation( ar->currentline, src, ar->name ? ar->name : ar->short_src ); + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + } + else if (ar->event == LUA_HOOKRET) { +#ifdef TRACY_ON_DEMAND + assert( GetLuaZoneState().counter != 0 ); + GetLuaZoneState().counter--; + if ( !GetLuaZoneState().active ) return; + if ( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return; + } +#endif + TracyQueuePrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + TracyQueueCommit( zoneEndThread ); + } +} + +} + +#endif + +#endif diff --git a/tracy/TracyMetal.hmm b/tracy/TracyMetal.hmm new file mode 100644 index 0000000..a4b4cb5 --- /dev/null +++ b/tracy/TracyMetal.hmm @@ -0,0 +1,644 @@ +#ifndef __TRACYMETAL_HMM__ +#define __TRACYMETAL_HMM__ + +/* This file implements a Metal API back-end for Tracy (it has only been tested on Apple + Silicon devices, but it should also work on Intel-based Macs and older iOS devices). + The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, + Direct3D and OpenGL. Specifically, TracyMetalZone() must be placed around the site where + a command encoder is created. This is because not all hardware supports timestamps at + command granularity, and can only provide timestamps around an entire command encoder. + This accommodates for all tiers of hardware; in the future, variants of TracyMetalZone() + will be added to support the habitual command-level granularity of Tracy GPU back-ends. + Metal also imposes a few restrictions that make the process of requesting and collecting + queries more complicated in Tracy: + a) timestamp query buffers are limited to 4096 queries (32KB, where each query is 8 bytes) + b) when a timestamp query buffer is created, Metal initializes all timestamps with zeroes, + and there's no way to reset them back to zero after timestamps get resolved; the only + way to clear the timestamps is by allocating a new timestamp query buffer + c) if a command encoder records no commands and its corresponding command buffer ends up + committed to the command queue, Metal will "optimize-away" the encoder along with any + timestamp queries associated with it (the timestamp will remain as zero and will never + get resolved) + Because of the limitations above, two timestamp buffers are managed internally. Once one + of the buffers fills up with requests, the second buffer can start serving new requests. + Once all requests in a buffer get resolved and collected, the entire buffer is discarded + and a new one allocated for future requests. (Proper cycling through a ring buffer would + require bookkeeping and completion handlers to collect only the known complete queries.) + In the current implementation, there is potential for a race condition when the buffer is + discarded and reallocated. 
In practice, the race condition will never materialize so long + as TracyMetalCollect() is called frequently to keep the amount of unresolved queries low. + Finally, there's a timeout mechanism during timestamp collection to detect "empty" command + encoders and ensure progress. +*/ + +#ifndef TRACY_ENABLE + +#define TracyMetalContext(device) nullptr +#define TracyMetalDestroy(ctx) +#define TracyMetalContextName(ctx, name, size) + +#define TracyMetalZone(ctx, encoderDesc, name) +#define TracyMetalZoneC(ctx, encoderDesc, name, color) +#define TracyMetalNamedZone(ctx, varname, encoderDesc, name, active) +#define TracyMetalNamedZoneC(ctx, varname, encoderDesc, name, color, active) + +#define TracyMetalCollect(ctx) + +namespace tracy +{ +class MetalZoneScope {}; +} + +using TracyMetalCtx = void; + +#else + +#if not __has_feature(objc_arc) +#error TracyMetal requires ARC to be enabled. +#endif + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" + +// ok to import if in obj-c code +#import + +#define TRACY_METAL_VA_ARGS(...) , ##__VA_ARGS__ + +#define TracyMetalPanic(ret, msg, ...) do { \ + char buffer [1024]; \ + snprintf(buffer, sizeof(buffer), "TracyMetal: " msg TRACY_METAL_VA_ARGS(__VA_ARGS__)); \ + TracyMessageC(buffer, strlen(buffer), tracy::Color::OrangeRed); \ + fprintf(stderr, "%s\n", buffer); \ + ret; \ + } while(false); + +#ifndef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#define TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT 0.200f +#endif//TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT + +#ifndef TRACY_METAL_DEBUG_MASK +#define TRACY_METAL_DEBUG_MASK (0) +#endif//TRACY_METAL_DEBUG_MASK + +#if TRACY_METAL_DEBUG_MASK + #define TracyMetalDebugMasked(mask, ...) if constexpr (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; } +#else + #define TracyMetalDebugMasked(mask, ...) +#endif + +#if TRACY_METAL_DEBUG_MASK & (1 << 1) + #define TracyMetalDebug_0b00010(...) __VA_ARGS__; +#else + #define TracyMetalDebug_0b00010(...) +#endif + +#if TRACY_METAL_DEBUG_MASK & (1 << 4) + #define TracyMetalDebug_0b10000(...) __VA_ARGS__; +#else + #define TracyMetalDebug_0b10000(...) +#endif + +#ifndef TracyMetalDebugZoneScopeWireTap +#define TracyMetalDebugZoneScopeWireTap +#endif//TracyMetalDebugZoneScopeWireTap + +namespace tracy +{ + +class MetalCtx +{ + friend class MetalZoneScope; + + enum { MaxQueries = 4 * 1024 }; // Metal: between 8 and 32768 _BYTES_... 
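+
+    // Two sample buffers of MaxQueries timestamps each are cycled as described in the header
+    // comment above: NextQuery() selects buffer (id / MaxQueries) % 2, and Collect() discards
+    // and reallocates a buffer once every request in it has been resolved.
+    //
+    // Rough usage sketch (the pass descriptor, command buffer and context names below are
+    // hypothetical):
+    //
+    //   MTLRenderPassDescriptor* passDesc = ...;
+    //   TracyMetalZone(tracyCtx, passDesc, "Draw scene");
+    //   id<MTLRenderCommandEncoder> enc = [cmdBuffer renderCommandEncoderWithDescriptor:passDesc];
+    //   // ... encode draw calls ...
+    //   [enc endEncoding];
+    //   // and, frequently (e.g. once per frame):
+    //   TracyMetalCollect(tracyCtx);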
+ +public: + static MetalCtx* Create(id device) + { + ZoneScopedNC("tracy::MetalCtx::Create", Color::Red4); + auto ctx = static_cast(tracy_malloc(sizeof(MetalCtx))); + new (ctx) MetalCtx(device); + if (ctx->m_contextId == 255) + { + TracyMetalPanic({assert(false);} return nullptr, "ERROR: unable to create context."); + Destroy(ctx); + } + return ctx; + } + + static void Destroy(MetalCtx* ctx) + { + ZoneScopedNC("tracy::MetalCtx::Destroy", Color::Red4); + ctx->~MetalCtx(); + tracy_free(ctx); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); + SubmitQueueItem(item); + } + + bool Collect() + { + ZoneScopedNC("tracy::MetalCtx::Collect", Color::Red4); + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + return true; + } +#endif + + // Only one thread is allowed to collect timestamps at any given time + // but there's no need to block contending threads + if (!m_collectionMutex.try_lock()) + { + return true; + } + + std::unique_lock lock (m_collectionMutex, std::adopt_lock); + + uintptr_t begin = m_previousCheckpoint.load(); + uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; + TracyMetalDebugMasked(1<<3, ZoneValue(begin)); + TracyMetalDebugMasked(1<<3, ZoneValue(latestCheckpoint)); + + uint32_t count = RingCount(begin, latestCheckpoint); + if (count == 0) // no pending timestamp queries + { + //uintptr_t nextCheckpoint = m_queryCounter.load(); + //if (nextCheckpoint != latestCheckpoint) + //{ + // // TODO: signal event / fence now? + //} + return true; + } + + // resolve up until the ring buffer boundary and let a subsequenty call + // to Collect handle the wrap-around + bool reallocateBuffer = false; + if (RingIndex(begin) + count >= RingSize()) + { + count = RingSize() - RingIndex(begin); + reallocateBuffer = true; + } + TracyMetalDebugMasked(1<<3, ZoneValue(count)); + + auto buffer_idx = (begin / MaxQueries) % 2; + auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx]; + + if (count >= RingSize()) + { + TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. 
[%llu, %llu] (%u)", begin, latestCheckpoint, count); + } + + TracyMetalDebugMasked(1<<3, TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count)); + + NSRange range = NSMakeRange(RingIndex(begin), count); + NSData* data = [counterSampleBuffer resolveCounterRange:range]; + NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); + MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); + if (timestamps == nil) + { + TracyMetalPanic(return false, "Collect: unable to resolve timestamps."); + } + + if (numResolvedTimestamps != count) + { + TracyMetalPanic(, "Collect: numResolvedTimestamps != count : %u != %u", (uint32_t)numResolvedTimestamps, count); + } + + int resolved = 0; + for (auto i = 0; i < numResolvedTimestamps; i += 2) + { + TracyMetalDebug_0b10000( ZoneScopedN("tracy::MetalCtx::Collect::[i]") ); + MTLTimestamp t_start = timestamps[i+0].timestamp; + MTLTimestamp t_end = timestamps[i+1].timestamp; + uint32_t k = RingIndex(begin + i); + TracyMetalDebugMasked(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start))); + if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue)) + { + TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); + break; + } + // Metal will initialize timestamp buffer with zeroes; encountering a zero-value + // timestamp means that the timestamp has not been written and resolved yet + if ((t_start == 0) || (t_end == 0)) + { + auto checkTime = std::chrono::high_resolution_clock::now(); + auto requestTime = m_timestampRequestTime[k]; + auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; + TracyMetalDebugMasked(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight)); + const float timeout_ms = TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT * 1000.0f; + if (ms_in_flight < timeout_ms) + break; + TracyMetalDebug_0b10000( ZoneScopedN("tracy::MetalCtx::Collect::Drop") ); + TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); + t_start = m_mostRecentTimestamp + 5; + t_end = t_start + 5; + } + TracyMetalDebugMasked(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone")); + TracyMetalDebugMasked(1<<2, TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone")); + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_start)); + MemWrite(&item->gpuTime.queryId, static_cast(k)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_end)); + MemWrite(&item->gpuTime.queryId, static_cast(k+1)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + m_mostRecentTimestamp = (t_end > m_mostRecentTimestamp) ? 
t_end : m_mostRecentTimestamp; + TracyMetalDebugMasked(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); + resolved += 2; + } + TracyMetalDebugMasked(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load()))); + + m_previousCheckpoint += resolved; + + // Check whether the timestamp buffer has been fully resolved/collected: + // WARN: there's technically a race condition here: NextQuery() may reference the + // buffer that is being released instead of the new one. In practice, this should + // never happen so long as Collect is called frequently enough to prevent pending + // timestamp query requests from piling up too quickly. + if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) + { + m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); + } + + //RecalibrateClocks(); // to account for drift + + return true; + } + +private: + MetalCtx(id device) + : m_device(device) + { + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue)); + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample)); + + if (m_device == nil) + { + TracyMetalPanic({assert(false);} return, "device is nil."); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary]) + { + TracyMetalPanic({assert(false);} return, "ERROR: timestamp sampling at pipeline stage boundary is not supported."); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n")); + } + + m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); + m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries); + + m_timestampRequestTime.resize(MaxQueries); + + MTLTimestamp cpuTimestamp = 0; + MTLTimestamp gpuTimestamp = 0; + [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; + m_mostRecentTimestamp = gpuTimestamp; + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp)); + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp)); + + cpuTimestamp = Profiler::GetTime(); + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp)); + + float period = 1.0f; + + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp)); + MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp)); + MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not GetThreadHandle()? 
+ MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, m_contextId); + //MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); + MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); + SubmitQueueItem(item); + } + + ~MetalCtx() + { + // collect the last remnants of Metal GPU activity... + // TODO: add a timeout to this loop? + while (m_previousCheckpoint.load() != m_queryCounter.load()) + Collect(); + } + + tracy_force_inline void SubmitQueueItem(QueueItem* item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline uint32_t RingIndex(uintptr_t index) + { + index %= MaxQueries; + return static_cast(index); + } + + tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end) + { + // wrap-around safe: all unsigned + uintptr_t count = end - begin; + return static_cast(count); + } + + tracy_force_inline uint32_t RingSize() const + { + return MaxQueries; + } + + struct Query { id buffer; uint32_t idx; }; + + tracy_force_inline Query NextQuery() + { + TracyMetalDebug_0b00010( ZoneScopedNC("Tracy::MetalCtx::NextQuery", tracy::Color::LightCoral) ); + auto id = m_queryCounter.fetch_add(2); + TracyMetalDebug_0b00010( ZoneValue(id) ); + auto count = RingCount(m_previousCheckpoint, id); + if (count >= MaxQueries) + { + // TODO: return a proper (hidden) "sentinel" query + Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 }; + TracyMetalPanic( + return sentinel, + "NextQueryId: FULL! too many pending timestamp queries. Consider calling TracyMetalCollect() more frequently. [%llu, %llu] (%u)", + m_previousCheckpoint.load(), id, count + ); + } + uint32_t buffer_idx = (id / MaxQueries) % 2; + TracyMetalDebug_0b00010( ZoneValue(buffer_idx) ); + auto buffer = m_counterSampleBuffers[buffer_idx]; + if (buffer == nil) + TracyMetalPanic(, "NextQueryId: sample buffer is nil! 
(id=%llu)", id); + uint32_t idx = RingIndex(id); + TracyMetalDebug_0b00010( ZoneValue(idx) ); + TracyMetalDebug_0b00010( TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId") ); + m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); + return Query{ buffer, idx }; + } + + tracy_force_inline uint8_t GetContextId() const + { + return m_contextId; + } + + static id NewTimestampSampleBuffer(id device, size_t count) + { + ZoneScopedN("tracy::MetalCtx::NewTimestampSampleBuffer"); + + id timestampCounterSet = nil; + for (id counterSet in device.counterSets) + { + if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp]) + { + timestampCounterSet = counterSet; + break; + } + } + if (timestampCounterSet == nil) + { + TracyMetalPanic({assert(false);} return nil, "ERROR: timestamp counters are not supported on the platform."); + } + + MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; + sampleDescriptor.counterSet = timestampCounterSet; + sampleDescriptor.sampleCount = MaxQueries; + sampleDescriptor.storageMode = MTLStorageModeShared; + sampleDescriptor.label = @"TracyMetalTimestampPool"; + + NSError* error = nil; + id counterSampleBuffer = [device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; + if (error != nil) + { + //NSLog(@"%@ | %@", error.localizedDescription, error.localizedFailureReason); + TracyMetalPanic({assert(false);} return nil, + "ERROR: unable to create sample buffer for timestamp counters : %s | %s", + [error.localizedDescription cString], [error.localizedFailureReason cString]); + } + + return counterSampleBuffer; + } + + uint8_t m_contextId = 255; + + id m_device = nil; + id m_counterSampleBuffers [2] = {}; + + using atomic_counter = std::atomic; + static_assert(atomic_counter::is_always_lock_free); + atomic_counter m_queryCounter = 0; + + atomic_counter m_previousCheckpoint = 0; + MTLTimestamp m_mostRecentTimestamp = 0; + + std::vector m_timestampRequestTime; + + std::mutex m_collectionMutex; +}; + +class MetalZoneScope +{ +public: + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLComputePassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic({assert(false);} return, "compute pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic({assert(false); }return, "blit pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + tracy_force_inline MetalZoneScope( 
MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic({assert(false);} return, "render pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfVertexSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + /* TODO: implement this constructor interfarce for "command-level" profiling, if the device supports it + tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + m_cmdEncoder = cmdEncoder; + + auto& query = m_query = ctx->NextQueryId(); + + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:query.idx withBarrier:YES]; + + SubmitZoneBeginGpu(ctx, query.idx, srcloc); + } + */ + + tracy_force_inline ~MetalZoneScope() + { + if( !m_active ) return; + + SubmitZoneEndGpu(m_ctx, m_query.idx + 1); + } + + TracyMetalDebugZoneScopeWireTap; + +private: + const bool m_active; + + MetalCtx* m_ctx; + + /* TODO: declare it for "command-level" profiling + id m_cmdEncoder; + */ + + static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc) + { + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + + TracyMetalDebugMasked(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); + } + + static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId) + { + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + + TracyMetalDebugMasked(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); + } + + MetalCtx::Query m_query = {}; +}; + +} + +using TracyMetalCtx = tracy::MetalCtx; + +#define TracyMetalContext(device) tracy::MetalCtx::Create(device) +#define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx) +#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size) + +#define TracyMetalZone( ctx, encoderDesc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, encoderDesc, name, true ) +#define TracyMetalZoneC( ctx, encoderDesc, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, encoderDesc, name, color, true ) +#define TracyMetalNamedZone( 
ctx, varname, encoderDesc, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +#define TracyMetalNamedZoneC( ctx, varname, encoderDesc, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); + +#define TracyMetalCollect( ctx ) ctx->Collect(); + + + +#undef TracyMetalDebug_ZoneScopeWireTap +#undef TracyMetalDebug_0b00010 +#undef TracyMetalDebug_0b10000 +#undef TracyMetalDebugMasked +#undef TRACY_METAL_DEBUG_MASK +#undef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#undef TracyMetalPanic +#undef TRACY_METAL_VA_ARGS + +#endif + +#endif//__TRACYMETAL_HMM__ diff --git a/tracy/TracyOpenCL.hpp b/tracy/TracyOpenCL.hpp new file mode 100644 index 0000000..ede5c46 --- /dev/null +++ b/tracy/TracyOpenCL.hpp @@ -0,0 +1,414 @@ +#ifndef __TRACYOPENCL_HPP__ +#define __TRACYOPENCL_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyCLContext(c, x) nullptr +#define TracyCLDestroy(c) +#define TracyCLContextName(c, x, y) + +#define TracyCLNamedZone(c, x, y, z) +#define TracyCLNamedZoneC(c, x, y, z, w) +#define TracyCLZone(c, x) +#define TracyCLZoneC(c, x, y) +#define TracyCLZoneTransient(c,x,y,z) + +#define TracyCLNamedZoneS(c, x, y, z, w) +#define TracyCLNamedZoneCS(c, x, y, z, w, v) +#define TracyCLZoneS(c, x, y) +#define TracyCLZoneCS(c, x, y, z) +#define TracyCLZoneTransientS(c,x,y,z,w) + +#define TracyCLNamedZoneSetEvent(x, e) +#define TracyCLZoneSetEvent(e) + +#define TracyCLCollect(c) + +namespace tracy +{ + class OpenCLCtxScope {}; +} + +using TracyCLCtx = void*; + +#else + +#include + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyCallstack.hpp" +#include "../client/TracyProfiler.hpp" +#include "../common/TracyAlloc.hpp" + +#define TRACY_CL_TO_STRING_INDIRECT(T) #T +#define TRACY_CL_TO_STRING(T) TRACY_CL_TO_STRING_INDIRECT(T) +#define TRACY_CL_ASSERT(p) if(!(p)) { \ + TracyMessageL( "TRACY_CL_ASSERT failed on " TracyFile ":" TRACY_CL_TO_STRING(TracyLine) ); \ + assert(false && "TRACY_CL_ASSERT failed"); \ +} +#define TRACY_CL_CHECK_ERROR(err) if(err != CL_SUCCESS) { \ + std::ostringstream oss; \ + oss << "TRACY_CL_CHECK_ERROR failed on " << TracyFile << ":" << TracyLine \ + << ": error code " << err; \ + auto msg = oss.str(); \ + TracyMessage(msg.data(), msg.size()); \ + assert(false && "TRACY_CL_CHECK_ERROR failed"); \ +} + +namespace tracy { + + enum class EventPhase : uint8_t + { + Begin, + End + }; + + struct EventInfo + { + cl_event event; + EventPhase phase; + }; + + class OpenCLCtx + { + public: + enum { QueryCount = 64 * 1024 }; + + OpenCLCtx(cl_context context, cl_device_id device) + : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) + , m_head(0) + , m_tail(0) + { + int64_t tcpu, tgpu; + TRACY_CL_ASSERT(m_contextId != 255); + + cl_int err = CL_SUCCESS; + cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + TRACY_CL_CHECK_ERROR(err) + uint32_t dummyValue = 42; + cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err); + TRACY_CL_CHECK_ERROR(err) + cl_event writeBufferEvent; + 
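+            // Calibration: enqueue a dummy buffer write and wait for it, so that a CPU timestamp
+            // taken here (Profiler::GetTime) can be paired with the event's GPU-side
+            // CL_PROFILING_COMMAND_END timestamp; that pair seeds the GpuNewContext message,
+            // which lets Tracy align the CPU and OpenCL clocks.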
TRACY_CL_CHECK_ERROR(clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent)); + TRACY_CL_CHECK_ERROR(clWaitForEvents(1, &writeBufferEvent)); + + tcpu = Profiler::GetTime(); + + cl_int eventStatus; + TRACY_CL_CHECK_ERROR(clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr)); + TRACY_CL_ASSERT(eventStatus == CL_COMPLETE); + TRACY_CL_CHECK_ERROR(clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr)); + TRACY_CL_CHECK_ERROR(clReleaseEvent(writeBufferEvent)); + TRACY_CL_CHECK_ERROR(clReleaseMemObject(dummyBuffer)); + TRACY_CL_CHECK_ERROR(clReleaseCommandQueue(queue)); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, tcpu); + MemWrite(&item->gpuNewContext.gpuTime, tgpu); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + MemWrite(&item->gpuNewContext.period, 1.0f); + MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL); + MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId); + MemWrite(&item->gpuNewContext.flags, (uint8_t)0); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, (uint8_t)m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + + if (m_tail == m_head) return; + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_head = m_tail = 0; + } +#endif + + for (; m_tail != m_head; m_tail = (m_tail + 1) % QueryCount) + { + EventInfo eventInfo = GetQuery(m_tail); + cl_int eventStatus; + cl_int err = clGetEventInfo(eventInfo.event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + if (err != CL_SUCCESS) + { + std::ostringstream oss; + oss << "clGetEventInfo falied with error code " << err << ", on event " << eventInfo.event << ", skipping..."; + auto msg = oss.str(); + TracyMessage(msg.data(), msg.size()); + if (eventInfo.event == nullptr) { + TracyMessageL("A TracyCLZone must be paird with a TracyCLZoneSetEvent, check your code!"); + } + assert(false && "clGetEventInfo failed, maybe a TracyCLZone is not paired with TracyCLZoneSetEvent"); + continue; + } + if (eventStatus != CL_COMPLETE) return; + + cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin) + ? 
CL_PROFILING_COMMAND_START + : CL_PROFILING_COMMAND_END; + + cl_ulong eventTimeStamp = 0; + err = clGetEventProfilingInfo(eventInfo.event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr); + if (err == CL_PROFILING_INFO_NOT_AVAILABLE) + { + TracyMessageL("command queue is not created with CL_QUEUE_PROFILING_ENABLE flag, check your code!"); + assert(false && "command queue is not created with CL_QUEUE_PROFILING_ENABLE flag"); + } + else + TRACY_CL_CHECK_ERROR(err); + + TRACY_CL_ASSERT(eventTimeStamp != 0); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, (int64_t)eventTimeStamp); + MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + + if (eventInfo.phase == EventPhase::End) + { + // Done with the event, so release it + TRACY_CL_CHECK_ERROR(clReleaseEvent(eventInfo.event)); + } + } + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + + tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo) + { + const auto id = m_head; + m_head = (m_head + 1) % QueryCount; + TRACY_CL_ASSERT(m_head != m_tail); + m_query[id] = eventInfo; + return id; + } + + tracy_force_inline EventInfo& GetQuery(unsigned int id) + { + TRACY_CL_ASSERT(id < QueryCount); + return m_query[id]; + } + + private: + + unsigned int m_contextId; + + EventInfo m_query[QueryCount]; + unsigned int m_head; // index at which a new event should be inserted + unsigned int m_tail; // oldest event + + }; + + class OpenCLCtxScope { + public: + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int32_t depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + GetProfiler().SendCallstack(depth); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active && 
GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial ); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcloc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active && GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcloc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void SetEvent(cl_event event) + { + if (!m_active) return; + m_event = event; + TRACY_CL_CHECK_ERROR(clRetainEvent(m_event)); + m_ctx->GetQuery(m_beginQueryId).event = m_event; + } + + tracy_force_inline ~OpenCLCtxScope() + { + if (!m_active) return; + const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + OpenCLCtx* m_ctx; + cl_event m_event; + unsigned int m_beginQueryId; + }; + + static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device) + { + auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx)); + new (ctx) OpenCLCtx(context, device); + return ctx; + } + + static inline void DestroyCLContext(OpenCLCtx* ctx) + { + ctx->~OpenCLCtx(); + tracy_free(ctx); + } + +} // namespace tracy + +using TracyCLCtx = tracy::OpenCLCtx*; + +#define TracyCLContext(ctx, device) tracy::CreateCLContext(ctx, device); +#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx); +#define TracyCLContextName(ctx, name, size) ctx->Name(name, size); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCLNamedZone(ctx, varname, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::OpenCLCtxScope varname(ctx, 
&TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true) +# define TracyCLZoneTransient( ctx, varname, name, active ) tracy::OpenCLCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ); +#else +# define TracyCLNamedZone(ctx, varname, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active); +# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true ) +# define TracyCLZoneTransient( ctx, varname, name, active ) tracy::OpenCLCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), active ); +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active); +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active); +# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true) +# define TracyCLZoneTransientS( ctx, varname, name, depth, active ) tracy::OpenCLCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ); +#else +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active) +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active) +# define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color) +# define TracyCLZoneTransientS( ctx, varname, name, depth, active ) TracyCLZoneTransient( ctx, varname, name, active ) +#endif + +#define 
TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event) +#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event) + +#define TracyCLCollect(ctx) ctx->Collect() + +#endif + +#endif diff --git a/tracy/TracyOpenGL.hpp b/tracy/TracyOpenGL.hpp new file mode 100644 index 0000000..30abd4f --- /dev/null +++ b/tracy/TracyOpenGL.hpp @@ -0,0 +1,325 @@ +#ifndef __TRACYOPENGL_HPP__ +#define __TRACYOPENGL_HPP__ + +#if !defined TRACY_ENABLE || defined __APPLE__ + +#define TracyGpuContext +#define TracyGpuContextName(x,y) +#define TracyGpuNamedZone(x,y,z) +#define TracyGpuNamedZoneC(x,y,z,w) +#define TracyGpuZone(x) +#define TracyGpuZoneC(x,y) +#define TracyGpuZoneTransient(x,y,z) +#define TracyGpuCollect + +#define TracyGpuNamedZoneS(x,y,z,w) +#define TracyGpuNamedZoneCS(x,y,z,w,a) +#define TracyGpuZoneS(x,y) +#define TracyGpuZoneCS(x,y,z) +#define TracyGpuZoneTransientS(x,y,z,w) + +namespace tracy +{ +struct SourceLocationData; +class GpuCtxScope +{ +public: + GpuCtxScope( const SourceLocationData*, bool ) {} + GpuCtxScope( const SourceLocationData*, int32_t, bool ) {} +}; +} + +#else + +#include <atomic> +#include <assert.h> +#include <stdlib.h> + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" + +#if !defined GL_TIMESTAMP && defined GL_TIMESTAMP_EXT +# define GL_TIMESTAMP GL_TIMESTAMP_EXT +# define GL_QUERY_COUNTER_BITS GL_QUERY_COUNTER_BITS_EXT +# define glGetQueryObjectiv glGetQueryObjectivEXT +# define glGetQueryObjectui64v glGetQueryObjectui64vEXT +# define glQueryCounter glQueryCounterEXT +#endif + +#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx; +#define TracyGpuContextName( name, size ) tracy::GetGpuCtx().ptr->Name( name, size ); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyGpuNamedZoneC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyGpuZone( name ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true ) +# define TracyGpuZoneTransient( varname, name, active ) tracy::GpuCtxScope varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ); +#else +# define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyGpuNamedZoneC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; 
tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyGpuZone( name ) TracyGpuNamedZone( ___tracy_gpu_zone, name, true ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneC( ___tracy_gpu_zone, name, color, true ) +# define TracyGpuZoneTransient( varname, name, active ) tracy::GpuCtxScope varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), active ); +#endif +#define TracyGpuCollect tracy::GetGpuCtx().ptr->Collect(); + +#ifdef TRACY_HAS_CALLSTACK +# define TracyGpuNamedZoneS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyGpuNamedZoneCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyGpuZoneS( name, depth ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, depth, true ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, depth, true ) +# define TracyGpuZoneTransientS( varname, name, depth, active ) tracy::GpuCtxScope varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ); +#else +# define TracyGpuNamedZoneS( varname, name, depth, active ) TracyGpuNamedZone( varname, name, active ) +# define TracyGpuNamedZoneCS( varname, name, color, depth, active ) TracyGpuNamedZoneC( varname, name, color, active ) +# define TracyGpuZoneS( name, depth ) TracyGpuZone( name ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuZoneC( name, color ) +# define TracyGpuZoneTransientS( varname, name, depth, active ) TracyGpuZoneTransient( varname, name, active ) +#endif + +namespace tracy +{ + +class GpuCtx +{ + friend class GpuCtxScope; + + enum { QueryCount = 64 * 1024 }; + +public: + GpuCtx() + : m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) + , m_head( 0 ) + , m_tail( 0 ) + { + assert( m_context != 255 ); + + glGenQueries( QueryCount, m_query ); + + int64_t tgpu; + glGetInteger64v( GL_TIMESTAMP, &tgpu ); + int64_t tcpu = Profiler::GetTime(); + + GLint bits; + glGetQueryiv( GL_TIMESTAMP, GL_QUERY_COUNTER_BITS, &bits ); + + const float period = 1.f; + const auto thread = GetThreadHandle(); + TracyLfqPrepare( QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + MemWrite( &item->gpuNewContext.thread, thread ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.flags, uint8_t( 0 ) ); + MemWrite( &item->gpuNewContext.type, GpuContextType::OpenGl ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + TracyLfqPrepare( QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_context ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef 
TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + TracyLfqCommit; + } + + void Collect() + { + ZoneScopedC( Color::Red4 ); + + if( m_tail == m_head ) return; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + m_head = m_tail = 0; + return; + } +#endif + + while( m_tail != m_head ) + { + GLint available; + glGetQueryObjectiv( m_query[m_tail], GL_QUERY_RESULT_AVAILABLE, &available ); + if( !available ) return; + + uint64_t time; + glGetQueryObjectui64v( m_query[m_tail], GL_QUERY_RESULT, &time ); + + TracyLfqPrepare( QueueType::GpuTime ); + MemWrite( &item->gpuTime.gpuTime, (int64_t)time ); + MemWrite( &item->gpuTime.queryId, (uint16_t)m_tail ); + MemWrite( &item->gpuTime.context, m_context ); + TracyLfqCommit; + + m_tail = ( m_tail + 1 ) % QueryCount; + } + } + +private: + tracy_force_inline unsigned int NextQueryId() + { + const auto id = m_head; + m_head = ( m_head + 1 ) % QueryCount; + assert( m_head != m_tail ); + return id; + } + + tracy_force_inline unsigned int TranslateOpenGlQueryId( unsigned int id ) + { + return m_query[id]; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + + unsigned int m_query[QueryCount]; + uint8_t m_context; + + unsigned int m_head; + unsigned int m_tail; +}; + +class GpuCtxScope +{ +public: + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + TracyLfqPrepare( QueueType::GpuZoneBegin ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, int32_t depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + +#ifdef TRACY_FIBERS + TracyLfqPrepare( QueueType::GpuZoneBegin ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); +#else + GetProfiler().SendCallstack( depth ); + TracyLfqPrepare( QueueType::GpuZoneBeginCallstack ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); +#endif + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline GpuCtxScope( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP 
); + + TracyLfqPrepare( QueueType::GpuZoneBeginAllocSrcLoc ); + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline GpuCtxScope( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + +#ifdef TRACY_FIBERS + TracyLfqPrepare( QueueType::GpuZoneBeginAllocSrcLoc ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); +#else + GetProfiler().SendCallstack( depth ); + TracyLfqPrepare( QueueType::GpuZoneBeginAllocSrcLocCallstack ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); +#endif + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline ~GpuCtxScope() + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + TracyLfqPrepare( QueueType::GpuZoneEnd ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, GetGpuCtx().ptr->GetId() ); + TracyLfqCommit; + } + +private: + const bool m_active; +}; + +} + +#endif + +#endif diff --git a/tracy/TracyVulkan.hpp b/tracy/TracyVulkan.hpp new file mode 100644 index 0000000..429f299 --- /dev/null +++ b/tracy/TracyVulkan.hpp @@ -0,0 +1,747 @@ +#ifndef __TRACYVULKAN_HPP__ +#define __TRACYVULKAN_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyVkContext(x,y,z,w) nullptr +#define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr +#if defined VK_EXT_host_query_reset +#define TracyVkContextHostCalibrated(x,y,z,w,a) nullptr +#endif +#define TracyVkDestroy(x) +#define TracyVkContextName(c,x,y) +#define TracyVkNamedZone(c,x,y,z,w) +#define TracyVkNamedZoneC(c,x,y,z,w,a) +#define TracyVkZone(c,x,y) +#define TracyVkZoneC(c,x,y,z) +#define TracyVkZoneTransient(c,x,y,z,w) +#define TracyVkCollect(c,x) +#define TracyVkCollectHost(c) + +#define TracyVkNamedZoneS(c,x,y,z,w,a) +#define TracyVkNamedZoneCS(c,x,y,z,w,v,a) +#define TracyVkZoneS(c,x,y,z) +#define TracyVkZoneCS(c,x,y,z,w) +#define TracyVkZoneTransientS(c,x,y,z,w,a) + +namespace tracy +{ +class VkCtxScope {}; +} + +using TracyVkCtx = void*; + +#else + +#if !defined VK_NULL_HANDLE +# error "You must include Vulkan headers before including TracyVulkan.hpp" +#endif + +#include <assert.h> +#include <stdlib.h> +#include 
"Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" + +#include <atomic> + +namespace tracy +{ + +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define LoadVkDeviceCoreSymbols(Operation) \ + Operation(vkBeginCommandBuffer) \ + Operation(vkCmdResetQueryPool) \ + Operation(vkCmdWriteTimestamp) \ + Operation(vkCreateQueryPool) \ + Operation(vkDestroyQueryPool) \ + Operation(vkEndCommandBuffer) \ + Operation(vkGetQueryPoolResults) \ + Operation(vkQueueSubmit) \ + Operation(vkQueueWaitIdle) \ + Operation(vkResetQueryPool) + +#define LoadVkDeviceExtensionSymbols(Operation) \ + Operation(vkGetCalibratedTimestampsEXT) + +#define LoadVkInstanceExtensionSymbols(Operation) \ + Operation(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) + +#define LoadVkInstanceCoreSymbols(Operation) \ + Operation(vkGetPhysicalDeviceProperties) + +struct VkSymbolTable +{ +#define MAKE_PFN(name) PFN_##name name; + LoadVkDeviceCoreSymbols(MAKE_PFN) + LoadVkDeviceExtensionSymbols(MAKE_PFN) + LoadVkInstanceExtensionSymbols(MAKE_PFN) + LoadVkInstanceCoreSymbols(MAKE_PFN) +#undef MAKE_PFN +}; + +#define VK_FUNCTION_WRAPPER(callSignature) m_symbols.callSignature +#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) m_ctx->m_symbols.callSignature +#else +#define VK_FUNCTION_WRAPPER(callSignature) callSignature +#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) callSignature +#endif + +class VkCtx +{ + friend class VkCtxScope; + + enum { QueryCount = 64 * 1024 }; + +public: +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr, bool calibrated ) +#else + VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT) +#endif + : m_device( device ) + , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) + , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) + , m_head( 0 ) + , m_tail( 0 ) + , m_oldCnt( 0 ) + , m_queryCount( QueryCount ) +#if !defined TRACY_VK_USE_SYMBOL_TABLE + , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT ) +#endif + { + assert( m_context != 255 ); + +#if defined TRACY_VK_USE_SYMBOL_TABLE + PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr); + if ( calibrated ) + { + m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT; + } + +#endif + + if( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) && m_vkGetCalibratedTimestampsEXT ) + { + FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) ); + } + + CreateQueryPool(); + + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &cmdbuf; + + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); + + int64_t tcpu, 
tgpu; + if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT ) + { + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); + + tcpu = Profiler::GetTime(); + VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ) ); + + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); + } + else + { + FindCalibratedTimestampDeviation(); + Calibrate( device, m_prevCalibration, tgpu ); + tcpu = Profiler::GetTime(); + } + + WriteInitialItem( physdev, tcpu, tgpu ); + + m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount ); + } + +#if defined VK_EXT_host_query_reset + /** + * This alternative constructor does not use command buffers and instead uses functionality from + * VK_EXT_host_query_reset (core with 1.2 and non-optional) and VK_EXT_calibrated_timestamps. This requires + * the physical device to have another time domain apart from DEVICE to be calibrateable. + */ +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr ) +#else + VkCtx( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT vkResetQueryPool, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT ) +#endif + : m_device( device ) + , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) + , m_context( GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed) ) + , m_head( 0 ) + , m_tail( 0 ) + , m_oldCnt( 0 ) + , m_queryCount( QueryCount ) +#if !defined TRACY_VK_USE_SYMBOL_TABLE + , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT ) +#endif + { + assert( m_context != 255); + +#if defined TRACY_VK_USE_SYMBOL_TABLE + PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr); + m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT; +#endif + + assert( VK_FUNCTION_WRAPPER( vkResetQueryPool ) != nullptr ); + assert( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) != nullptr ); + assert( VK_FUNCTION_WRAPPER( vkGetCalibratedTimestampsEXT ) != nullptr ); + + FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) ); + + // We require a host time domain to be available to properly calibrate. 
+ FindCalibratedTimestampDeviation(); + int64_t tgpu; + Calibrate( device, m_prevCalibration, tgpu ); + int64_t tcpu = Profiler::GetTime(); + + CreateQueryPool(); + VK_FUNCTION_WRAPPER( vkResetQueryPool( device, m_query, 0, m_queryCount ) ); + + WriteInitialItem( physdev, tcpu, tgpu ); + + // We need the buffer to be twice as large for availability values + size_t resSize = sizeof( int64_t ) * m_queryCount * 2; + m_res = (int64_t*)tracy_malloc( resSize ); + } +#endif + + ~VkCtx() + { + tracy_free( m_res ); + VK_FUNCTION_WRAPPER( vkDestroyQueryPool( m_device, m_query, nullptr ) ); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_context ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect( VkCommandBuffer cmdbuf ) + { + ZoneScopedC( Color::Red4 ); + + const uint64_t head = m_head.load(std::memory_order_relaxed); + if( m_tail == head ) return; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + cmdbuf ? + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ) : + VK_FUNCTION_WRAPPER( vkResetQueryPool( m_device, m_query, 0, m_queryCount ) ); + m_tail = head; + m_oldCnt = 0; + int64_t tgpu; + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu ); + return; + } +#endif + assert( head > m_tail ); + + const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount ); + + unsigned int cnt; + if( m_oldCnt != 0 ) + { + cnt = m_oldCnt; + m_oldCnt = 0; + } + else + { + cnt = (unsigned int)( head - m_tail ); + assert( cnt <= m_queryCount ); + if( wrappedTail + cnt > m_queryCount ) + { + cnt = m_queryCount - wrappedTail; + } + } + + + VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( m_device, m_query, wrappedTail, cnt, sizeof( int64_t ) * m_queryCount * 2, m_res, sizeof( int64_t ) * 2, VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT ) ); + + for( unsigned int idx=0; idx<cnt; idx++ ) + { + int64_t avail = m_res[idx * 2 + 1]; + if( avail == 0 ) + { + m_oldCnt = cnt - idx; + cnt = idx; + break; + } + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuTime ); + MemWrite( &item->gpuTime.gpuTime, m_res[idx * 2] ); + MemWrite( &item->gpuTime.queryId, uint16_t( wrappedTail + idx ) ); + MemWrite( &item->gpuTime.context, m_context ); + Profiler::QueueSerialFinish(); + } + + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) + { + int64_t tgpu, tcpu; + Calibrate( m_device, tcpu, tgpu ); + const auto refCpu = Profiler::GetTime(); + const auto delta = tcpu - m_prevCalibration; + if( delta > 0 ) + { + m_prevCalibration = tcpu; + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuCalibration ); + MemWrite( &item->gpuCalibration.gpuTime, tgpu ); + MemWrite( &item->gpuCalibration.cpuTime, refCpu ); + MemWrite( &item->gpuCalibration.cpuDelta, delta ); + MemWrite( &item->gpuCalibration.context, m_context ); + Profiler::QueueSerialFinish(); + } + } + + cmdbuf ? 
+ VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ) : + VK_FUNCTION_WRAPPER( vkResetQueryPool( m_device, m_query, wrappedTail, cnt ) ); + + m_tail += cnt; + } + + tracy_force_inline unsigned int NextQueryId() + { + const uint64_t id = m_head.fetch_add(1, std::memory_order_relaxed); + return id % m_queryCount; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + + tracy_force_inline VkQueryPool GetQueryPool() const + { + return m_query; + } + +private: + tracy_force_inline void Calibrate( VkDevice device, int64_t& tCpu, int64_t& tGpu ) + { + assert( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ); + VkCalibratedTimestampInfoEXT spec[2] = { + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT }, + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain }, + }; + uint64_t ts[2]; + uint64_t deviation; + do + { + m_vkGetCalibratedTimestampsEXT( device, 2, spec, ts, &deviation ); + } + while( deviation > m_deviation ); + +#if defined _WIN32 + tGpu = ts[0]; + tCpu = ts[1] * m_qpcToNs; +#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW + tGpu = ts[0]; + tCpu = ts[1]; +#else + assert( false ); +#endif + } + + tracy_force_inline void CreateQueryPool() + { + VkQueryPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + poolInfo.queryCount = m_queryCount; + poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; + while ( VK_FUNCTION_WRAPPER( vkCreateQueryPool( m_device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS ) ) + { + m_queryCount /= 2; + poolInfo.queryCount = m_queryCount; + } + } + + tracy_force_inline void FindAvailableTimeDomains( VkPhysicalDevice physicalDevice, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) + { + uint32_t num; + _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, nullptr ); + if(num > 4) num = 4; + VkTimeDomainEXT data[4]; + _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, data ); + VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1; +#if defined _WIN32 + supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT; +#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW + supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT; +#endif + for( uint32_t i=0; i<num; i++ ) + { + if( data[i] == supportedDomain ) + { + m_timeDomain = data[i]; + break; + } + } + } + + tracy_force_inline void FindCalibratedTimestampDeviation() + { + constexpr size_t NumProbes = 32; + VkCalibratedTimestampInfoEXT spec[2] = { + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT }, + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain }, + }; + uint64_t ts[2]; + uint64_t deviation[NumProbes]; + for( size_t i=0; i<NumProbes; i++ ) { + m_vkGetCalibratedTimestampsEXT( m_device, 2, spec, ts, deviation + i ); + } + uint64_t minDeviation = deviation[0]; + for( size_t i=1; i<NumProbes; i++ ) { + if( minDeviation > deviation[i] ) { + minDeviation = deviation[i]; + } + } + m_deviation = minDeviation * 3 / 2; + +#if defined _WIN32 + m_qpcToNs = int64_t( 1000000000. 
/ GetFrequencyQpc() ); +#endif + } + + tracy_force_inline void WriteInitialItem( VkPhysicalDevice physdev, int64_t tcpu, int64_t tgpu ) + { + uint8_t flags = 0; + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration; + + VkPhysicalDeviceProperties prop; + VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceProperties( physdev, &prop ) ); + const float period = prop.limits.timestampPeriod; + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.flags, flags ); + MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +#if defined TRACY_VK_USE_SYMBOL_TABLE + void PopulateSymbolTable( VkInstance instance, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr ) + { +#define VK_GET_DEVICE_SYMBOL( name ) \ + (PFN_##name)deviceProcAddr( m_device, #name ); +#define VK_LOAD_DEVICE_SYMBOL( name ) \ + m_symbols.name = VK_GET_DEVICE_SYMBOL( name ); +#define VK_GET_INSTANCE_SYMBOL( name ) \ + (PFN_##name)instanceProcAddr( instance, #name ); +#define VK_LOAD_INSTANCE_SYMBOL( name ) \ + m_symbols.name = VK_GET_INSTANCE_SYMBOL( name ); + + LoadVkDeviceCoreSymbols( VK_LOAD_DEVICE_SYMBOL ) + LoadVkDeviceExtensionSymbols( VK_LOAD_DEVICE_SYMBOL ) + LoadVkInstanceExtensionSymbols( VK_LOAD_INSTANCE_SYMBOL ) + LoadVkInstanceCoreSymbols( VK_LOAD_INSTANCE_SYMBOL ) +#undef VK_GET_DEVICE_SYMBOL +#undef VK_LOAD_DEVICE_SYMBOL +#undef VK_GET_INSTANCE_SYMBOL +#undef VK_LOAD_INSTANCE_SYMBOL + } +#endif + + VkDevice m_device; + VkQueryPool m_query; + VkTimeDomainEXT m_timeDomain; +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkSymbolTable m_symbols; +#endif + uint64_t m_deviation; +#ifdef _WIN32 + int64_t m_qpcToNs; +#endif + int64_t m_prevCalibration; + uint8_t m_context; + + std::atomic<uint64_t> m_head; + uint64_t m_tail; + unsigned int m_oldCnt; + unsigned int m_queryCount; + + int64_t* m_res; + + PFN_vkGetCalibratedTimestampsEXT m_vkGetCalibratedTimestampsEXT; +}; + +class VkCtxScope +{ +public: + tracy_force_inline VkCtxScope( VkCtx* ctx, const SourceLocationData* srcloc, VkCommandBuffer cmdbuf, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline VkCtxScope( VkCtx* ctx, const SourceLocationData* srcloc, VkCommandBuffer cmdbuf, int32_t depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) 
+#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + QueueItem *item; + if( depth > 0 && has_callstack() ) + { + item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); + } + else + { + item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + } + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline VkCtxScope( VkCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, VkCommandBuffer cmdbuf, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline VkCtxScope( VkCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, VkCommandBuffer cmdbuf, int32_t depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + QueueItem *item; + if( depth > 0 && has_callstack() ) + { + item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); + } + else + { + item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial ); + } + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline ~VkCtxScope() + { + if( !m_active ) return; + + const auto queryId = m_ctx->NextQueryId(); + 
CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ) ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + +private: + const bool m_active; + + VkCommandBuffer m_cmdbuf; + VkCtx* m_ctx; +}; + +#if defined TRACY_VK_USE_SYMBOL_TABLE +static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr, bool calibrated = false ) +#else +static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) +#endif +{ + auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); +#if defined TRACY_VK_USE_SYMBOL_TABLE + new(ctx) VkCtx( instance, physdev, device, queue, cmdbuf, instanceProcAddr, getDeviceProcAddr, calibrated ); +#else + new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct ); +#endif + return ctx; +} + +#if defined VK_EXT_host_query_reset +#if defined TRACY_VK_USE_SYMBOL_TABLE +static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr ) +#else +static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT qpreset, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) +#endif +{ + auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); +#if defined TRACY_VK_USE_SYMBOL_TABLE + new(ctx) VkCtx( instance, physdev, device, instanceProcAddr, getDeviceProcAddr ); +#else + new(ctx) VkCtx( physdev, device, qpreset, gpdctd, gct ); +#endif + return ctx; +} +#endif + +static inline void DestroyVkContext( VkCtx* ctx ) +{ + ctx->~VkCtx(); + tracy_free( ctx ); +} + +} + +using TracyVkCtx = tracy::VkCtx*; + +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ); +#else +#define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr ); +#endif +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContextCalibrated( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr, true ); +#else +#define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct ); +#endif +#if defined VK_EXT_host_query_reset +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContextHostCalibrated( instance, physdev, device, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, instanceProcAddr, deviceProcAddr ); +#else +#define TracyVkContextHostCalibrated( physdev, device, qpreset, gpdctd, gct ) tracy::CreateVkContext( physdev, device, 
qpreset, gpdctd, gct ); +#endif +#endif +#define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx ); +#define TracyVkContextName( ctx, name, size ) ctx->Name( name, size ); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyVkNamedZone( ctx, varname, cmdbuf, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, TRACY_CALLSTACK, active ); +# define TracyVkNamedZoneC( ctx, varname, cmdbuf, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, TRACY_CALLSTACK, active ); +# define TracyVkZone( ctx, cmdbuf, name ) TracyVkNamedZoneS( ctx, ___tracy_gpu_zone, cmdbuf, name, TRACY_CALLSTACK, true ) +# define TracyVkZoneC( ctx, cmdbuf, name, color ) TracyVkNamedZoneCS( ctx, ___tracy_gpu_zone, cmdbuf, name, color, TRACY_CALLSTACK, true ) +# define TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) TracyVkZoneTransientS( ctx, varname, cmdbuf, name, TRACY_CALLSTACK, active ) +#else +# define TracyVkNamedZone( ctx, varname, cmdbuf, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, active ); +# define TracyVkNamedZoneC( ctx, varname, cmdbuf, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, active ); +# define TracyVkZone( ctx, cmdbuf, name ) TracyVkNamedZone( ctx, ___tracy_gpu_zone, cmdbuf, name, true ) +# define TracyVkZoneC( ctx, cmdbuf, name, color ) TracyVkNamedZoneC( ctx, ___tracy_gpu_zone, cmdbuf, name, color, true ) +# define TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) tracy::VkCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), cmdbuf, active ); +#endif +#define TracyVkCollect( ctx, cmdbuf ) ctx->Collect( cmdbuf ); +#define TracyVkCollectHost( ctx ) ctx->Collect( VK_NULL_HANDLE ); + +#ifdef TRACY_HAS_CALLSTACK +# define TracyVkNamedZoneS( ctx, varname, cmdbuf, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, depth, active ); +# define TracyVkNamedZoneCS( ctx, varname, cmdbuf, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, depth, active ); +# define TracyVkZoneS( ctx, cmdbuf, name, depth ) TracyVkNamedZoneS( ctx, ___tracy_gpu_zone, cmdbuf, name, depth, true ) +# define TracyVkZoneCS( ctx, cmdbuf, name, color, depth ) TracyVkNamedZoneCS( ctx, ___tracy_gpu_zone, cmdbuf, name, color, 
depth, true ) +# define TracyVkZoneTransientS( ctx, varname, cmdbuf, name, depth, active ) tracy::VkCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), cmdbuf, depth, active ); +#else +# define TracyVkNamedZoneS( ctx, varname, cmdbuf, name, depth, active ) TracyVkNamedZone( ctx, varname, cmdbuf, name, active ) +# define TracyVkNamedZoneCS( ctx, varname, cmdbuf, name, color, depth, active ) TracyVkNamedZoneC( ctx, varname, cmdbuf, name, color, active ) +# define TracyVkZoneS( ctx, cmdbuf, name, depth ) TracyVkZone( ctx, cmdbuf, name ) +# define TracyVkZoneCS( ctx, cmdbuf, name, color, depth ) TracyVkZoneC( ctx, cmdbuf, name, color ) +# define TracyVkZoneTransientS( ctx, varname, cmdbuf, name, depth, active ) TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) +#endif + +#endif + +#endif
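
For reference, a minimal usage sketch of the vendored TracyOpenGL.hpp macros above, assuming TRACY_ENABLE is defined at build time and a GL loader plus a current OpenGL context are already set up; run_frame_loop, render_scene and swap_buffers are placeholder names and not part of this commit:

#include "glad/glad.h"            // assumption: some GL loader header must be included before TracyOpenGL.hpp
#include "tracy/TracyOpenGL.hpp"

void render_scene();              // placeholder
void swap_buffers();              // placeholder

void run_frame_loop()
{
    // Create the profiler-side GPU context once, after the GL context is current:
    // the GpuCtx constructor immediately issues GL timestamp queries.
    TracyGpuContext;
    TracyGpuContextName( "main GL context", 15 );

    for( ;; )
    {
        {
            // Scoped GPU zone: records a timestamp query on entry and another on scope exit.
            TracyGpuZone( "render_scene" );
            render_scene();
        }
        swap_buffers();
        // Drain finished timestamp queries and forward them to the profiler once per frame.
        TracyGpuCollect;
    }
}

The TracyCL* and TracyVk* macros follow the same pattern, with an explicit context object created via TracyCLContext/TracyVkContext and a per-frame TracyCLCollect/TracyVkCollect call.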