1325 lines
56 KiB
C++
1325 lines
56 KiB
C++
#ifndef __TRACYCUDA_HPP__
|
|
#define __TRACYCUDA_HPP__
|
|
|
|
#ifndef TRACY_ENABLE
|
|
|
|
#define TracyCUDAContext() nullptr
|
|
#define TracyCUDAContextDestroy(ctx)
|
|
#define TracyCUDAContextName(ctx, name, size)
|
|
|
|
#define TracyCUDAStartProfiling(ctx)
|
|
#define TracyCUDAStopProfiling(ctx)
|
|
|
|
#define TracyCUDACollect(ctx)
|
|
|
|
#else
|
|
#include <cupti.h>
|
|
|
|
#include <cassert>
|
|
#include <cmath>
|
|
#include <string>
|
|
#include <string_view>
|
|
#include <atomic>
|
|
#include <mutex>
|
|
#include <shared_mutex>
|
|
#include <condition_variable>
|
|
#include <vector>
|
|
#include <unordered_set>
|
|
#include <unordered_map>
|
|
|
|
#ifndef _MSC_VER
|
|
#include <cxxabi.h>
|
|
#endif
|
|
|
|
#include <tracy/Tracy.hpp>
|
|
|
|
#ifndef UNREFERENCED
|
|
#define UNREFERENCED(x) (void)x
|
|
#endif//UNREFERENCED
|
|
|
|
#ifndef TRACY_CUDA_CALIBRATED_CONTEXT
|
|
#define TRACY_CUDA_CALIBRATED_CONTEXT (1)
|
|
#endif//TRACY_CUDA_CALIBRATED_CONTEXT
|
|
|
|
#ifndef TRACY_CUDA_ENABLE_COLLECTOR_THREAD
|
|
#define TRACY_CUDA_ENABLE_COLLECTOR_THREAD (1)
|
|
#endif//TRACY_CUDA_ENABLE_COLLECTOR_THREAD
|
|
|
|
#ifndef TRACY_CUDA_ENABLE_CUDA_CALL_STATS
|
|
#define TRACY_CUDA_ENABLE_CUDA_CALL_STATS (0)
|
|
#endif//TRACY_CUDA_ENABLE_CUDA_CALL_STATS
|
|
|
|
namespace {
|
|
|
|
// TODO(marcos): wrap these in structs for better type safety
|
|
using CUptiTimestamp = uint64_t;
|
|
using TracyTimestamp = int64_t;
|
|
|
|
struct IncrementalRegression {
|
|
using float_t = double;
|
|
struct Parameters {
|
|
float_t slope, intercept;
|
|
};
|
|
|
|
int n = 0;
|
|
float_t x_mean = 0;
|
|
float_t y_mean = 0;
|
|
float_t x_svar = 0;
|
|
float_t y_svar = 0;
|
|
float_t xy_scov = 0;
|
|
|
|
auto parameters() const {
|
|
float_t slope = xy_scov / x_svar;
|
|
float_t intercept = y_mean - slope * x_mean;
|
|
return Parameters{ slope, intercept };
|
|
}
|
|
|
|
auto orthogonal() const {
|
|
// NOTE(marcos): orthogonal regression is Deming regression with delta = 1
|
|
float_t delta = float_t(1); // delta = 1 -> orthogonal regression
|
|
float_t k = y_svar - delta * x_svar;
|
|
float_t slope = (k + sqrt(k * k + 4 * delta * xy_scov * xy_scov)) / (2 * xy_scov);
|
|
float_t intercept = y_mean - slope * x_mean;
|
|
return Parameters{ slope, intercept };
|
|
}
|
|
|
|
void addSample(float_t x, float_t y) {
|
|
++n;
|
|
float_t x_mean_prev = x_mean;
|
|
float_t y_mean_prev = y_mean;
|
|
x_mean += (x - x_mean) / n;
|
|
y_mean += (y - y_mean) / n;
|
|
x_svar += (x - x_mean_prev) * (x - x_mean);
|
|
y_svar += (y - y_mean_prev) * (y - y_mean);
|
|
xy_scov += (x - x_mean_prev) * (y - y_mean);
|
|
}
|
|
};
|
|
|
|
tracy_force_inline TracyTimestamp tracyGetTimestamp() {
|
|
return tracy::Profiler::GetTime();
|
|
}
|
|
|
|
auto& getCachedRegressionParameters() {
|
|
// WARN(marcos): in theory, these linear regression parameters would be loaded/stored atomically;
|
|
// in practice, however, it should not matter so long as the loads/stores are not "sliced"
|
|
static IncrementalRegression::Parameters cached;
|
|
return cached;
|
|
}
|
|
|
|
TracyTimestamp tracyFromCUpti(CUptiTimestamp cuptiTime) {
|
|
// NOTE(marcos): linear regression estimate
|
|
// y_hat = slope * x + intercept | X: CUptiTimestamp, Y: TracyTimestamp
|
|
auto [slope, intercept] = getCachedRegressionParameters();
|
|
double y_hat = slope * cuptiTime + intercept;
|
|
TracyTimestamp tracyTime = TracyTimestamp(y_hat);
|
|
assert(tracyTime >= 0);
|
|
return tracyTime;
|
|
}
|
|
|
|
template<typename T, typename U>
|
|
tracy_force_inline void tracyMemWrite(T& where,U what) {
|
|
static_assert(std::is_same_v<T, U>, "tracy::MemWrite: type mismatch.");
|
|
tracy::MemWrite(&where, what);
|
|
}
|
|
|
|
void* tracyMalloc(size_t bytes) {
|
|
return tracy::tracy_malloc(bytes);
|
|
}
|
|
|
|
void tracyFree(void* ptr) {
|
|
tracy::tracy_free(ptr);
|
|
}
|
|
|
|
void tracyZoneBegin(TracyTimestamp time, tracy::SourceLocationData* srcLoc) {
|
|
using namespace tracy;
|
|
TracyQueuePrepare(QueueType::ZoneBegin);
|
|
tracyMemWrite(item->zoneBegin.time, time);
|
|
tracyMemWrite(item->zoneBegin.srcloc, (uint64_t)srcLoc);
|
|
TracyQueueCommit(zoneBeginThread);
|
|
}
|
|
|
|
void tracyZoneEnd(TracyTimestamp time) {
|
|
using namespace tracy;
|
|
TracyQueuePrepare(QueueType::ZoneEnd);
|
|
tracyMemWrite(item->zoneEnd.time, time);
|
|
TracyQueueCommit(zoneEndThread);
|
|
}
|
|
|
|
void tracyPlot(const char* name, float value, TracyTimestamp time) {
|
|
using namespace tracy;
|
|
TracyLfqPrepare(QueueType::PlotDataFloat);
|
|
tracyMemWrite(item->plotDataFloat.name, (uint64_t)name);
|
|
tracyMemWrite(item->plotDataFloat.time, time);
|
|
tracyMemWrite(item->plotDataFloat.val, value);
|
|
TracyLfqCommit;
|
|
}
|
|
|
|
void tracyPlot(const char* name, float value, CUptiTimestamp time) {
|
|
tracyPlot(name, value, tracyFromCUpti(time));
|
|
}
|
|
|
|
void tracyPlotActivity(const char* name, TracyTimestamp start, TracyTimestamp end, float value = 1.0f, float baseline = 0.0f) {
|
|
tracyPlot(name, baseline, start);
|
|
tracyPlot(name, value, start + 3);
|
|
tracyPlot(name, value, end - 3);
|
|
tracyPlot(name, baseline, end);
|
|
}
|
|
|
|
void tracyPlotActivity(const char* name, CUptiTimestamp start, CUptiTimestamp end, float value = 1.0f, float baseline = 0.0f) {
|
|
tracyPlotActivity(name, tracyFromCUpti(start), tracyFromCUpti(end), value, baseline);
|
|
}
|
|
|
|
void tracyPlotBlip(const char* name, TracyTimestamp time, float value = 1.0f, float baseline = 0.0f) {
|
|
tracyPlot(name, baseline, time - 3);
|
|
tracyPlot(name, value, time);
|
|
tracyPlot(name, baseline, time + 3);
|
|
}
|
|
|
|
void tracyPlotBlip(const char* name, CUptiTimestamp time, float value = 1.0f, float baseline = 0.0f) {
|
|
tracyPlotBlip(name, tracyFromCUpti(time), value, baseline);
|
|
}
|
|
|
|
void tracyEmitMemAlloc(const char* name, const void* ptr, size_t size, TracyTimestamp time) {
|
|
using namespace tracy;
|
|
const auto thread = GetThreadHandle();
|
|
|
|
auto item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::MemNamePayload);
|
|
tracyMemWrite(item->memName.name, (uint64_t)name);
|
|
Profiler::QueueSerialFinish();
|
|
|
|
item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::MemAllocNamed);
|
|
tracyMemWrite(item->memAlloc.time, time);
|
|
tracyMemWrite(item->memAlloc.thread, thread);
|
|
tracyMemWrite(item->memAlloc.ptr, (uint64_t)ptr);
|
|
|
|
if (compile_time_condition<sizeof(size) == 4>::value)
|
|
{
|
|
memcpy(&item->memAlloc.size, &size, 4);
|
|
memset(&item->memAlloc.size + 4, 0, 2);
|
|
}
|
|
else
|
|
{
|
|
assert(sizeof(size) == 8);
|
|
memcpy(&item->memAlloc.size, &size, 4);
|
|
memcpy(((char *)&item->memAlloc.size) + 4, ((char *)&size) + 4, 2);
|
|
}
|
|
Profiler::QueueSerialFinish();
|
|
}
|
|
|
|
void tracyEmitMemFree(const char* name, const void* ptr, TracyTimestamp time) {
|
|
using namespace tracy;
|
|
const auto thread = GetThreadHandle();
|
|
|
|
auto item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::MemNamePayload);
|
|
tracyMemWrite(item->memName.name, (uint64_t)name);
|
|
Profiler::QueueSerialFinish();
|
|
|
|
item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::MemFreeNamed);
|
|
tracyMemWrite(item->memFree.time, time);
|
|
tracyMemWrite(item->memFree.thread, thread);
|
|
tracyMemWrite(item->memFree.ptr, (uint64_t)ptr);
|
|
Profiler::QueueSerialFinish();
|
|
}
|
|
|
|
void tracyEmitMemAlloc(const char* name, const void* ptr, size_t size, CUptiTimestamp cuptiTime) {
|
|
tracyEmitMemAlloc(name, ptr, size, tracyFromCUpti(cuptiTime));
|
|
}
|
|
|
|
void tracyEmitMemFree(const char* name, const void* ptr, CUptiTimestamp cuptiTime) {
|
|
tracyEmitMemFree(name, ptr, tracyFromCUpti(cuptiTime));
|
|
}
|
|
|
|
void tracyAnnounceGpuTimestamp(TracyTimestamp apiStart, TracyTimestamp apiEnd,
|
|
uint16_t queryId, uint8_t gpuContextId,
|
|
const tracy::SourceLocationData* sourceLocation, uint32_t threadId) {
|
|
using namespace tracy;
|
|
|
|
auto item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuZoneBeginSerial);
|
|
tracyMemWrite(item->gpuZoneBegin.cpuTime, apiStart);
|
|
tracyMemWrite(item->gpuZoneBegin.srcloc, (uint64_t)sourceLocation);
|
|
tracyMemWrite(item->gpuZoneBegin.thread, threadId);
|
|
tracyMemWrite(item->gpuZoneBegin.queryId, uint16_t(queryId+0));
|
|
tracyMemWrite(item->gpuZoneBegin.context, gpuContextId);
|
|
Profiler::QueueSerialFinish();
|
|
|
|
item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuZoneEndSerial);
|
|
tracyMemWrite(item->gpuZoneEnd.cpuTime, apiEnd);
|
|
tracyMemWrite(item->gpuZoneEnd.thread, threadId);
|
|
tracyMemWrite(item->gpuZoneEnd.queryId, uint16_t(queryId+1));
|
|
tracyMemWrite(item->gpuZoneEnd.context, gpuContextId);
|
|
Profiler::QueueSerialFinish();
|
|
}
|
|
|
|
void tracySubmitGpuTimestamp(CUptiTimestamp gpuStart, CUptiTimestamp gpuEnd,
|
|
uint16_t queryId, uint8_t gpuContextId) {
|
|
using namespace tracy;
|
|
|
|
auto item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuTime);
|
|
tracyMemWrite(item->gpuTime.gpuTime, (int64_t)gpuStart);
|
|
tracyMemWrite(item->gpuTime.queryId, uint16_t(queryId+0));
|
|
tracyMemWrite(item->gpuTime.context, gpuContextId);
|
|
Profiler::QueueSerialFinish();
|
|
|
|
item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuTime);
|
|
tracyMemWrite(item->gpuTime.gpuTime, (int64_t)gpuEnd);
|
|
tracyMemWrite(item->gpuTime.queryId, uint16_t(queryId+1));
|
|
tracyMemWrite(item->gpuTime.context, gpuContextId);
|
|
Profiler::QueueSerialFinish();
|
|
}
|
|
|
|
#define CUPTI_API_CALL(call) CUptiCallChecked(call, #call, __FILE__, __LINE__)
|
|
|
|
#define DRIVER_API_CALL(call) cudaDriverCallChecked(call, #call, __FILE__, __LINE__)
|
|
|
|
CUptiResult CUptiCallChecked(CUptiResult result, const char* call, const char* file, int line) noexcept {
|
|
if (result == CUPTI_SUCCESS)
|
|
return result;
|
|
const char* resultMsg = "";
|
|
CUPTI_API_CALL(cuptiGetResultString(result, &resultMsg)); // maybe not a good idea to recurse here...
|
|
fprintf(stderr, "ERROR:\t%s:%d:\n\tfunction '%s' failed with error '%s'.\n", file, line, call, resultMsg);
|
|
//assert(result == CUPTI_SUCCESS);
|
|
return result;
|
|
}
|
|
|
|
CUresult cudaDriverCallChecked(CUresult result, const char* call, const char* file, int line) noexcept {
|
|
if (result == CUDA_SUCCESS)
|
|
return result;
|
|
const char* resultMsg = "";
|
|
DRIVER_API_CALL(cuGetErrorString(result, &resultMsg)); // maybe not a good idea to recurse here...
|
|
fprintf(stderr, "ERROR:\t%s:%d:\n\tfunction '%s' failed with error '%s'.\n", file, line, call, resultMsg);
|
|
//assert(result == CUDA_SUCCESS);
|
|
return result;
|
|
}
|
|
|
|
template<typename TKey, typename TValue>
|
|
struct ConcurrentHashMap {
|
|
static constexpr bool instrument = false;
|
|
auto acquire_read_lock() {
|
|
if (m.try_lock_shared())
|
|
return std::shared_lock<std::shared_mutex>(m, std::adopt_lock);
|
|
ZoneNamedC(rwlock, tracy::Color::Tomato, instrument);
|
|
return std::shared_lock<std::shared_mutex>(m);
|
|
}
|
|
auto acquire_write_lock() {
|
|
if (m.try_lock())
|
|
return std::unique_lock<std::shared_mutex>(m, std::adopt_lock);
|
|
ZoneNamedC(wxlock, tracy::Color::Tomato, instrument);
|
|
return std::unique_lock<std::shared_mutex>(m);
|
|
}
|
|
std::unordered_map<TKey, TValue> mapping;
|
|
std::shared_mutex m;
|
|
auto& operator[](TKey key) {
|
|
{
|
|
auto lock = acquire_read_lock();
|
|
auto it = mapping.find(key);
|
|
if (it != mapping.end()) {
|
|
return it->second;
|
|
}
|
|
}
|
|
return emplace(key, TValue{}).first->second;
|
|
}
|
|
auto find(TKey key) {
|
|
ZoneNamed(find, instrument);
|
|
auto lock = acquire_read_lock();
|
|
return mapping.find(key);
|
|
}
|
|
auto fetch(TKey key, TValue& value) {
|
|
ZoneNamed(fetch, instrument);
|
|
auto it = mapping.find(key);
|
|
if (it != mapping.end()) {
|
|
value = it->second;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
auto end() {
|
|
ZoneNamed(end, instrument);
|
|
auto lock = acquire_read_lock();
|
|
return mapping.end();
|
|
}
|
|
template<typename... Args>
|
|
auto emplace(TKey key, Args&&... args) {
|
|
ZoneNamed(emplace, instrument);
|
|
auto lock = acquire_write_lock();
|
|
return mapping.emplace(std::forward<TKey>(key), std::forward<Args>(args)...);
|
|
}
|
|
auto erase(TKey key) {
|
|
ZoneNamed(erase, instrument);
|
|
auto lock = acquire_write_lock();
|
|
return mapping.erase(key);
|
|
}
|
|
};
|
|
|
|
#if TRACY_CUDA_ENABLE_CUDA_CALL_STATS
|
|
struct ProfilerStats {
|
|
static constexpr bool instrument = false;
|
|
|
|
ConcurrentHashMap<uint32_t, std::atomic<int>> apiCallCount;
|
|
|
|
void update(CUpti_CallbackDomain domain, CUpti_CallbackId cbid) {
|
|
ZoneNamed(update, instrument);
|
|
uint32_t key = (domain << 24) | (cbid & 0x00'FFFFFF);
|
|
auto it = apiCallCount.find(key);
|
|
if (it == apiCallCount.end()) {
|
|
it = apiCallCount.emplace(key, 0).first;
|
|
}
|
|
it->second.fetch_add(1, std::memory_order::memory_order_relaxed);
|
|
}
|
|
};
|
|
#endif
|
|
|
|
// StringTable: string memoization/interning
|
|
struct StringTable {
|
|
static constexpr bool instrument = false;
|
|
|
|
// TODO(marcos): this could be just a "ConcurrentHashSet"
|
|
ConcurrentHashMap<std::string_view, std::string_view> table;
|
|
|
|
~StringTable() { /* TODO(marcos): free string copy */ }
|
|
|
|
std::string_view operator[](std::string_view str) {
|
|
ZoneNamedN(lookup, "StringTable::lookup", instrument);
|
|
std::string_view memoized;
|
|
if (!table.fetch(str, memoized)) {
|
|
ZoneNamedN(lookup, "StringTable::insert", instrument);
|
|
char* copy = (char*)tracyMalloc(str.size() + 1);
|
|
strncpy(copy, str.data(), str.size());
|
|
copy[str.size()] = '\0';
|
|
std::string_view value (copy, str.size());
|
|
auto [it, inserted] = table.emplace(value, value);
|
|
if (!inserted) {
|
|
// another thread inserted it while we were trying to: cleanup
|
|
tracyFree(copy);
|
|
}
|
|
memoized = it->second;
|
|
}
|
|
assert(str == memoized);
|
|
return memoized;
|
|
}
|
|
};
|
|
|
|
struct SourceLocationMap {
|
|
static constexpr bool instrument = false;
|
|
|
|
// NOTE(marcos): the address of an unordered_map value may become invalid
|
|
// later on (e.g., during a rehash), so mapping to a pointer is necessary
|
|
ConcurrentHashMap<std::string_view, tracy::SourceLocationData*> locations;
|
|
|
|
~SourceLocationMap() { /* TODO(marcos): free SourceLocationData* entries */ }
|
|
|
|
tracy::SourceLocationData* retrieve(std::string_view function) {
|
|
ZoneNamed(retrieve, instrument);
|
|
tracy::SourceLocationData* pSrcLoc = nullptr;
|
|
locations.fetch(function, pSrcLoc);
|
|
return pSrcLoc;
|
|
}
|
|
|
|
tracy::SourceLocationData* add(std::string_view function, std::string_view file, int line, uint32_t color=0) {
|
|
ZoneNamed(emplace, instrument);
|
|
assert(*function.end() == '\0');
|
|
assert(*file.end() == '\0');
|
|
void* bytes = tracyMalloc(sizeof(tracy::SourceLocationData));
|
|
auto pSrcLoc = new(bytes)tracy::SourceLocationData{ function.data(), TracyFunction, file.data(), (uint32_t)line, color };
|
|
auto [it, inserted] = locations.emplace(function, pSrcLoc);
|
|
if (!inserted) {
|
|
// another thread inserted it while we were trying to: cleanup
|
|
tracyFree(pSrcLoc); // POD: no destructor to call
|
|
}
|
|
assert(it->second != nullptr);
|
|
return it->second;
|
|
}
|
|
};
|
|
|
|
struct SourceLocationLUT {
|
|
static constexpr bool instrument = false;
|
|
|
|
~SourceLocationLUT() { /* no action needed: no dynamic allocation */ }
|
|
|
|
tracy::SourceLocationData runtime [CUpti_runtime_api_trace_cbid::CUPTI_RUNTIME_TRACE_CBID_SIZE] = {};
|
|
tracy::SourceLocationData driver [CUpti_driver_api_trace_cbid::CUPTI_DRIVER_TRACE_CBID_SIZE] = {};
|
|
|
|
tracy::SourceLocationData* retrieve(CUpti_CallbackDomain domain, CUpti_CallbackId cbid, CUpti_CallbackData* apiInfo) {
|
|
ZoneNamed(retrieve, instrument);
|
|
tracy::SourceLocationData* pSrcLoc = nullptr;
|
|
switch (domain) {
|
|
case CUPTI_CB_DOMAIN_RUNTIME_API :
|
|
if ((cbid > 0) && (cbid < CUPTI_RUNTIME_TRACE_CBID_SIZE)) {
|
|
pSrcLoc = &runtime[cbid];
|
|
}
|
|
break;
|
|
case CUPTI_CB_DOMAIN_DRIVER_API :
|
|
if ((cbid > 0) && (cbid < CUPTI_DRIVER_TRACE_CBID_SIZE)) {
|
|
pSrcLoc = &driver[cbid];
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
if (pSrcLoc->name == nullptr) {
|
|
const char* function = apiInfo->functionName ? apiInfo->functionName : "cuda???";
|
|
// cuptiGetCallbackName includes the "version suffix" of the function/cbid
|
|
//CUPTI_API_CALL(cuptiGetCallbackName(domain, cbid, &function));
|
|
*pSrcLoc = tracy::SourceLocationData{ function, TracyFunction, TracyFile, TracyLine, 0 };
|
|
}
|
|
return pSrcLoc;
|
|
}
|
|
};
|
|
|
|
uint32_t tracyTimelineId(uint32_t contextId, uint32_t streamId) {
|
|
// 0xA7C5 = 42,949 => 42,949 * 100,000 = 4,294,900,000
|
|
// 4,294,900,000 + 65,535 = 4,294,965,535 < 4,294,967,295 (max uint32)
|
|
assert(contextId <= 0xA7C5);
|
|
assert((streamId == CUPTI_INVALID_STREAM_ID) || (streamId < 0xFFFF));
|
|
uint32_t packed = (contextId * 100'000) + (streamId & 0x0000'FFFF);
|
|
return packed;
|
|
}
|
|
|
|
} // unnamed/anonymous namespace
|
|
|
|
namespace tracy
|
|
{
|
|
class CUDACtx
|
|
{
|
|
public:
|
|
static CUDACtx* Create() {
|
|
auto& s = Singleton::Get();
|
|
std::unique_lock<std::mutex> lock (s.m);
|
|
if (s.ref_count == 0) {
|
|
assert(s.ctx == nullptr);
|
|
s.ctx = new CUDACtx(s.ctx_id);
|
|
s.ref_count += 1;
|
|
s.ctx_id = s.ctx->m_tracyGpuContext;
|
|
}
|
|
return s.ctx;
|
|
}
|
|
|
|
static void Destroy(CUDACtx* ctx) {
|
|
auto& s = Singleton::Get();
|
|
std::unique_lock<std::mutex> lock(s.m);
|
|
assert(ctx == s.ctx);
|
|
s.ref_count -= 1;
|
|
if (s.ref_count == 0) {
|
|
delete s.ctx;
|
|
s.ctx = nullptr;
|
|
}
|
|
}
|
|
|
|
void Collect()
|
|
{
|
|
ZoneScoped;
|
|
CUPTI::FlushActivity();
|
|
}
|
|
|
|
void printStats()
|
|
{
|
|
#if TRACY_CUDA_ENABLE_CUDA_CALL_STATS
|
|
fprintf(stdout, "\nCUDA API stats:\n");
|
|
{
|
|
struct Stats { CUpti_CallbackDomain domain; CUpti_CallbackId cbid; int count; };
|
|
std::vector<Stats> sorted;
|
|
for (auto&& api : stats.apiCallCount.mapping) {
|
|
auto domain = CUpti_CallbackDomain(api.first >> 24);
|
|
auto cbid = CUpti_CallbackId(api.first & 0x00'FFFFFF);
|
|
int count = api.second;
|
|
sorted.emplace_back(Stats{ domain, cbid, count });
|
|
}
|
|
std::sort(sorted.begin(), sorted.end(), [](const Stats& x, const Stats& y) { return x.count > y.count; });
|
|
for (auto&& api : sorted) {
|
|
const char* function = "";
|
|
CUPTI_API_CALL(cuptiGetCallbackName(api.domain, api.cbid, &function));
|
|
printf("- %s : %d\n", function, api.count);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void StartProfiling()
|
|
{
|
|
ZoneScoped;
|
|
CUPTI::BeginInstrumentation(this);
|
|
}
|
|
|
|
void StopProfiling()
|
|
{
|
|
ZoneScoped;
|
|
CUPTI::EndInstrumentation();
|
|
printStats();
|
|
}
|
|
|
|
void Name(const char *name, uint16_t len)
|
|
{
|
|
auto ptr = (char*)tracyMalloc(len);
|
|
memcpy(ptr, name, len);
|
|
|
|
auto item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuContextName);
|
|
tracyMemWrite(item->gpuContextNameFat.context, m_tracyGpuContext);
|
|
tracyMemWrite(item->gpuContextNameFat.ptr, (uint64_t)ptr);
|
|
tracyMemWrite(item->gpuContextNameFat.size, len);
|
|
SubmitQueueItem(item);
|
|
}
|
|
|
|
tracy_force_inline void SubmitQueueItem(tracy::QueueItem *item)
|
|
{
|
|
#ifdef TRACY_ON_DEMAND
|
|
GetProfiler().DeferItem(*item);
|
|
#endif
|
|
Profiler::QueueSerialFinish();
|
|
}
|
|
|
|
static void QueryTimestamps(TracyTimestamp& tTracy, CUptiTimestamp& tCUpti) {
|
|
TracyTimestamp tTracy1 = tracyGetTimestamp();
|
|
CUPTI_API_CALL(cuptiGetTimestamp(&tCUpti));
|
|
TracyTimestamp tTracy2 = tracyGetTimestamp();
|
|
// NOTE(marcos): giving more weight to 'tTracy2'
|
|
tTracy = (3*tTracy1 + 5*tTracy2) / 8;
|
|
}
|
|
|
|
// NOTE(marcos): recalibration is 'static' since Tracy and CUPTI timestamps
|
|
// are "global" across all contexts; that said, each Tracy GPU context needs
|
|
// its own GpuCalibration message, but for now there's just a singleton context.
|
|
void Recalibrate() {
|
|
ZoneScoped;
|
|
// NOTE(marcos): only one thread should do the calibration, but there's
|
|
// no good reason to block threads that also trying to do the same
|
|
static std::mutex m;
|
|
if (!m.try_lock())
|
|
return;
|
|
std::unique_lock<std::mutex> lock (m, std::adopt_lock);
|
|
ZoneNamedNC(zone, "tracy::CUDACtx::Recalibrate[effective]", tracy::Color::Goldenrod, true);
|
|
TracyTimestamp tTracy;
|
|
CUptiTimestamp tCUpti;
|
|
QueryTimestamps(tTracy, tCUpti);
|
|
#if TRACY_CUDA_CALIBRATED_CONTEXT
|
|
static CUptiTimestamp prevCUptiTime = tCUpti;
|
|
int64_t deltaTicksCUpti = tCUpti - prevCUptiTime;
|
|
if (deltaTicksCUpti > 0) {
|
|
prevCUptiTime = tCUpti;
|
|
auto* item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuCalibration);
|
|
tracyMemWrite(item->gpuCalibration.gpuTime, (int64_t)tCUpti);
|
|
tracyMemWrite(item->gpuCalibration.cpuTime, tTracy);
|
|
tracyMemWrite(item->gpuCalibration.cpuDelta, deltaTicksCUpti);
|
|
tracyMemWrite(item->gpuCalibration.context, m_tracyGpuContext);
|
|
Profiler::QueueSerialFinish();
|
|
}
|
|
#endif
|
|
// NOTE(marcos): update linear regression incrementally, which will refine
|
|
// the estimation of Tracy timestamps (Y) from CUpti timestamps (X)
|
|
static IncrementalRegression model;
|
|
model.addSample(double(tCUpti), double(tTracy));
|
|
// NOTE(marcos): using orthogonal regression because the independet variable
|
|
// (X: CUpti timestamps) measurements are also imprecise
|
|
getCachedRegressionParameters() = model.orthogonal();
|
|
}
|
|
|
|
protected:
|
|
void EmitGpuZone(TracyTimestamp apiStart, TracyTimestamp apiEnd,
|
|
CUptiTimestamp gpuStart, CUptiTimestamp gpuEnd,
|
|
const tracy::SourceLocationData* pSrcLoc,
|
|
uint32_t cudaContextId, uint32_t cudaStreamId) {
|
|
//uint32_t timelineId = tracy::GetThreadHandle();
|
|
uint32_t timelineId = tracyTimelineId(cudaContextId, cudaStreamId);
|
|
uint16_t queryId = m_queryIdGen.fetch_add(2);
|
|
tracyAnnounceGpuTimestamp(apiStart, apiEnd, queryId, m_tracyGpuContext, pSrcLoc, timelineId);
|
|
tracySubmitGpuTimestamp(gpuStart, gpuEnd, queryId, m_tracyGpuContext);
|
|
}
|
|
|
|
void OnEventsProcessed() {
|
|
Recalibrate();
|
|
}
|
|
|
|
struct CUPTI {
|
|
static void CUPTIAPI OnBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
|
|
{
|
|
ZoneScoped;
|
|
// TODO(marcos): avoid malloc and instead suballocate from a large circular buffer;
|
|
// according to the CUPTI documentation: "To minimize profiling overhead the client
|
|
// should return as quickly as possible from these callbacks."
|
|
*size = 1 * 1024*1024; // 1MB
|
|
*buffer = (uint8_t*)tracyMalloc(*size);
|
|
assert(*buffer != nullptr);
|
|
FlushActivityAsync();
|
|
}
|
|
|
|
static void CUPTIAPI OnBufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, size_t validSize)
|
|
{
|
|
// CUDA 6.0 onwards: all buffers from this callback are "global" buffers
|
|
// (i.e. there is no context/stream specific buffer; ctx is always NULL)
|
|
ZoneScoped;
|
|
tracy::SetThreadName("NVIDIA CUPTI Worker");
|
|
CUptiResult status;
|
|
CUpti_Activity* record = nullptr;
|
|
while ((status = cuptiActivityGetNextRecord(buffer, validSize, &record)) == CUPTI_SUCCESS) {
|
|
DoProcessDeviceEvent(record);
|
|
}
|
|
if (status != CUPTI_ERROR_MAX_LIMIT_REACHED) {
|
|
CUptiCallChecked(status, "cuptiActivityGetNextRecord", TracyFile, TracyLine);
|
|
}
|
|
size_t dropped = 0;
|
|
CUPTI_API_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
|
|
assert(dropped == 0);
|
|
tracyFree(buffer);
|
|
PersistentState::Get().profilerHost->OnEventsProcessed();
|
|
}
|
|
|
|
// correlationID -> [CPU start time, CPU end time, CUPTI start time]
|
|
using CorrelationID = uint32_t;
|
|
struct APICallInfo { TracyTimestamp start = 0, end = 0; CUptiTimestamp cupti = CUPTI_TIMESTAMP_UNKNOWN; CUDACtx* host = nullptr; };
|
|
|
|
static void CUPTIAPI OnCallbackAPI(
|
|
void* userdata,
|
|
CUpti_CallbackDomain domain,
|
|
CUpti_CallbackId cbid,
|
|
const void* cbdata)
|
|
{
|
|
static constexpr bool instrument = false;
|
|
|
|
TracyTimestamp apiCallStartTime = tracyGetTimestamp();
|
|
CUDACtx* profilerHost = (CUDACtx*)userdata;
|
|
|
|
switch (domain) {
|
|
case CUPTI_CB_DOMAIN_RUNTIME_API:
|
|
case CUPTI_CB_DOMAIN_DRIVER_API:
|
|
break;
|
|
case CUPTI_CB_DOMAIN_RESOURCE: {
|
|
// match 'callbackId' with CUpti_CallbackIdResource
|
|
// interpret 'cbdata' as CUpti_ResourceData,
|
|
// or as CUpti_ModuleResourceData,
|
|
// or as CUpti_GraphData,
|
|
// or as CUpti_StreamAttrData,
|
|
// or as ... (what else?)
|
|
return;
|
|
}
|
|
case CUPTI_CB_DOMAIN_SYNCHRONIZE: {
|
|
// match 'callbackId' with CUpti_CallbackIdSync
|
|
// interpret 'cbdata' as CUpti_SynchronizeData
|
|
return;
|
|
}
|
|
case CUPTI_CB_DOMAIN_STATE: {
|
|
// match 'callbackId' with CUpti_CallbackIdState
|
|
// interpret 'cbdata' as CUpti_StateData
|
|
return;
|
|
}
|
|
case CUPTI_CB_DOMAIN_NVTX: {
|
|
// match 'callbackId' with CUpti_nvtx_api_trace_cbid
|
|
// interpret 'cbdata' as CUpti_NvtxData
|
|
return;
|
|
}
|
|
case CUPTI_CB_DOMAIN_FORCE_INT:
|
|
// NOTE(marcos): the "FORCE_INT" values in CUPTI enums exist only to
|
|
// force the enum to have a specific representation (signed 32bits)
|
|
case CUPTI_CB_DOMAIN_INVALID:
|
|
default:
|
|
// TODO(marcos): unexpected error!
|
|
return;
|
|
}
|
|
|
|
// if we reached this point, then we are in the (runtime or driver) API domain
|
|
CUpti_CallbackData* apiInfo = (CUpti_CallbackData*)cbdata;
|
|
|
|
// Emit the Tracy 'ZoneBegin' message upon entering the API call
|
|
// TODO(marcos): a RAII object could be useful here...
|
|
if (apiInfo->callbackSite == CUPTI_API_ENTER) {
|
|
#if TRACY_CUDA_ENABLE_CUDA_CALL_STATS
|
|
ctx->stats.update(domain, cbid);
|
|
#endif
|
|
|
|
auto& cudaCallSourceLocation = PersistentState::Get().cudaCallSourceLocation;
|
|
auto pSrcLoc = cudaCallSourceLocation.retrieve(domain, cbid, apiInfo);
|
|
|
|
// HACK(marcos): the SourceLocationLUT::retrieve zone (above) should
|
|
// not be emitted before its enclosing zone (below) actually begins,
|
|
// so we delay the beginning of the enclosing zone to "unstack" them
|
|
if (SourceLocationLUT::instrument)
|
|
apiCallStartTime = tracyGetTimestamp();
|
|
tracyZoneBegin(apiCallStartTime, pSrcLoc);
|
|
}
|
|
|
|
if (apiInfo->callbackSite == CUPTI_API_ENTER) {
|
|
ZoneNamedN(enter, "tracy::CUDACtx::OnCUptiCallback[enter]", instrument);
|
|
// Track API calls that generate device activity:
|
|
bool trackDeviceActivity = false;
|
|
CUstream hStream = nullptr;
|
|
if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
|
|
#define GET_STREAM_FUNC(Params, field) [](CUpti_CallbackData* api) { return ((Params*)api->functionParams)->field; }
|
|
#define NON_STREAM_FUNC() [](CUpti_CallbackData*) { return cudaStream_t(nullptr); }
|
|
static std::unordered_map<CUpti_runtime_api_trace_cbid, cudaStream_t(*)(CUpti_CallbackData*)> cbidRuntimeTrackers = {
|
|
// Runtime: Kernel
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, GET_STREAM_FUNC(cudaLaunchKernel_v7000_params, stream) },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000, GET_STREAM_FUNC(cudaLaunchKernel_ptsz_v7000_params, stream) },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, GET_STREAM_FUNC(cudaLaunchKernelExC_v11060_params, config->stream) },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060, GET_STREAM_FUNC(cudaLaunchKernelExC_ptsz_v11060_params, config->stream) },
|
|
// Runtime: Memory
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020, NON_STREAM_FUNC() },
|
|
// Runtime: Memcpy
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, GET_STREAM_FUNC(cudaMemcpyAsync_v3020_params, stream) },
|
|
// Runtime: Memset
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, GET_STREAM_FUNC(cudaMemsetAsync_v3020_params, stream) },
|
|
// Runtime: Synchronization
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020, NON_STREAM_FUNC() },
|
|
{ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020, NON_STREAM_FUNC() },
|
|
};
|
|
#undef NON_STREAM_FUNC
|
|
#undef GET_STREAM_FUNC
|
|
auto it = cbidRuntimeTrackers.find(CUpti_runtime_api_trace_cbid(cbid));
|
|
if (it != cbidRuntimeTrackers.end()) {
|
|
trackDeviceActivity = true;
|
|
hStream = (CUstream)it->second(apiInfo);
|
|
}
|
|
}
|
|
if (domain == CUPTI_CB_DOMAIN_DRIVER_API) {
|
|
#define GET_STREAM_FUNC(Params, field) [](CUpti_CallbackData* api) { return ((Params*)api->functionParams)->field; }
|
|
#define NON_STREAM_FUNC() [](CUpti_CallbackData*) { return CUstream(nullptr); }
|
|
static std::unordered_map<CUpti_driver_api_trace_cbid, CUstream(*)(CUpti_CallbackData*)> cbidDriverTrackers = {
|
|
// Driver: Kernel
|
|
{ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, GET_STREAM_FUNC(cuLaunchKernel_params, hStream) },
|
|
{ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz, GET_STREAM_FUNC(cuLaunchKernel_ptsz_params, hStream)} ,
|
|
{ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx, GET_STREAM_FUNC(cuLaunchKernelEx_params, config->hStream) },
|
|
{ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz, GET_STREAM_FUNC(cuLaunchKernelEx_params, config->hStream) },
|
|
};
|
|
#undef NON_STREAM_FUNC
|
|
#undef GET_STREAM_FUNC
|
|
auto it = cbidDriverTrackers.find(CUpti_driver_api_trace_cbid(cbid));
|
|
if (it != cbidDriverTrackers.end()) {
|
|
trackDeviceActivity = true;
|
|
hStream = it->second(apiInfo);
|
|
}
|
|
}
|
|
if (trackDeviceActivity) {
|
|
// NOTE(marcos): we should NOT track if the stream is being captured
|
|
CUstreamCaptureStatus status = {};
|
|
DRIVER_API_CALL(cuStreamIsCapturing(hStream, &status));
|
|
trackDeviceActivity = !(status == CU_STREAM_CAPTURE_STATUS_ACTIVE);
|
|
}
|
|
if (trackDeviceActivity) {
|
|
CUptiTimestamp tgpu;
|
|
// TODO(marcos): do a "reverse-estimate" to obtain CUpti time from Tracy time instead?
|
|
CUPTI_API_CALL(cuptiGetTimestamp(&tgpu));
|
|
auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo;
|
|
cudaCallSiteInfo.emplace(apiInfo->correlationId, APICallInfo{ apiCallStartTime, apiCallStartTime, tgpu, profilerHost });
|
|
}
|
|
auto& entryFlags = *apiInfo->correlationData;
|
|
assert(entryFlags == 0);
|
|
entryFlags |= trackDeviceActivity ? 0x8000 : 0;
|
|
}
|
|
|
|
if (apiInfo->callbackSite == CUPTI_API_EXIT) {
|
|
APICallInfo* pApiInterval = [](CUpti_CallbackData* apiInfo) {
|
|
ZoneNamedN(exit, "tracy::CUDACtx::OnCUptiCallback[exit]", instrument);
|
|
auto entryFlags = *apiInfo->correlationData;
|
|
bool trackDeviceActivity = (entryFlags & 0x8000) != 0;
|
|
if (trackDeviceActivity) {
|
|
auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo;
|
|
auto it = cudaCallSiteInfo.find(apiInfo->correlationId);
|
|
if (it != cudaCallSiteInfo.end()) {
|
|
// WARN(marcos): leaking the address of a hash-map value could spell trouble
|
|
return &it->second;
|
|
}
|
|
}
|
|
// NOTE(marcos): this can happen if the GPU activity completes
|
|
// before the CUDA function that enqueued it returns (e.g., sync)
|
|
static APICallInfo sentinel;
|
|
return &sentinel;
|
|
}(apiInfo);
|
|
pApiInterval->end = tracyGetTimestamp();
|
|
tracyZoneEnd(pApiInterval->end);
|
|
}
|
|
}
|
|
|
|
static bool matchActivityToAPICall(uint32_t correlationId, APICallInfo& apiCallInfo) {
|
|
static constexpr bool instrument = false;
|
|
ZoneNamed(match, instrument);
|
|
auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo;
|
|
if (!cudaCallSiteInfo.fetch(correlationId, apiCallInfo)) {
|
|
return false;
|
|
}
|
|
cudaCallSiteInfo.erase(correlationId);
|
|
assert(apiCallInfo.host != nullptr);
|
|
return true;
|
|
}
|
|
|
|
static void matchError(uint32_t correlationId, const char* kind) {
|
|
char msg [128];
|
|
snprintf(msg, sizeof(msg), "ERROR: device activity '%s' has no matching CUDA API call (id=%u).", kind, correlationId);
|
|
TracyMessageC(msg, strlen(msg), tracy::Color::Tomato);
|
|
}
|
|
|
|
static std::string extractActualName(char** name){
|
|
//If name does not start with number, return empty string
|
|
if (!isdigit(**name))
|
|
{
|
|
return std::string();
|
|
}
|
|
// Assuming name starts with number followed by actual name
|
|
std::string actualName;
|
|
char* currStr = *name;
|
|
int num = 0;
|
|
while (*currStr >= '0' && *currStr <= '9')
|
|
{
|
|
num = num * 10 + (*currStr - '0');
|
|
currStr++;
|
|
}
|
|
|
|
// Return the string start at currStr ends at num
|
|
actualName = std::string(currStr, num);
|
|
// check if actualName starts with _GLOBAL__N__
|
|
if (actualName.rfind("_GLOBAL__N__", 0) == 0)
|
|
{
|
|
// _GLOBAL__N__ with an id stands for anonymous namespace
|
|
actualName = std::string("(anonymous_namespace)");
|
|
}
|
|
|
|
*name = currStr + num;
|
|
return actualName;
|
|
}
|
|
|
|
static std::string extractActualNameNested(const char* demangledName)
|
|
{
|
|
ZoneNamedN(demangle, "demangle_kernel", false);
|
|
//If name does not start with _Z, return a new std::string with original name
|
|
if (demangledName[0] != '_' || demangledName[1] != 'Z')
|
|
{
|
|
return std::string(demangledName);
|
|
}
|
|
std::string actualName;
|
|
char* currStr = (char*)demangledName + 2;
|
|
|
|
if (*currStr == 'N')
|
|
{
|
|
currStr++;
|
|
// extract actual name from nested name
|
|
std::string nestedName = extractActualName(&currStr);
|
|
actualName += nestedName;
|
|
while (1)
|
|
{
|
|
//Loop until nested name is empty
|
|
nestedName = extractActualName(&currStr);
|
|
if (nestedName.empty())
|
|
{
|
|
break;
|
|
}
|
|
actualName += "::" + nestedName;
|
|
}
|
|
} else
|
|
{
|
|
actualName = extractActualName(&currStr);
|
|
}
|
|
return actualName;
|
|
}
|
|
|
|
static tracy::SourceLocationData* getKernelSourceLocation(const char* kernelName)
|
|
{
|
|
auto& kernelSrcLoc = PersistentState::Get().kernelSrcLoc;
|
|
std::string_view demangledName;
|
|
#ifndef _MSC_VER
|
|
// TODO(marcos): extractActualNameNested is the main bottleneck right now;
|
|
// we need a specialized StringTable mapping from "peristent" kernel names
|
|
// (const char*/uintptr_t) to memoized, lazily initialized demangled names
|
|
auto& demangledNameTable = PersistentState::Get().demangledNameTable;
|
|
std::string demangled = extractActualNameNested(kernelName);
|
|
demangledName = demangledNameTable[demangled];
|
|
#else
|
|
demangledName = kernelName;
|
|
#endif
|
|
auto pSrcLoc = kernelSrcLoc.retrieve(demangledName);
|
|
if (pSrcLoc == nullptr) {
|
|
pSrcLoc = kernelSrcLoc.add(demangledName, TracyFile, TracyLine);
|
|
}
|
|
return pSrcLoc;
|
|
}
|
|
|
|
static void DoProcessDeviceEvent(CUpti_Activity *record)
|
|
{
|
|
static constexpr bool instrument = false;
|
|
ZoneNamed(activity, instrument);
|
|
|
|
switch (record->kind)
|
|
{
|
|
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
|
|
{
|
|
ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[kernel]", instrument);
|
|
CUpti_ActivityKernel9* kernel9 = (CUpti_ActivityKernel9*) record;
|
|
APICallInfo apiCall;
|
|
if (!matchActivityToAPICall(kernel9->correlationId, apiCall)) {
|
|
return matchError(kernel9->correlationId, "KERNEL");
|
|
}
|
|
apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, kernel9->start, kernel9->end, getKernelSourceLocation(kernel9->name), kernel9->contextId, kernel9->streamId);
|
|
auto latency_ms = (kernel9->start - apiCall.cupti) / 1'000'000.0;
|
|
tracyPlotBlip("Kernel Latency (ms)", kernel9->start, latency_ms);
|
|
break;
|
|
}
|
|
|
|
case CUPTI_ACTIVITY_KIND_MEMCPY:
|
|
{
|
|
ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[memcpy]", instrument);
|
|
CUpti_ActivityMemcpy5* memcpy5 = (CUpti_ActivityMemcpy5*) record;
|
|
APICallInfo apiCall;
|
|
if (!matchActivityToAPICall(memcpy5->correlationId, apiCall)) {
|
|
return matchError(memcpy5->correlationId, "MEMCPY");
|
|
}
|
|
static constexpr tracy::SourceLocationData TracyCUPTISrcLocDeviceMemcpy { "CUDA::memcpy", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Blue };
|
|
apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, memcpy5->start, memcpy5->end, &TracyCUPTISrcLocDeviceMemcpy, memcpy5->contextId, memcpy5->streamId);
|
|
static constexpr const char* graph_name = "CUDA Memory Copy";
|
|
tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)memcpy5->correlationId, memcpy5->bytes, memcpy5->start);
|
|
tracyEmitMemFree (graph_name, (void*)(uintptr_t)memcpy5->correlationId, memcpy5->end);
|
|
break;
|
|
}
|
|
|
|
case CUPTI_ACTIVITY_KIND_MEMSET:
|
|
{
|
|
ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[memset]", instrument);
|
|
CUpti_ActivityMemset4* memset4 = (CUpti_ActivityMemset4*) record;
|
|
APICallInfo apiCall;
|
|
if (!matchActivityToAPICall(memset4->correlationId, apiCall)) {
|
|
return matchError(memset4->correlationId, "MEMSET");
|
|
}
|
|
static constexpr tracy::SourceLocationData TracyCUPTISrcLocDeviceMemset { "CUDA::memset", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Blue };
|
|
apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, memset4->start, memset4->end, &TracyCUPTISrcLocDeviceMemset, memset4->contextId, memset4->streamId);
|
|
static constexpr const char* graph_name = "CUDA Memory Set";
|
|
tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)memset4->correlationId, memset4->bytes, memset4->start);
|
|
tracyEmitMemFree (graph_name, (void*)(uintptr_t)memset4->correlationId, memset4->end);
|
|
break;
|
|
}
|
|
|
|
case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
|
|
{
|
|
ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[sync]", instrument);
|
|
CUpti_ActivitySynchronization* synchronization = (CUpti_ActivitySynchronization*) record;
|
|
APICallInfo apiCall;
|
|
if (!matchActivityToAPICall(synchronization->correlationId, apiCall)) {
|
|
return matchError(synchronization->correlationId, "SYNCHRONIZATION");
|
|
}
|
|
// NOTE(marcos): synchronization can happen at different levels/objects:
|
|
// a. on the entire context : cuCtxSynchronize() -> timeline(ctx,0)
|
|
// b. on a specific stream : cuStreamSynchronize() -> timeline(ctx,stream)
|
|
// c. on a specific event : cuEventSynchronize() -> timeline(ctx,0xffff)
|
|
static constexpr tracy::SourceLocationData TracyCUPTISrcLocContextSynchronization { "CUDA::Context::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta };
|
|
auto* pSrcLoc = &TracyCUPTISrcLocContextSynchronization;
|
|
uint32_t cudaContextId = synchronization->contextId;
|
|
uint32_t cudaStreamId = 0;
|
|
if (synchronization->streamId != CUPTI_SYNCHRONIZATION_INVALID_VALUE) {
|
|
static constexpr tracy::SourceLocationData TracyCUPTISrcLocStreamSynchronization{ "CUDA::Stream::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta3 };
|
|
pSrcLoc = &TracyCUPTISrcLocStreamSynchronization;
|
|
cudaStreamId = synchronization->streamId;
|
|
}
|
|
if (synchronization->cudaEventId != CUPTI_SYNCHRONIZATION_INVALID_VALUE) {
|
|
static constexpr tracy::SourceLocationData TracyCUPTISrcLocEventSynchronization{ "CUDA::Event::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta4 };
|
|
pSrcLoc = &TracyCUPTISrcLocEventSynchronization;
|
|
cudaStreamId = 0xFFFFFFFF;
|
|
// TODO(marcos): CUpti_ActivitySynchronization2 introduces a new
|
|
// field 'cudaEventSyncId' which complements 'cudaEventId'
|
|
}
|
|
apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, synchronization->start, synchronization->end, pSrcLoc, cudaContextId, cudaStreamId);
|
|
static constexpr const char* graph_name = "CUDA Synchronization";
|
|
tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)synchronization->correlationId, 1, synchronization->start);
|
|
tracyEmitMemFree (graph_name, (void*)(uintptr_t)synchronization->correlationId, synchronization->end);
|
|
break;
|
|
}
|
|
case CUPTI_ACTIVITY_KIND_MEMORY2:
|
|
{
|
|
ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[malloc/free]", instrument);
|
|
CUpti_ActivityMemory3* memory3 = (CUpti_ActivityMemory3*)record;
|
|
APICallInfo apiCall;
|
|
if (!matchActivityToAPICall(memory3->correlationId, apiCall)) {
|
|
return matchError(memory3->correlationId, "MEMORY");
|
|
}
|
|
static constexpr const char* graph_name = "CUDA Memory Allocation";
|
|
if (memory3->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION){
|
|
auto& memAllocAddress = PersistentState::Get().memAllocAddress;
|
|
memAllocAddress[memory3->address] = 1;
|
|
tracyEmitMemAlloc(graph_name, (void*)memory3->address, memory3->bytes, memory3->timestamp);
|
|
}
|
|
else if (memory3->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE){
|
|
auto& memAllocAddress = PersistentState::Get().memAllocAddress;
|
|
int dontCare;
|
|
if (!memAllocAddress.fetch(memory3->address, dontCare)){
|
|
// Note(Frank): This is a hack to handle the case where the memory allocation
|
|
// corresponds to the memory release is not found.
|
|
// This can happen when the memory is allocated when profiling is not enabled.
|
|
matchError(memory3->correlationId, "MEMORY/RELEASE");
|
|
tracyEmitMemAlloc(graph_name, (void*)memory3->address, memory3->bytes, memory3->timestamp);
|
|
} else {
|
|
memAllocAddress.erase(memory3->address);
|
|
}
|
|
tracyEmitMemFree(graph_name, (void*)memory3->address, memory3->timestamp);
|
|
}
|
|
break;
|
|
}
|
|
case CUPTI_ACTIVITY_KIND_CUDA_EVENT :
|
|
{
|
|
// NOTE(marcos): a byproduct of CUPTI_ACTIVITY_KIND_SYNCHRONIZATION
|
|
// (I think this is related to cudaEvent*() API calls)
|
|
CUpti_ActivityCudaEvent2* event = (CUpti_ActivityCudaEvent2*)record;
|
|
UNREFERENCED(event);
|
|
break;
|
|
}
|
|
default:
|
|
{
|
|
char buffer[64];
|
|
snprintf(buffer, sizeof(buffer), "Unknown activity record (kind is %d)", record->kind);
|
|
TracyMessageC(buffer, strlen(buffer), tracy::Color::Crimson);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
static constexpr CUpti_CallbackDomain domains[] = {
|
|
CUPTI_CB_DOMAIN_RUNTIME_API,
|
|
CUPTI_CB_DOMAIN_DRIVER_API,
|
|
//CUPTI_CB_DOMAIN_RESOURCE,
|
|
//CUPTI_CB_DOMAIN_SYNCHRONIZE,
|
|
//CUPTI_CB_DOMAIN_NVTX,
|
|
//CUPTI_CB_DOMAIN_STATE
|
|
};
|
|
|
|
static constexpr CUpti_ActivityKind activities[] = {
|
|
//CUPTI_ACTIVITY_KIND_KERNEL, // mutually exclusive with CONCURRENT_KERNEL
|
|
CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
|
|
CUPTI_ACTIVITY_KIND_MEMCPY,
|
|
CUPTI_ACTIVITY_KIND_MEMSET,
|
|
CUPTI_ACTIVITY_KIND_SYNCHRONIZATION,
|
|
CUPTI_ACTIVITY_KIND_MEMORY2,
|
|
//CUPTI_ACTIVITY_KIND_MEMCPY2,
|
|
//CUPTI_ACTIVITY_KIND_OVERHEAD,
|
|
//CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API,
|
|
//CUPTI_ACTIVITY_KIND_RUNTIME,
|
|
//CUPTI_ACTIVITY_KIND_DRIVER,
|
|
};
|
|
|
|
static void BeginInstrumentation(CUDACtx* profilerHost) {
|
|
auto& currentProfilerHost = PersistentState::Get().profilerHost;
|
|
if (currentProfilerHost != nullptr) {
|
|
return;
|
|
}
|
|
currentProfilerHost = profilerHost;
|
|
|
|
// NOTE(frank): full-stop synchronization to ensure we only handle
|
|
// CUDA API calls and device activities that happens past this point
|
|
cudaDeviceSynchronize();
|
|
|
|
auto& subscriber = PersistentState::Get().subscriber;
|
|
CUPTI_API_CALL(cuptiSubscribe(&subscriber, CUPTI::OnCallbackAPI, profilerHost));
|
|
CUPTI_API_CALL(cuptiActivityRegisterCallbacks(CUPTI::OnBufferRequested, CUPTI::OnBufferCompleted));
|
|
for (auto domain : domains) {
|
|
CUPTI_API_CALL(cuptiEnableDomain(uint32_t(true), subscriber, domain));
|
|
}
|
|
for (auto activity : activities) {
|
|
CUPTI_API_CALL(cuptiActivityEnable(activity));
|
|
}
|
|
|
|
#if TRACY_CUDA_ENABLE_COLLECTOR_THREAD
|
|
auto& collector = PersistentState::Get().collector;
|
|
collector.period = 160;
|
|
collector.signal.notify_one();
|
|
#endif
|
|
}
|
|
|
|
static void EndInstrumentation() {
|
|
auto& currentProfilerHost = PersistentState::Get().profilerHost;
|
|
if (currentProfilerHost == nullptr) {
|
|
return;
|
|
}
|
|
|
|
// NOTE(frank): full-stop synchronization to ensure we catch
|
|
// and drain all the activities that has been tracked up to now.
|
|
cudaDeviceSynchronize();
|
|
|
|
FlushActivity();
|
|
|
|
auto& subscriber = PersistentState::Get().subscriber;
|
|
for (auto activity : activities) {
|
|
CUPTI_API_CALL(cuptiActivityDisable(activity));
|
|
}
|
|
for (auto domain : domains) {
|
|
CUPTI_API_CALL(cuptiEnableDomain(uint32_t(false), subscriber, domain));
|
|
}
|
|
// TODO(marcos): is here a counterpart for 'cuptiActivityRegisterCallbacks()'?
|
|
CUPTI_API_CALL(cuptiUnsubscribe(subscriber));
|
|
|
|
#if TRACY_CUDA_ENABLE_COLLECTOR_THREAD
|
|
auto& collector = PersistentState::Get().collector;
|
|
collector.period = ~uint32_t(0);
|
|
collector.signal.notify_one();
|
|
#endif
|
|
|
|
currentProfilerHost = nullptr;
|
|
}
|
|
|
|
static void FlushActivity()
|
|
{
|
|
// NOTE(marcos): only one thread should do the collection at any given time,
|
|
// but there's no reason to block threads that are also trying to do the same
|
|
static std::mutex m;
|
|
if (!m.try_lock())
|
|
return;
|
|
std::unique_lock<std::mutex> lock (m, std::adopt_lock);
|
|
ZoneNamedNC(zone, "cuptiActivityFlushAll", tracy::Color::Red4, true);
|
|
CUPTI_API_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_NONE));
|
|
}
|
|
|
|
#if TRACY_CUDA_ENABLE_COLLECTOR_THREAD
|
|
// WARN(marcos): technically, CUPTI already offers async flushing of
|
|
// activity records through cuptiActivityFlushPeriod(), but I haven't
|
|
// had much luck getting reliable, consistent delivery with it...
|
|
struct Collector {
|
|
std::atomic<bool> running = true;
|
|
volatile uint32_t period = ~uint32_t(0);
|
|
std::mutex mtx;
|
|
std::condition_variable signal;
|
|
std::thread thread = std::thread(
|
|
[this]() {
|
|
tracy::SetThreadName("Tracy CUDA Collector");
|
|
atexit([]() {
|
|
auto& collector = CUPTI::PersistentState::Get().collector;
|
|
collector.running = false;
|
|
collector.signal.notify_one();
|
|
collector.thread.join();
|
|
});
|
|
while (running) {
|
|
{
|
|
std::unique_lock<std::mutex> lock(mtx);
|
|
signal.wait_for(lock, std::chrono::milliseconds(period));
|
|
}
|
|
FlushActivity();
|
|
}
|
|
}
|
|
);
|
|
};
|
|
#endif
|
|
|
|
static void FlushActivityAsync()
|
|
{
|
|
#if TRACY_CUDA_ENABLE_COLLECTOR_THREAD
|
|
ZoneScoped;
|
|
auto& collector = PersistentState::Get().collector;
|
|
collector.signal.notify_one();
|
|
#endif
|
|
}
|
|
|
|
struct PersistentState {
|
|
// NOTE(marcos): these objects must remain in memory past the application
|
|
// returning from main() because the Tracy client worker thread may still
|
|
// be responding to string/source-location requests from the server
|
|
SourceLocationMap kernelSrcLoc;
|
|
StringTable demangledNameTable;
|
|
SourceLocationLUT cudaCallSourceLocation;
|
|
|
|
// NOTE(marcos): these objects do not need to persist, but their relative
|
|
// footprint is trivial enough that we don't care if we let them leak
|
|
ConcurrentHashMap<CorrelationID, APICallInfo> cudaCallSiteInfo;
|
|
ConcurrentHashMap<uintptr_t, int> memAllocAddress;
|
|
CUpti_SubscriberHandle subscriber = {};
|
|
CUDACtx* profilerHost = nullptr;
|
|
|
|
Collector collector;
|
|
|
|
static PersistentState& Get() {
|
|
static PersistentState& persistent = *(new PersistentState());
|
|
return persistent;
|
|
}
|
|
};
|
|
|
|
};
|
|
|
|
CUDACtx(uint8_t gpuContextID = 255)
|
|
{
|
|
ZoneScoped;
|
|
|
|
if (gpuContextID != 255) {
|
|
m_tracyGpuContext = gpuContextID;
|
|
return;
|
|
}
|
|
|
|
m_tracyGpuContext = GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed);
|
|
assert(m_tracyGpuContext != 255);
|
|
|
|
TracyTimestamp tTracy;
|
|
CUptiTimestamp tCUpti;
|
|
QueryTimestamps(tTracy, tCUpti);
|
|
|
|
// Announce to Tracy about a new GPU context/timeline:
|
|
auto item = Profiler::QueueSerial();
|
|
tracyMemWrite(item->hdr.type, QueueType::GpuNewContext);
|
|
tracyMemWrite(item->gpuNewContext.cpuTime, tTracy);
|
|
tracyMemWrite(item->gpuNewContext.gpuTime, (int64_t)tCUpti); // TODO: Be more careful about this cast
|
|
tracyMemWrite(item->gpuNewContext.thread, (uint32_t)0);
|
|
tracyMemWrite(item->gpuNewContext.period, 1.0f);
|
|
tracyMemWrite(item->gpuNewContext.type, GpuContextType::CUDA);
|
|
tracyMemWrite(item->gpuNewContext.context, m_tracyGpuContext);
|
|
#if TRACY_CUDA_CALIBRATED_CONTEXT
|
|
tracyMemWrite(item->gpuNewContext.flags, GpuContextCalibration);
|
|
#else
|
|
tracyMemWrite(item->gpuNewContext.flags, tracy::GpuContextFlags(0));
|
|
#endif
|
|
Profiler::QueueSerialFinish();
|
|
|
|
constexpr const char* tracyCtxName = "CUDA GPU/Device Activity";
|
|
this->Name(tracyCtxName, uint16_t(strlen(tracyCtxName)));
|
|
|
|
// NOTE(marcos): a few rounds of calibation amorthized over 1 second
|
|
// in order to get a meaningful linear regression estimator
|
|
Recalibrate();
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
Recalibrate();
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
|
Recalibrate();
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(300));
|
|
Recalibrate();
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(400));
|
|
Recalibrate();
|
|
}
|
|
|
|
~CUDACtx()
|
|
{
|
|
ZoneScoped;
|
|
}
|
|
|
|
struct Singleton {
|
|
CUDACtx* ctx = nullptr;
|
|
std::mutex m;
|
|
int ref_count = 0;
|
|
uint8_t ctx_id = 255;
|
|
static Singleton& Get() {
|
|
static Singleton singleton;
|
|
return singleton;
|
|
}
|
|
};
|
|
|
|
#if TRACY_CUDA_ENABLE_CUDA_CALL_STATS
|
|
ProfilerStats stats = {};
|
|
#endif
|
|
|
|
uint8_t m_tracyGpuContext = 255;
|
|
static constexpr size_t cacheline = 64;
|
|
alignas(cacheline) std::atomic<uint16_t> m_queryIdGen = 0;
|
|
};
|
|
|
|
}
|
|
|
|
#define TracyCUDAContext() tracy::CUDACtx::Create()
|
|
#define TracyCUDAContextDestroy(ctx) tracy::CUDACtx::Destroy(ctx)
|
|
#define TracyCUDAContextName(ctx, name, size) ctx->Name(name, size)
|
|
|
|
#define TracyCUDAStartProfiling(ctx) ctx->StartProfiling()
|
|
#define TracyCUDAStopProfiling(ctx) ctx->StopProfiling()
|
|
|
|
#define TracyCUDACollect(ctx) ctx->Collect()
|
|
|
|
#endif
|
|
|
|
#endif |