Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into hw
Browse files Browse the repository at this point in the history
  • Loading branch information
wolfpld committed Jun 1, 2021
2 parents 688a972 + f4d80a4 commit b7c5939
Show file tree
Hide file tree
Showing 21 changed files with 112 additions and 104 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Minimal CMake integration layer: declares a single INTERFACE target named TracyClient.
# Minimum CMake version required to process this file.
cmake_minimum_required(VERSION 3.10)
# Project name; only the C++ language is enabled.
project(TracyClient LANGUAGES CXX)
# INTERFACE library: the target itself compiles nothing and only carries usage requirements.
add_library(TracyClient INTERFACE)
# Targets that link TracyClient get this directory added to their include path.
target_include_directories(TracyClient INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ v0.x.x (xxxx-xx-xx)
- Added TRACY_NO_CALLSTACK_INLINES macro to disable inline functions
resolution in call stacks on Windows.
- Limited client query response rate.
- Improved function matching algorithm in compare traces view.
- Added minimal CMake integration layer.
- Reworked rpmalloc initialization.


v0.7.8 (2021-05-19)
Expand Down
1 change: 0 additions & 1 deletion TracyC.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ struct ___tracy_c_zone_context
// This struct, as visible to user, is immutable, so treat it as if const was declared here.
typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx;

TRACY_API void ___tracy_init_thread(void);
TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz );
TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz );

Expand Down
1 change: 1 addition & 0 deletions TracyClient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "common/TracySocket.cpp"
#include "client/tracy_rpmalloc.cpp"
#include "client/TracyDxt1.cpp"
#include "client/TracyAlloc.cpp"

#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
# include "libbacktrace/alloc.cpp"
Expand Down
5 changes: 2 additions & 3 deletions TracyD3D11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ class D3D11ZoneScope
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );

Profiler::QueueSerialFinish();
}

Expand Down Expand Up @@ -376,7 +376,7 @@ class D3D11ZoneScope
MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() );

Profiler::QueueSerialFinish();
}

Expand All @@ -389,7 +389,6 @@ class D3D11ZoneScope

static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx )
{
InitRPMallocThread();
auto ctx = (D3D11Ctx*)tracy_malloc( sizeof( D3D11Ctx ) );
new(ctx) D3D11Ctx( device, devicectx );
return ctx;
Expand Down
2 changes: 0 additions & 2 deletions TracyD3D12.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,8 +451,6 @@ namespace tracy

static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
{
InitRPMallocThread();

auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
new (ctx) D3D12QueueCtx{ device, queue };

Expand Down
1 change: 0 additions & 1 deletion TracyOpenCL.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,6 @@ namespace tracy {

static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
{
InitRPMallocThread();
auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx));
new (ctx) OpenCLCtx(context, device);
return ctx;
Expand Down
10 changes: 5 additions & 5 deletions TracyOpenGL.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
#ifndef __TRACYOPENGL_HPP__
#define __TRACYOPENGL_HPP__

#if !defined GL_TIMESTAMP && !defined GL_TIMESTAMP_EXT
# error "You must include OpenGL 3.2 headers before including TracyOpenGL.hpp"
#endif

#if !defined TRACY_ENABLE || defined __APPLE__

#define TracyGpuContext
Expand Down Expand Up @@ -35,6 +31,10 @@ class GpuCtxScope

#else

#if !defined GL_TIMESTAMP && !defined GL_TIMESTAMP_EXT
# error "You must include OpenGL 3.2 headers before including TracyOpenGL.hpp"
#endif

#include <atomic>
#include <assert.h>
#include <stdlib.h>
Expand All @@ -53,7 +53,7 @@ class GpuCtxScope
# define glQueryCounter glQueryCounterEXT
#endif

#define TracyGpuContext tracy::InitRPMallocThread(); tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx;
#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx;
#define TracyGpuContextName( name, size ) tracy::GetGpuCtx().ptr->Name( name, size );
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
Expand Down
1 change: 0 additions & 1 deletion TracyVulkan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ class VkCtxScope

static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
{
InitRPMallocThread();
auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );
new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct );
return ctx;
Expand Down
35 changes: 35 additions & 0 deletions client/TracyAlloc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#ifdef TRACY_ENABLE

#include <atomic>

#include "../common/TracyAlloc.hpp"
#include "../common/TracyYield.hpp"

namespace tracy
{

extern std::atomic<int> RpInitDone;
extern std::atomic<int> RpInitLock;

// Makes rpmalloc usable on the calling thread.
//
// Global step: rpmalloc_initialize() is executed at most once per process. RpInitDone is the
// "global initialization finished" flag and RpInitLock is a spin lock (0 = free, 1 = held)
// that serializes the threads which observed RpInitDone == 0.
//
// Per-thread step: rpmalloc_thread_initialize() is called on every invocation, after which the
// thread_local RpThreadInitDone flag is set for the calling thread.
TRACY_API void InitRpmallocPlumbing()
{
    if( !RpInitDone.load( std::memory_order_acquire ) )
    {
        // Take the spin lock. The successful exchange must use acquire ordering so that it
        // synchronizes with the release store that unlocks below; with release-only ordering
        // the load half of the exchange is relaxed and the re-check could formally still read
        // a stale RpInitDone == 0, running rpmalloc_initialize() a second time.
        int expected = 0;
        while( !RpInitLock.compare_exchange_weak( expected, 1, std::memory_order_acquire, std::memory_order_relaxed ) )
        {
            // A failed exchange overwrites 'expected' with the observed value, so reset it.
            expected = 0;
            YieldThread();
        }

        // Re-check under the lock: another thread may have completed the global
        // initialization while this one was waiting.
        if( !RpInitDone.load( std::memory_order_acquire ) )
        {
            rpmalloc_initialize();
            RpInitDone.store( 1, std::memory_order_release );
        }

        // Unlock.
        RpInitLock.store( 0, std::memory_order_release );
    }

    rpmalloc_thread_initialize();
    RpThreadInitDone = true;
}

}

#endif
79 changes: 9 additions & 70 deletions client/TracyProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
#include "../common/TracyAlign.hpp"
#include "../common/TracySocket.hpp"
#include "../common/TracySystem.hpp"
#include "../common/TracyYield.hpp"
#include "../common/tracy_lz4.hpp"
#include "tracy_rpmalloc.hpp"
#include "TracyCallstack.hpp"
Expand Down Expand Up @@ -118,45 +119,6 @@ extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PR
namespace tracy
{

// File-local one-time initialization plumbing for rpmalloc. Every platform branch defines an
// InitOnceCallback that calls rpmalloc_initialize(), plus the control object of the
// once-primitive that RPMallocInit uses to invoke it.
namespace
{
# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA
// Windows/Cygwin, Vista or newer: callback handed to InitOnceExecuteOnce. All three
// parameters are unused; always returns TRUE.
BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/)
{
rpmalloc_initialize();
return TRUE;
}
// Control object for InitOnceExecuteOnce.
INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT;
# elif defined __linux__
// Linux: callback handed to pthread_once.
void InitOnceCallback()
{
rpmalloc_initialize();
}
// Control object for pthread_once.
pthread_once_t once_control = PTHREAD_ONCE_INIT;
# else
// Any other platform: callback handed to std::call_once.
void InitOnceCallback()
{
rpmalloc_initialize();
}
// Control object for std::call_once.
std::once_flag once_flag;
# endif
}

// Helper type whose constructor does all the work: it runs InitOnceCallback through the
// platform's once-primitive and then calls rpmalloc_thread_initialize() on the constructing
// thread. The struct declares no data members.
struct RPMallocInit
{
RPMallocInit()
{
// Global part: hand InitOnceCallback (which calls rpmalloc_initialize()) to the
// once-primitive selected for this platform.
# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA
InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr );
# elif defined __linux__
pthread_once( &once_control, InitOnceCallback );
# else
std::call_once( once_flag, InitOnceCallback );
# endif
// Per-thread part: executed on every construction.
rpmalloc_thread_initialize();
}
};

#ifndef TRACY_DELAYED_INIT

struct InitTimeWrapper
Expand Down Expand Up @@ -964,12 +926,6 @@ TRACY_API int64_t GetFrequencyQpc()
#ifdef TRACY_DELAYED_INIT
struct ThreadNameData;
TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue();
TRACY_API void InitRPMallocThread();

// Constructs a local RPMallocInit object; the initialization work happens in that type's
// constructor, and the object is destroyed when the function returns.
void InitRPMallocThread()
{
RPMallocInit rpinit;
}

struct ProfilerData
{
Expand All @@ -991,19 +947,21 @@ struct ProducerWrapper
// Bundle of profiler state built from a ProfilerData reference. The pthread-key code path in
// this file allocates one instance per thread.
struct ProfilerThreadData
{
// Builds the producer token from 'data' and starts with a null GPU context pointer.
ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {}
// Declared first, so it is constructed before the other members; its constructor performs
// the rpmalloc initialization.
RPMallocInit rpmalloc_init;
ProducerWrapper token;
GpuCtxWrapper gpuCtx;
# ifdef TRACY_ON_DEMAND
// Present only when TRACY_ON_DEMAND is defined.
LuaZoneState luaZoneState;
# endif
};

std::atomic<int> RpInitDone { 0 };
std::atomic<int> RpInitLock { 0 };
thread_local bool RpThreadInitDone = false;

# ifdef TRACY_MANUAL_LIFETIME
ProfilerData* s_profilerData = nullptr;
TRACY_API void StartupProfiler()
{
RPMallocInit init;
s_profilerData = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) );
new (s_profilerData) ProfilerData();
s_profilerData->profiler.SpawnWorkerThreads();
Expand All @@ -1030,11 +988,10 @@ static ProfilerData& GetProfilerData()
if( !ptr )
{
int expected = 0;
while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; }
while( !profilerDataLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); }
ptr = profilerData.load( std::memory_order_acquire );
if( !ptr )
{
RPMallocInit init;
ptr = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) );
new (ptr) ProfilerData();
profilerData.store( ptr, std::memory_order_release );
Expand Down Expand Up @@ -1071,7 +1028,6 @@ struct ProfilerThreadDataKey
void* p = pthread_getspecific(m_key);
if (!p)
{
RPMallocInit init;
p = (ProfilerThreadData*)tracy_malloc( sizeof( ProfilerThreadData ) );
new (p) ProfilerThreadData(GetProfilerData());
pthread_setspecific(m_key, p);
Expand Down Expand Up @@ -1123,18 +1079,12 @@ namespace
# endif

#else
// Variant used when TRACY_DELAYED_INIT is not defined: only performs the per-thread rpmalloc
// initialization by calling rpmalloc_thread_initialize().
TRACY_API void InitRPMallocThread()
{
rpmalloc_thread_initialize();
}

// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this.

// 1a. But s_queue is needed for initialization of variables in point 2.
extern moodycamel::ConcurrentQueue<QueueItem> s_queue;

thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init;

// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread.
thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue );
thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) };
Expand All @@ -1147,7 +1097,9 @@ thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThr
# endif

static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() };
static RPMallocInit init_order(102) s_rpmalloc_init;
std::atomic<int> init_order(102) RpInitDone( 0 );
std::atomic<int> init_order(102) RpInitLock( 0 );
thread_local bool RpThreadInitDone = false;
moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc );
std::atomic<uint32_t> init_order(104) s_lockCounter( 0 );
std::atomic<uint8_t> init_order(104) s_gpuCtxCounter( 0 );
Expand Down Expand Up @@ -3613,19 +3565,6 @@ TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source
return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
}

// thread_local variables are not initialized on thread creation, at least on GNU/Linux.
// Instead, they are initialized on their first ODR-use. This means that the allocator is not
// automatically initialized every time a thread is created. Therefore, C API users are given a
// simple function to call every time they create a thread. All sorts of per-thread
// initialization can then be placed here.
TRACY_API void ___tracy_init_thread(void) {
#ifdef TRACY_DELAYED_INIT
// The return value is deliberately discarded; presumably the call is made for the per-thread
// setup it triggers -- confirm against GetProfilerThreadData().
(void)tracy::GetProfilerThreadData();
#else
// References the thread_local s_rpmalloc_thread_init object; presumably intended as the
// first use that triggers its initialization on the calling thread.
(void)tracy::s_rpmalloc_thread_init;
#endif
}

#ifdef __cplusplus
}
#endif
Expand Down
10 changes: 0 additions & 10 deletions client/TracyProfiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ TRACY_API std::atomic<uint32_t>& GetLockCounter();
TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter();
TRACY_API GpuCtxWrapper& GetGpuCtx();
TRACY_API uint64_t GetThreadHandle();
TRACY_API void InitRPMallocThread();
TRACY_API bool ProfilerAvailable();
TRACY_API int64_t GetFrequencyQpc();

Expand Down Expand Up @@ -295,7 +294,6 @@ class Profiler
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}

Expand All @@ -315,7 +313,6 @@ class Profiler
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}

Expand All @@ -333,7 +330,6 @@ class Profiler
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}

Expand All @@ -356,7 +352,6 @@ class Profiler
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}

Expand All @@ -372,7 +367,6 @@ class Profiler
static tracy_force_inline void MessageAppInfo( const char* txt, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
InitRPMallocThread();
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
TracyLfqPrepare( QueueType::MessageAppInfo );
Expand Down Expand Up @@ -423,7 +417,6 @@ class Profiler
# endif
const auto thread = GetThreadHandle();

InitRPMallocThread();
auto callstack = Callstack( depth );

profiler.m_serialLock.lock();
Expand All @@ -445,7 +438,6 @@ class Profiler
# endif
const auto thread = GetThreadHandle();

InitRPMallocThread();
auto callstack = Callstack( depth );

profiler.m_serialLock.lock();
Expand Down Expand Up @@ -495,7 +487,6 @@ class Profiler
# endif
const auto thread = GetThreadHandle();

InitRPMallocThread();
auto callstack = Callstack( depth );

profiler.m_serialLock.lock();
Expand All @@ -518,7 +509,6 @@ class Profiler
# endif
const auto thread = GetThreadHandle();

InitRPMallocThread();
auto callstack = Callstack( depth );

profiler.m_serialLock.lock();
Expand Down
Loading

0 comments on commit b7c5939

Please sign in to comment.