Skip to content

Commit

Permalink
gpu optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
turanszkij committed Aug 3, 2024
1 parent f3f5148 commit 3ac4ce9
Show file tree
Hide file tree
Showing 19 changed files with 166 additions and 127 deletions.
2 changes: 1 addition & 1 deletion WickedEngine/shaders/ShaderInterop_Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -1440,7 +1440,7 @@ struct VirtualTextureTileRequestsPush
uint lodCount;
uint width;
uint height;
int feedbackTextureRO;
int feedbackTextureRW;
int requestBufferRW;
int padding0;
int padding1;
Expand Down
4 changes: 2 additions & 2 deletions WickedEngine/shaders/lightingHF.hlsli
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ inline half attenuation_pointlight(in half dist2, in half range, in half range2)
// GLTF recommendation: https://github.com/KhronosGroup/glTF/tree/main/extensions/2.0/Khronos/KHR_lights_punctual#range-property
//return saturate(1 - pow(dist / range, 4)) / dist2;

// Removed pow(x, 4), and avoid zero divisions:
half dist_per_range = dist2 / max(0.0001, range2); // pow2
// Removed pow(x, 4):
half dist_per_range = dist2 / range2; // pow2 (note: range cannot be 0, in that case light is not uploaded to GPU, so here will not be zero-division)
dist_per_range *= dist_per_range; // pow4
return saturate(1 - dist_per_range) / max(0.0001, dist2);
}
Expand Down
3 changes: 1 addition & 2 deletions WickedEngine/shaders/skinningCS.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
float4 p = 0;
half3 n = 0;
half3 t = 0;
half weisum = 0;
for (uint influence = 0; influence < push.influence_div4; ++influence)
{
uint4 ind = 0;
Expand All @@ -121,8 +122,6 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
}
if (any(wei))
{
half weisum = 0;

for (uint i = 0; ((i < 4) && (weisum < 1.0)); ++i)
{
float4x4 m = skinningbuffer.Load<ShaderTransform>(push.bone_offset + ind[i] * sizeof(ShaderTransform)).GetMatrix();
Expand Down
20 changes: 10 additions & 10 deletions WickedEngine/shaders/upsample_bilateral_float4CS.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include "ShaderInterop_Postprocess.h"

#ifndef UPSAMPLE_FORMAT
#define UPSAMPLE_FORMAT float4
#define UPSAMPLE_FORMAT half4
#endif // UPSAMPLE_FORMAT

PUSHCONSTANT(postprocess, PostProcess);
Expand Down Expand Up @@ -43,31 +43,31 @@ void main(uint3 DTid : SV_DispatchThreadID)
const float lineardepth_highres = input_lineardepth_high[pixel] * GetCamera().z_far;

UPSAMPLE_FORMAT color = 0;
float sum = 0;
half sum = 0;

int2 lowres_pixel = int2(float2(pixel) * postprocess.params0.w);

for(uint i = 0; i < 4; ++i)
{
const float2 sample_uv = uv + offsets[i] * lowres_texel_size;
const float4 zzzz = input_lineardepth_low.GatherRed(sampler_linear_clamp, sample_uv) * GetCamera().z_far;
const float4 wwww = max(0.001, 1 - saturate(abs(zzzz - lineardepth_highres) * threshold));
const float4 rrrr = input.GatherRed(sampler_linear_clamp, sample_uv);
const float4 gggg = input.GatherGreen(sampler_linear_clamp, sample_uv);
const float4 bbbb = input.GatherBlue(sampler_linear_clamp, sample_uv);
const float4 aaaa = input.GatherAlpha(sampler_linear_clamp, sample_uv);
const half4 wwww = max(0.001, 1 - saturate(abs(zzzz - lineardepth_highres) * threshold));
const half4 rrrr = input.GatherRed(sampler_linear_clamp, sample_uv);
const half4 gggg = input.GatherGreen(sampler_linear_clamp, sample_uv);
const half4 bbbb = input.GatherBlue(sampler_linear_clamp, sample_uv);
const half4 aaaa = input.GatherAlpha(sampler_linear_clamp, sample_uv);

float2 sam_pixel = sample_uv * lowres_size + (-0.5 + 1.0 / 512.0); // (1.0 / 512.0) correction is described here: https://www.reedbeta.com/blog/texture-gathers-and-coordinate-precision/
float2 sam_pixel_frac = frac(sam_pixel);
half2 sam_pixel_frac = frac(sam_pixel);

color += (UPSAMPLE_FORMAT)float4(
color += (UPSAMPLE_FORMAT)half4(
bilinear(rrrr * wwww, sam_pixel_frac),
bilinear(gggg * wwww, sam_pixel_frac),
bilinear(bbbb * wwww, sam_pixel_frac),
bilinear(aaaa * wwww, sam_pixel_frac)
);

float weight = bilinear(wwww, sam_pixel_frac);
half weight = bilinear(wwww, sam_pixel_frac);
sum += weight;
}

Expand Down
2 changes: 1 addition & 1 deletion WickedEngine/shaders/virtualTextureTileRequestsCS.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ void main(uint3 DTid : SV_DispatchThreadID, uint groupIndex : SV_GroupIndex)
if (DTid.x >= push.width / 2 || DTid.y >= push.height / 2)
return;

Texture2D<uint> feedbackTexture = bindless_textures_uint[push.feedbackTextureRO];
RWTexture2D<uint> feedbackTexture = bindless_rwtextures_uint[push.feedbackTextureRW]; // this is not written, but we keep it in UAV state intentionally
RWByteAddressBuffer requestBuffer = bindless_rwbuffers[push.requestBufferRW];

uint page_count = 0;
Expand Down
1 change: 1 addition & 0 deletions WickedEngine/shaders/volumetricLight_DirectionalPS.hlsl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#define TRANSPARENT_SHADOWMAP_SECONDARY_DEPTH_CHECK // fix the lack of depth testing
#define DISABLE_SOFT_SHADOWMAP
#include "volumetricLightHF.hlsli"
#include "volumetricCloudsHF.hlsli"
#include "fogHF.hlsli"
Expand Down
1 change: 1 addition & 0 deletions WickedEngine/shaders/volumetricLight_PointPS.hlsl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#define TRANSPARENT_SHADOWMAP_SECONDARY_DEPTH_CHECK // fix the lack of depth testing
#define DISABLE_SOFT_SHADOWMAP
#include "volumetricLightHF.hlsli"
#include "fogHF.hlsli"
#include "oceanSurfaceHF.hlsli"
Expand Down
1 change: 1 addition & 0 deletions WickedEngine/shaders/volumetricLight_SpotPS.hlsl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#define TRANSPARENT_SHADOWMAP_SECONDARY_DEPTH_CHECK // fix the lack of depth testing
#define DISABLE_SOFT_SHADOWMAP
#include "volumetricLightHF.hlsli"
#include "fogHF.hlsli"
#include "oceanSurfaceHF.hlsli"
Expand Down
29 changes: 27 additions & 2 deletions WickedEngine/wiEmittedParticle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ namespace wi
if (IsPaused() || dt == 0)
return;

active_frames <<= 1; // advance next frame

emit = std::max(0.0f, emit - std::floor(emit));

center = transform.GetPosition();
Expand All @@ -349,6 +351,7 @@ namespace wi
location.transform.init();
location.count = (uint)emit;
location.color = wi::Color::White();
active_frames |= 1; // activate current frame
}

worldMatrix = transform.world;
Expand All @@ -359,13 +362,23 @@ namespace wi
// Read back statistics (with GPU delay):
const uint32_t oldest_stat_index = wi::graphics::GetDevice()->GetBufferIndex();
memcpy(&statistics, statisticsReadbackBuffer[oldest_stat_index].mapped_data, sizeof(statistics));

if (statistics.aliveCount > 0 || statistics.aliveCount_afterSimulation > 0)
{
active_frames |= 1; // activate current frame
}
}
// Accumulates `num` into the pending burst counter.
// The request is dropped entirely while the system is paused. A positive
// request also flags the current frame as active, so IsInactive() will not
// report the emitter as idle.
void EmittedParticleSystem::Burst(int num)
{
	if (!IsPaused())
	{
		burst += num;

		// Bit 0 of active_frames stands for the current frame.
		const bool has_new_particles = num > 0;
		if (has_new_particles)
		{
			active_frames |= 1;
		}
	}
}
void EmittedParticleSystem::Burst(int num, const XMFLOAT3& position, const wi::Color& color)
{
Expand All @@ -379,6 +392,11 @@ namespace wi
location.count = (uint)num;
location.transform.Create(transform);
location.color = color;

if (num > 0)
{
active_frames |= 1; // activate current frame
}
}
void EmittedParticleSystem::Burst(int num, const XMFLOAT4X4& transform, const wi::Color& color)
{
Expand All @@ -389,6 +407,11 @@ namespace wi
location.count = (uint)num;
location.transform.Create(transform);
location.color = color;

if (num > 0)
{
active_frames |= 1; // activate current frame
}
}
void EmittedParticleSystem::Restart()
{
Expand All @@ -398,10 +421,10 @@ namespace wi

void EmittedParticleSystem::UpdateGPU(uint32_t instanceIndex, const MeshComponent* mesh, CommandList cmd) const
{
if (IsInactive())
return;
if (!particleBuffer.IsValid())
{
return;
}

GraphicsDevice* device = wi::graphics::GetDevice();
device->EventBegin("UpdateEmittedParticles", cmd);
Expand Down Expand Up @@ -809,6 +832,8 @@ namespace wi

void EmittedParticleSystem::Draw(const MaterialComponent& material, CommandList cmd, const PARTICLESHADERTYPE* shadertype_override) const
{
if (IsInactive())
return;
GraphicsDevice* device = wi::graphics::GetDevice();
device->EventBegin("EmittedParticle", cmd);

Expand Down
3 changes: 3 additions & 0 deletions WickedEngine/wiEmittedParticle.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ namespace wi
float emit = 0.0f;
int burst = 0;
float dt = 0;
uint32_t active_frames = 0;

uint32_t MAX_PARTICLES = 1000;

Expand Down Expand Up @@ -137,6 +138,8 @@ namespace wi
uint32_t GetMaxParticleCount() const { return MAX_PARTICLES; }
uint64_t GetMemorySizeInBytes() const;

// Returns true when no bit of the active_frames history is set.
// UpdateCPU shifts the 32-bit history left on every update that actually
// advances (it returns early when paused or dt == 0), and bit 0 is set whenever
// the current frame emits, bursts, or reads back a non-zero alive count.
// UpdateGPU and Draw return early for inactive emitters.
bool IsInactive() const { return active_frames == 0; }

// Non-serialized attributes:
XMFLOAT3 center;
uint32_t layerMask = ~0u;
Expand Down
19 changes: 5 additions & 14 deletions WickedEngine/wiHairParticle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,15 +325,6 @@ namespace wi
GraphicsDevice* device = wi::graphics::GetDevice();
device->EventBegin("HairParticleSystem - UpdateGPU", cmd);

static thread_local wi::vector<GPUBarrier> barrier_stack;
auto barrier_stack_flush = [&]()
{
if (barrier_stack.empty())
return;
device->Barrier(barrier_stack.data(), (uint32_t)barrier_stack.size(), cmd);
barrier_stack.clear();
};

for (uint32_t i = 0; i < itemCount; ++i)
{
const UpdateGPUItem& item = items[i];
Expand Down Expand Up @@ -380,7 +371,7 @@ namespace wi
hcb.xHairLayerMask = hair.layerMask;
hcb.xHairInstanceIndex = item.instanceIndex;
device->UpdateBuffer(&hair.constantBuffer, &hcb, cmd);
barrier_stack.push_back(GPUBarrier::Buffer(&hair.constantBuffer, ResourceState::COPY_DST, ResourceState::CONSTANT_BUFFER));
wi::renderer::PushBarrier(GPUBarrier::Buffer(&hair.constantBuffer, ResourceState::COPY_DST, ResourceState::CONSTANT_BUFFER));

IndirectDrawArgsIndexedInstanced args = {};
args.BaseVertexLocation = 0;
Expand All @@ -389,15 +380,15 @@ namespace wi
args.StartIndexLocation = 0;
args.StartInstanceLocation = 0;
device->UpdateBuffer(&hair.generalBuffer, &args, cmd, sizeof(args), hair.indirect_view.offset);
barrier_stack.push_back(GPUBarrier::Buffer(&hair.generalBuffer, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS));
wi::renderer::PushBarrier(GPUBarrier::Buffer(&hair.generalBuffer, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS));

if (hair.regenerate_frame)
{
hair.regenerate_frame = false;
}
}

barrier_stack_flush();
wi::renderer::FlushBarriers(cmd);

// Simulate:
device->BindComputeShader(&cs_simulate, cmd);
Expand Down Expand Up @@ -443,10 +434,10 @@ namespace wi

device->Dispatch((hair.strandCount + THREADCOUNT_SIMULATEHAIR - 1) / THREADCOUNT_SIMULATEHAIR, 1, 1, cmd);

barrier_stack.push_back(GPUBarrier::Buffer(&hair.generalBuffer, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT | ResourceState::INDEX_BUFFER | ResourceState::SHADER_RESOURCE));
wi::renderer::PushBarrier(GPUBarrier::Buffer(&hair.generalBuffer, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT | ResourceState::INDEX_BUFFER | ResourceState::SHADER_RESOURCE));
}

barrier_stack_flush();
wi::renderer::FlushBarriers(cmd);

device->EventEnd(cmd);
}
Expand Down
18 changes: 13 additions & 5 deletions WickedEngine/wiRenderPath3D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,6 @@ namespace wi
if (scene->terrains.GetCount() > 0)
{
cmd_copypages = device->BeginCommandList(QUEUE_COPY);
device->WaitQueue(cmd_copypages, QUEUE_GRAPHICS); // sync to prev frame graphics
wi::jobsystem::Execute(ctx, [this, cmd_copypages](wi::jobsystem::JobArgs args) {
for (size_t i = 0; i < scene->terrains.GetCount(); ++i)
{
Expand Down Expand Up @@ -839,14 +838,23 @@ namespace wi

});

// async compute terrain tasks parallel with graphics prepareframe task
if (scene->terrains.GetCount() > 0)
{
CommandList cmd_updatepages = device->BeginCommandList(QUEUE_COMPUTE);
device->WaitCommandList(cmd_updatepages, cmd_copypages);
wi::jobsystem::Execute(ctx, [this, cmd_updatepages](wi::jobsystem::JobArgs args) {
for (size_t i = 0; i < scene->terrains.GetCount(); ++i)
{
scene->terrains[i].UpdateVirtualTexturesGPU(cmd_updatepages);
}
});
}

// async compute parallel with depth prepass
cmd = device->BeginCommandList(QUEUE_COMPUTE);
CommandList cmd_prepareframe_async = cmd;
device->WaitCommandList(cmd, cmd_prepareframe);
if (cmd_copypages.IsValid())
{
device->WaitCommandList(cmd, cmd_copypages);
}
wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) {

wi::renderer::BindCameraCB(
Expand Down
23 changes: 15 additions & 8 deletions WickedEngine/wiRenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ void barrier_stack_flush(CommandList cmd)
device->Barrier(barrier_stack.data(), (uint32_t)barrier_stack.size(), cmd);
barrier_stack.clear();
}
// Queues a barrier on the shared barrier_stack without submitting it.
// Queued barriers are issued together by the next FlushBarriers() call.
// NOTE(review): the header describes these helpers as thread-local batching,
// so push and flush are presumably expected on the same thread — confirm
// against the barrier_stack declaration.
void PushBarrier(const GPUBarrier& barrier)
{
barrier_stack.push_back(barrier);
}
// Public entry point for barrier_stack_flush(): submits the barriers queued
// via PushBarrier() to the device on command list `cmd` in a single Barrier
// call, then clears the queue.
void FlushBarriers(CommandList cmd)
{
barrier_stack_flush(cmd);
}

bool wireRender = false;
bool debugBoneLines = false;
Expand Down Expand Up @@ -3445,6 +3453,8 @@ void UpdateVisibility(Visibility& vis)
for (uint32_t lightIndex : vis.visibleLights)
{
const LightComponent& light = vis.scene->lights[lightIndex];
if (light.IsInactive())
continue;
if (!light.IsCastingShadow() || light.IsStatic())
continue;

Expand Down Expand Up @@ -4063,6 +4073,8 @@ void UpdatePerFrameData(
shaderentity = {};

const LightComponent& light = vis.scene->lights[lightIndex];
if (light.IsInactive())
continue;

shaderentity.layerMask = ~0u;

Expand Down Expand Up @@ -4765,11 +4777,6 @@ void UpdateRenderDataAsync(
wi::profiler::EndRange(range);
}

for (size_t i = 0; i < vis.scene->terrains.GetCount(); ++i)
{
vis.scene->terrains[i].UpdateVirtualTexturesGPU(cmd);
}

if (vis.scene->textureStreamingFeedbackBuffer.IsValid())
{
device->ClearUAV(&vis.scene->textureStreamingFeedbackBuffer, 0, cmd);
Expand Down Expand Up @@ -5786,12 +5793,12 @@ void DrawShadowmaps(
for (uint32_t lightIndex : vis.visibleLights)
{
const LightComponent& light = vis.scene->lights[lightIndex];
if (light.IsInactive())
continue;

bool shadow = light.IsCastingShadow() && !light.IsStatic();
const bool shadow = light.IsCastingShadow() && !light.IsStatic();
if (!shadow)
{
continue;
}
const wi::rectpacker::Rect& shadow_rect = vis.visibleLightShadowRects[lightIndex];

switch (light.GetType())
Expand Down
4 changes: 4 additions & 0 deletions WickedEngine/wiRenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -1222,5 +1222,9 @@ namespace wi::renderer
int RegisterCustomShader(const CustomShader& customShader);
const wi::vector<CustomShader>& GetCustomShaders();

// Thread-local barrier batching helpers:
void PushBarrier(const wi::graphics::GPUBarrier& barrier);
void FlushBarriers(wi::graphics::CommandList cmd);

};

2 changes: 2 additions & 0 deletions WickedEngine/wiScene.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4737,6 +4737,8 @@ namespace wi::scene

const TransformComponent& transform = *transforms.GetComponent(entity);
emitter.UpdateCPU(transform, dt);
if (emitter.IsInactive()) // check after UpdateCPU
return; // can skip writing TLAS instance below

GraphicsDevice* device = wi::graphics::GetDevice();

Expand Down
1 change: 1 addition & 0 deletions WickedEngine/wiScene_Components.h
Original file line number Diff line number Diff line change
Expand Up @@ -1074,6 +1074,7 @@ namespace wi::scene
inline bool IsVisualizerEnabled() const { return _flags & VISUALIZER; }
inline bool IsStatic() const { return _flags & LIGHTMAPONLY_STATIC; }
inline bool IsVolumetricCloudsEnabled() const { return _flags & VOLUMETRICCLOUDS; }
// A light is inactive when its intensity or its range is exactly zero.
// The renderer skips inactive lights in UpdateVisibility, UpdatePerFrameData
// and DrawShadowmaps, so they are never uploaded to the GPU; the point light
// attenuation shader relies on this to divide by range2 without a zero guard.
inline bool IsInactive() const { return intensity == 0 || range == 0; }

inline float GetRange() const
{
Expand Down
Loading

0 comments on commit 3ac4ce9

Please sign in to comment.