Skip to content

Commit

Permalink
optimize mesh emulator
Browse files Browse the repository at this point in the history
  • Loading branch information
Try committed Aug 9, 2022
1 parent 8586c60 commit c763de1
Show file tree
Hide file tree
Showing 9 changed files with 125 additions and 129 deletions.
37 changes: 20 additions & 17 deletions Engine/gapi/spirv/meshconverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,8 +602,8 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
const uint32_t _runtimearr_cmd = code.OpTypeRuntimeArray(fn, IndirectCommand);

const uint32_t EngineInternal0 = code.OpTypeStruct (fn, {_runtimearr_cmd});
const uint32_t EngineInternal1 = code.OpTypeStruct (fn, {uint_t, uint_t, uint_t, _runtimearr_uint});
const uint32_t EngineInternal2 = code.OpTypeStruct (fn, {uint_t, _runtimearr_uint});
const uint32_t EngineInternal1 = code.OpTypeStruct (fn, {uint_t, uint_t, uint_t, uint_t, uint_t, _runtimearr_uint});
const uint32_t EngineInternal2 = code.OpTypeStruct (fn, {_runtimearr_uint});

const uint32_t _ptr_Uniform_EngineInternal0 = code.OpTypePointer(fn,spv::StorageClassUniform, EngineInternal0);
const uint32_t _ptr_Uniform_EngineInternal1 = code.OpTypePointer(fn,spv::StorageClassUniform, EngineInternal1);
Expand All @@ -613,6 +613,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
const uint32_t const1 = code.OpConstant(fn,uint_t,1);
const uint32_t const2 = code.OpConstant(fn,uint_t,2);
const uint32_t const3 = code.OpConstant(fn,uint_t,3);
const uint32_t const5 = code.OpConstant(fn,uint_t,5);
const uint32_t const10 = code.OpConstant(fn,uint_t,10);
const uint32_t const18 = code.OpConstant(fn,uint_t,18);
const uint32_t const128 = code.OpConstant(fn,uint_t,128);
Expand Down Expand Up @@ -662,13 +663,13 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
fn.insert(spv::OpDecorate, {EngineInternal1, spv::DecorationBufferBlock});
fn.insert(spv::OpDecorate, {vEngine1, spv::DecorationDescriptorSet, 1});
fn.insert(spv::OpDecorate, {vEngine1, spv::DecorationBinding, 1});
for(uint32_t i=0; i<4; ++i)
for(uint32_t i=0; i<6; ++i)
fn.insert(spv::OpMemberDecorate, {EngineInternal1, i, spv::DecorationOffset, i*4});

fn.insert(spv::OpDecorate, {EngineInternal2, spv::DecorationBufferBlock});
fn.insert(spv::OpDecorate, {vEngine2, spv::DecorationDescriptorSet, 1});
fn.insert(spv::OpDecorate, {vEngine2, spv::DecorationBinding, 2});
for(uint32_t i=0; i<2; ++i)
for(uint32_t i=0; i<1; ++i)
fn.insert(spv::OpMemberDecorate, {EngineInternal2, i, spv::DecorationOffset, i*4});

fn = code.findSectionEnd(libspirv::Bytecode::S_Debug);
Expand All @@ -684,14 +685,15 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
fn.insert(spv::OpName, EngineInternal0, "EngineInternal0");

fn.insert(spv::OpName, EngineInternal1, "EngineInternal1");
fn.insert(spv::OpMemberName, EngineInternal1, 0, "grow");
fn.insert(spv::OpMemberName, EngineInternal1, 1, "dispatchY");
fn.insert(spv::OpMemberName, EngineInternal1, 2, "dispatchZ");
fn.insert(spv::OpMemberName, EngineInternal1, 3, "desc");
fn.insert(spv::OpMemberName, EngineInternal1, 0, "varGrow");
fn.insert(spv::OpMemberName, EngineInternal1, 1, "grow");
fn.insert(spv::OpMemberName, EngineInternal1, 2, "dispatchX");
fn.insert(spv::OpMemberName, EngineInternal1, 3, "dispatchY");
fn.insert(spv::OpMemberName, EngineInternal1, 4, "dispatchZ");
fn.insert(spv::OpMemberName, EngineInternal1, 5, "desc");

fn.insert(spv::OpName, EngineInternal2, "EngineInternal2");
fn.insert(spv::OpMemberName, EngineInternal2, 0, "grow");
fn.insert(spv::OpMemberName, EngineInternal2, 1, "heap");
fn.insert(spv::OpMemberName, EngineInternal2, 0, "heap");

// engine-level main
fn = code.end();
Expand Down Expand Up @@ -797,14 +799,15 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
const uint32_t heapAllocSz = code.fetchAddBound();
fn.insert(spv::OpIAdd, {uint_t, heapAllocSz, maxVar, indSize});

// uint heapDest = atomicAdd(mesh.varGrow, indSize + maxVar);
const uint32_t ptrHeapDest = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeapDest, vEngine2, const0});
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeapDest, vEngine1, const0});
const uint32_t heapDest = code.fetchAddBound();
fn.insert(spv::OpAtomicIAdd, {uint_t, heapDest, ptrHeapDest, const1/*scope*/, const0/*semantices*/, heapAllocSz});

// uint meshDest = atomicAdd(mesh.grow, 1)*3;
const uint32_t ptrMeshDest = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrMeshDest, vEngine1, const0});
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrMeshDest, vEngine1, const1});
const uint32_t meshDestRaw = code.fetchAddBound();
fn.insert(spv::OpAtomicIAdd, {uint_t, meshDestRaw, ptrMeshDest, const1/*scope*/, const0/*semantices*/, const1});

Expand All @@ -823,7 +826,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
const uint32_t rDst = code.fetchAddBound();
fn.insert(spv::OpIAdd, {uint_t, rDst, rI, heapDest});
const uint32_t ptrHeap = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const1, rDst});
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const0, rDst});

const uint32_t ptrIndicesNV = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Workgroup_uint, ptrIndicesNV, idPrimitiveIndicesNV, rI});
Expand Down Expand Up @@ -860,7 +863,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
block.insert(spv::OpIAdd, {uint_t, rDst, rAt, constants[seq]});
++seq;
const uint32_t ptrHeap = code.fetchAddBound();
block.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const1, rDst});
block.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const0, rDst});

// NOTE: ids is a pointer to an array of X; we only need X
const uint32_t varPtr = code.fetchAddBound();
Expand Down Expand Up @@ -898,19 +901,19 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
fn.insert(spv::OpLoad, {uint_t, workIdX, ptrWorkGroupID});

const uint32_t ptrHeap0 = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap0, vEngine1, const3, meshDest});
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap0, vEngine1, const5, meshDest});
fn.insert(spv::OpStore, {ptrHeap0, workIdX});

const uint32_t dest1 = code.fetchAddBound();
fn.insert(spv::OpIAdd, {uint_t, dest1, meshDest, const1});
const uint32_t ptrHeap1 = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap1, vEngine1, const3, dest1});
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap1, vEngine1, const5, dest1});
fn.insert(spv::OpStore, {ptrHeap1, heapDest});

const uint32_t dest2 = code.fetchAddBound();
fn.insert(spv::OpIAdd, {uint_t, dest2, meshDest, const2});
const uint32_t ptrHeap2 = code.fetchAddBound();
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap2, vEngine1, const3, dest2});
fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap2, vEngine1, const5, dest2});

const uint32_t tmp0 = code.fetchAddBound();
fn.insert(spv::OpShiftLeftLogical, {uint_t, tmp0, maxVertex, const10});
Expand Down
7 changes: 2 additions & 5 deletions Engine/gapi/vulkan/vdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@
#include "vdevice.h"

#include "vcommandbuffer.h"
#include "vcommandpool.h"
#include "vfence.h"
#include "vswapchain.h"
#include "vbuffer.h"
#include "vtexture.h"
#include "vmeshlethelper.h"
#include "system/api/x11api.h"

Expand Down Expand Up @@ -514,7 +511,7 @@ void VDevice::submit(VCommandBuffer& cmd, VFence* sync) {
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2_KHR;
submitInfo.commandBufferInfoCount = uint32_t(cmd.chunks.size());
submitInfo.pCommandBufferInfos = flat.get();
submitInfo.waitSemaphoreInfoCount = waitCnt;
submitInfo.waitSemaphoreInfoCount = uint32_t(waitCnt);
submitInfo.pWaitSemaphoreInfos = wait2.get();

graphicsQueue->submit(1,&submitInfo,fence,vkQueueSubmit2);
Expand All @@ -536,7 +533,7 @@ void VDevice::submit(VCommandBuffer& cmd, VFence* sync) {
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = uint32_t(cmd.chunks.size());
submitInfo.pCommandBuffers = flat.get();
submitInfo.waitSemaphoreCount = waitCnt;
submitInfo.waitSemaphoreCount = uint32_t(waitCnt);
submitInfo.pWaitSemaphores = wait.get();
submitInfo.pWaitDstStageMask = waitStages.get();

Expand Down
39 changes: 20 additions & 19 deletions Engine/gapi/vulkan/vmeshlethelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,28 +98,28 @@ void VMeshletHelper::drawIndirect(VkCommandBuffer impl, uint32_t id) {

void VMeshletHelper::initRP(VkCommandBuffer impl) {
if(false) {
VkDrawIndexedIndirectCommand cmd = {};
VkDrawIndexedIndirectCommand cmd[3] = {};
indirect.read(&cmd,0,sizeof(cmd));

IVec3 cmdSz = {};
meshlets.read(&cmdSz,0,sizeof(cmdSz));
meshlets.read(&cmdSz,2*4,sizeof(cmdSz));

IVec3 desc = {};
meshlets.read(&desc,3*4,sizeof(desc));
IVec3 desc[3] = {};
meshlets.read(&desc,5*4,sizeof(desc));

uint32_t indSize = (desc.z ) & 0x3FF;
uint32_t maxVertex = (desc.z >> 10 ) & 0xFF;
uint32_t varSize = (desc.z >> 18u);
uint32_t indSize = (desc[0].z ) & 0x3FF;
uint32_t maxVertex = (desc[0].z >> 10 ) & 0xFF;
uint32_t varSize = (desc[0].z >> 18u);

uint32_t ibo[3] = {};
uint32_t ibo[3*3] = {};
compacted.read(ibo,0,sizeof(ibo));

float vbo[11*3] = {};
uint32_t vboI[11*3] = {};
compacted.read(vbo, 3*4,sizeof(vbo));
compacted.read(vboI,3*4,sizeof(vboI));

float sc[11*3+3] = {};
float sc[12*3+3] = {};
scratch.read(sc,0,sizeof(sc));

Log::i("");
Expand All @@ -130,13 +130,10 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) {
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT);
// drawcall-related parts should be set to zeros
// drawcall-related parts should be set to zeros. TODO: remove
vkCmdFillBuffer(impl, indirect.impl, 0, VK_WHOLE_SIZE, 0);
// {0, 1, 1, <undefined>}
IVec3 meshletBufInit = {0,1,1};
vkCmdUpdateBuffer(impl ,meshlets.impl, 0, sizeof(meshletBufInit), &meshletBufInit);
// var.grow
vkCmdFillBuffer(impl, scratch.impl, 0, sizeof(uint32_t), 0);
// meshlet counters
vkCmdFillBuffer(impl, meshlets.impl, 0, sizeof(uint32_t)*2, 0);

barrier(impl,
VK_PIPELINE_STAGE_TRANSFER_BIT,
Expand All @@ -146,8 +143,11 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) {
}

void VMeshletHelper::sortPass(VkCommandBuffer impl, uint32_t meshCallsCount) {
if(meshCallsCount==0)
return;
// Issue: sync for indirect buffer will ruin pipelining, by serializing all renderpasses

// prefix sum pass
// Issue: sync for indirect buffer will ruing pipelining, by serializing all renderpasses
barrier(impl,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
Expand All @@ -156,18 +156,19 @@ void VMeshletHelper::sortPass(VkCommandBuffer impl, uint32_t meshCallsCount) {
vkCmdBindPipeline(impl,VK_PIPELINE_BIND_POINT_COMPUTE,prefixSum.handler->impl);
vkCmdBindDescriptorSets(impl,VK_PIPELINE_BIND_POINT_COMPUTE, prefixSum.handler->pipelineLayout,
0, 1,&engSet, 0,nullptr);
vkCmdPushConstants(impl,prefixSum.handler->pipelineLayout,VK_SHADER_STAGE_COMPUTE_BIT,0,4,&meshCallsCount);
vkCmdDispatch(impl, 1,1,1); // one threadgroup for prefix pass

// compactage pass
barrier(impl,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT);
vkCmdBindPipeline(impl,VK_PIPELINE_BIND_POINT_COMPUTE,compactage.handler->impl);
vkCmdBindDescriptorSets(impl,VK_PIPELINE_BIND_POINT_COMPUTE, compactage.handler->pipelineLayout,
0, 1,&compSet, 0,nullptr);
vkCmdDispatchIndirect(impl, meshlets.impl, 0);
vkCmdDispatchIndirect(impl, meshlets.impl, sizeof(uint32_t)*2);

// ready for draw
barrier(impl,
Expand Down
47 changes: 18 additions & 29 deletions Engine/shaders/mesh_compactage.comp
Original file line number Diff line number Diff line change
Expand Up @@ -20,62 +20,51 @@ layout(binding = 0, std430) buffer EngineInternal0 {
IndirectCmd cmd[];
} indirect;

layout(binding = 1, std430) readonly buffer EngineInternal3 {
layout(binding = 1, std430) readonly buffer EngineInternal1 {
uint varGrow;
uint grow;
uint dispatchX;
uint dispatchY;
uint dispatchZ;
uint desc[];
} mesh;

layout(binding = 2, std430) readonly buffer EngineInternal2 {
uint grow;
uint heap[];
} var;

layout(binding = 3, std430) buffer EngineInternal4 {
layout(binding = 3, std430) buffer EngineInternal3 {
uint heap[];
} compacted;

shared uint iboOffset;
shared uint vboOffset;

void main() {
if(gl_WorkGroupID.x>mesh.desc.length())
return;
uint mestId = gl_GlobalInvocationID.x*3;

const uint index = gl_LocalInvocationID.x;
[[branch]]
if(gl_GlobalInvocationID.x>=mesh.grow)
return;

uint mestId = gl_WorkGroupID.x*3;
uint self = mesh.desc[mestId+0];
uint indPtr = mesh.desc[mestId+1];
uint desc = mesh.desc[mestId+2];

uint indSize = (desc ) & 0x3FF;
uint maxVertex = (desc >> 10 ) & 0xFF;
uint varSize = (desc >> 18u);
uint vMem = maxVertex*varSize;
uint vertPtr = indPtr + indSize;

[[branch]]
if(index==0) {
uint firstIndex = indirect.cmd[self].firstIndex;
iboOffset = atomicAdd(indirect.cmd[self].indexCount, indSize) + firstIndex;
vboOffset = atomicAdd(indirect.cmd[self].vboOffset, vMem);
}
uint indSize = (desc ) & 0x3FF;
uint maxVertex = (desc >> 10 ) & 0xFF;
uint varSize = (desc >> 18u);
uint vMem = maxVertex*varSize;
uint vertPtr = indPtr + indSize;

barrier();
uint firstIndex = indirect.cmd[self].firstIndex;
uint iboOffset = atomicAdd(indirect.cmd[self].indexCount, indSize) + firstIndex;
uint vboOffset = atomicAdd(indirect.cmd[self].vboOffset, vMem);

uint b = ((index+0)*vMem)/gl_WorkGroupSize.x;
uint e = ((index+1)*vMem)/gl_WorkGroupSize.x;
[[loop]]
for(uint i=b; i<e; ++i) {
for(uint i=0; i<vMem; ++i) {
compacted.heap[vboOffset+i] = var.heap[vertPtr+i];
}

b = ((index+0)*indSize)/gl_WorkGroupSize.x;
e = ((index+1)*indSize)/gl_WorkGroupSize.x;
[[loop]]
for(uint i=b; i<e; ++i) {
for(uint i=0; i<indSize; ++i) {
compacted.heap[iboOffset+i] = vboOffset + var.heap[indPtr+i]*varSize;
}
}
18 changes: 14 additions & 4 deletions Engine/shaders/mesh_prefix_pass.comp
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,16 @@ layout(binding = 0, std430) buffer EngineInternal0

layout(binding = 1, std430) buffer EngineInternal1
{
uint varGrow;
uint grow;
uint dispatchX;
uint dispatchY;
uint dispatchZ;
uint desc[];
} _116;
} mesh;

layout(binding = 2, std430) buffer EngineInternal2
{
uint grow;
uint heap[];
} var;

Expand All @@ -40,9 +41,13 @@ shared uint maxInd;
shared uint partialSummIbo[gl_WorkGroupSize.x];
shared uint partialSummVbo[gl_WorkGroupSize.x];

layout(push_constant, std140) uniform UboPush {
uint indirectCmdCount;
};

void main() {
uint index = gl_LocalInvocationID.x;
uint len = indirect.cmd.length();
uint len = indirectCmdCount;

uint b = ((index+0)*len)/gl_WorkGroupSize.x;
uint e = ((index+1)*len)/gl_WorkGroupSize.x;
Expand All @@ -66,8 +71,13 @@ void main() {
prefixIbo += partialSummIbo[i];
prefixVbo += partialSummVbo[i];
}

[[branch]]
if(index==255) {
maxInd = prefixIbo + partialSummIbo[index];
maxInd = prefixIbo + partialSummIbo[index];
mesh.dispatchX = (mesh.grow+64-1)/64;
mesh.dispatchY = 1;
mesh.dispatchZ = 1;
}

memoryBarrierShared();
Expand Down
Loading

0 comments on commit c763de1

Please sign in to comment.