diff --git a/Engine/gapi/spirv/meshconverter.cpp b/Engine/gapi/spirv/meshconverter.cpp index 73dc3488..479bbf35 100644 --- a/Engine/gapi/spirv/meshconverter.cpp +++ b/Engine/gapi/spirv/meshconverter.cpp @@ -602,8 +602,8 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { const uint32_t _runtimearr_cmd = code.OpTypeRuntimeArray(fn, IndirectCommand); const uint32_t EngineInternal0 = code.OpTypeStruct (fn, {_runtimearr_cmd}); - const uint32_t EngineInternal1 = code.OpTypeStruct (fn, {uint_t, uint_t, uint_t, _runtimearr_uint}); - const uint32_t EngineInternal2 = code.OpTypeStruct (fn, {uint_t, _runtimearr_uint}); + const uint32_t EngineInternal1 = code.OpTypeStruct (fn, {uint_t, uint_t, uint_t, uint_t, uint_t, _runtimearr_uint}); + const uint32_t EngineInternal2 = code.OpTypeStruct (fn, {_runtimearr_uint}); const uint32_t _ptr_Uniform_EngineInternal0 = code.OpTypePointer(fn,spv::StorageClassUniform, EngineInternal0); const uint32_t _ptr_Uniform_EngineInternal1 = code.OpTypePointer(fn,spv::StorageClassUniform, EngineInternal1); @@ -613,6 +613,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { const uint32_t const1 = code.OpConstant(fn,uint_t,1); const uint32_t const2 = code.OpConstant(fn,uint_t,2); const uint32_t const3 = code.OpConstant(fn,uint_t,3); + const uint32_t const5 = code.OpConstant(fn,uint_t,5); const uint32_t const10 = code.OpConstant(fn,uint_t,10); const uint32_t const18 = code.OpConstant(fn,uint_t,18); const uint32_t const128 = code.OpConstant(fn,uint_t,128); @@ -662,13 +663,13 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { fn.insert(spv::OpDecorate, {EngineInternal1, spv::DecorationBufferBlock}); fn.insert(spv::OpDecorate, {vEngine1, spv::DecorationDescriptorSet, 1}); fn.insert(spv::OpDecorate, {vEngine1, spv::DecorationBinding, 1}); - for(uint32_t i=0; i<4; ++i) + for(uint32_t i=0; i<6; ++i) fn.insert(spv::OpMemberDecorate, {EngineInternal1, i, spv::DecorationOffset, i*4}); fn.insert(spv::OpDecorate, {EngineInternal2, spv::DecorationBufferBlock}); fn.insert(spv::OpDecorate, {vEngine2, spv::DecorationDescriptorSet, 1}); fn.insert(spv::OpDecorate, {vEngine2, spv::DecorationBinding, 2}); - for(uint32_t i=0; i<2; ++i) + for(uint32_t i=0; i<1; ++i) fn.insert(spv::OpMemberDecorate, {EngineInternal2, i, spv::DecorationOffset, i*4}); fn = code.findSectionEnd(libspirv::Bytecode::S_Debug); @@ -684,14 +685,15 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { fn.insert(spv::OpName, EngineInternal0, "EngineInternal0"); fn.insert(spv::OpName, EngineInternal1, "EngineInternal1"); - fn.insert(spv::OpMemberName, EngineInternal1, 0, "grow"); - fn.insert(spv::OpMemberName, EngineInternal1, 1, "dispatchY"); - fn.insert(spv::OpMemberName, EngineInternal1, 2, "dispatchZ"); - fn.insert(spv::OpMemberName, EngineInternal1, 3, "desc"); + fn.insert(spv::OpMemberName, EngineInternal1, 0, "varGrow"); + fn.insert(spv::OpMemberName, EngineInternal1, 1, "grow"); + fn.insert(spv::OpMemberName, EngineInternal1, 2, "dispatchX"); + fn.insert(spv::OpMemberName, EngineInternal1, 3, "dispatchY"); + fn.insert(spv::OpMemberName, EngineInternal1, 4, "dispatchZ"); + fn.insert(spv::OpMemberName, EngineInternal1, 5, "desc"); fn.insert(spv::OpName, EngineInternal2, "EngineInternal2"); - fn.insert(spv::OpMemberName, EngineInternal2, 0, "grow"); - fn.insert(spv::OpMemberName, EngineInternal2, 1, "heap"); + fn.insert(spv::OpMemberName, EngineInternal2, 0, "heap"); // engine-level main fn = code.end(); @@ -797,14 +799,15 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { const uint32_t heapAllocSz = code.fetchAddBound(); fn.insert(spv::OpIAdd, {uint_t, heapAllocSz, maxVar, indSize}); + // uint heapDest = atomicAdd(mesh.varGrow, indSize + maxVar); const uint32_t ptrHeapDest = code.fetchAddBound(); - fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeapDest, vEngine2, const0}); + fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeapDest, vEngine1, const0}); const uint32_t heapDest = code.fetchAddBound(); fn.insert(spv::OpAtomicIAdd, {uint_t, heapDest, ptrHeapDest, const1/*scope*/, const0/*semantices*/, heapAllocSz}); // uint meshDest = atomicAdd(mesh.grow, 1)*3; const uint32_t ptrMeshDest = code.fetchAddBound(); - fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrMeshDest, vEngine1, const0}); + fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrMeshDest, vEngine1, const1}); const uint32_t meshDestRaw = code.fetchAddBound(); fn.insert(spv::OpAtomicIAdd, {uint_t, meshDestRaw, ptrMeshDest, const1/*scope*/, const0/*semantices*/, const1}); @@ -823,7 +826,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { const uint32_t rDst = code.fetchAddBound(); fn.insert(spv::OpIAdd, {uint_t, rDst, rI, heapDest}); const uint32_t ptrHeap = code.fetchAddBound(); - fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const1, rDst}); + fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const0, rDst}); const uint32_t ptrIndicesNV = code.fetchAddBound(); fn.insert(spv::OpAccessChain, {_ptr_Workgroup_uint, ptrIndicesNV, idPrimitiveIndicesNV, rI}); @@ -860,7 +863,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { block.insert(spv::OpIAdd, {uint_t, rDst, rAt, constants[seq]}); ++seq; const uint32_t ptrHeap = code.fetchAddBound(); - block.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const1, rDst}); + block.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const0, rDst}); // NOTE: ids is pointer to array of X, we need only X const uint32_t varPtr = code.fetchAddBound(); @@ -898,19 +901,19 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) { fn.insert(spv::OpLoad, {uint_t, workIdX, ptrWorkGroupID}); const uint32_t ptrHeap0 = code.fetchAddBound(); - fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap0, vEngine1, const3, meshDest}); + fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap0, vEngine1, const5, meshDest}); fn.insert(spv::OpStore, {ptrHeap0, workIdX}); const uint32_t dest1 = code.fetchAddBound(); fn.insert(spv::OpIAdd, {uint_t, dest1, meshDest, const1}); const uint32_t ptrHeap1 = code.fetchAddBound(); - fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap1, vEngine1, const3, dest1}); + fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap1, vEngine1, const5, dest1}); fn.insert(spv::OpStore, {ptrHeap1, heapDest}); const uint32_t dest2 = code.fetchAddBound(); fn.insert(spv::OpIAdd, {uint_t, dest2, meshDest, const2}); const uint32_t ptrHeap2 = code.fetchAddBound(); - fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap2, vEngine1, const3, dest2}); + fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap2, vEngine1, const5, dest2}); const uint32_t tmp0 = code.fetchAddBound(); fn.insert(spv::OpShiftLeftLogical, {uint_t, tmp0, maxVertex, const10}); diff --git a/Engine/gapi/vulkan/vdevice.cpp b/Engine/gapi/vulkan/vdevice.cpp index 9531d159..f7a146a7 100644 --- a/Engine/gapi/vulkan/vdevice.cpp +++ b/Engine/gapi/vulkan/vdevice.cpp @@ -3,11 +3,8 @@ #include "vdevice.h" #include "vcommandbuffer.h" -#include "vcommandpool.h" #include "vfence.h" #include "vswapchain.h" -#include "vbuffer.h" -#include "vtexture.h" #include "vmeshlethelper.h" #include "system/api/x11api.h" @@ -514,7 +511,7 @@ void VDevice::submit(VCommandBuffer& cmd, VFence* sync) { submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2_KHR; submitInfo.commandBufferInfoCount = uint32_t(cmd.chunks.size()); submitInfo.pCommandBufferInfos = flat.get(); - submitInfo.waitSemaphoreInfoCount = waitCnt; + submitInfo.waitSemaphoreInfoCount = uint32_t(waitCnt); submitInfo.pWaitSemaphoreInfos = wait2.get(); graphicsQueue->submit(1,&submitInfo,fence,vkQueueSubmit2); @@ -536,7 +533,7 @@ void VDevice::submit(VCommandBuffer& cmd, VFence* sync) { submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.commandBufferCount = uint32_t(cmd.chunks.size()); submitInfo.pCommandBuffers = flat.get(); - submitInfo.waitSemaphoreCount = waitCnt; + submitInfo.waitSemaphoreCount = uint32_t(waitCnt); submitInfo.pWaitSemaphores = wait.get(); submitInfo.pWaitDstStageMask = waitStages.get(); diff --git a/Engine/gapi/vulkan/vmeshlethelper.cpp b/Engine/gapi/vulkan/vmeshlethelper.cpp index 5b094f9b..fbff0832 100644 --- a/Engine/gapi/vulkan/vmeshlethelper.cpp +++ b/Engine/gapi/vulkan/vmeshlethelper.cpp @@ -98,20 +98,20 @@ void VMeshletHelper::drawIndirect(VkCommandBuffer impl, uint32_t id) { void VMeshletHelper::initRP(VkCommandBuffer impl) { if(false) { - VkDrawIndexedIndirectCommand cmd = {}; + VkDrawIndexedIndirectCommand cmd[3] = {}; indirect.read(&cmd,0,sizeof(cmd)); IVec3 cmdSz = {}; - meshlets.read(&cmdSz,0,sizeof(cmdSz)); + meshlets.read(&cmdSz,2*4,sizeof(cmdSz)); - IVec3 desc = {}; - meshlets.read(&desc,3*4,sizeof(desc)); + IVec3 desc[3] = {}; + meshlets.read(&desc,5*4,sizeof(desc)); - uint32_t indSize = (desc.z ) & 0x3FF; - uint32_t maxVertex = (desc.z >> 10 ) & 0xFF; - uint32_t varSize = (desc.z >> 18u); + uint32_t indSize = (desc[0].z ) & 0x3FF; + uint32_t maxVertex = (desc[0].z >> 10 ) & 0xFF; + uint32_t varSize = (desc[0].z >> 18u); - uint32_t ibo[3] = {}; + uint32_t ibo[3*3] = {}; compacted.read(ibo,0,sizeof(ibo)); float vbo[11*3] = {}; @@ -119,7 +119,7 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) { compacted.read(vbo, 3*4,sizeof(vbo)); compacted.read(vboI,3*4,sizeof(vboI)); - float sc[11*3+3] = {}; + float sc[12*3+3] = {}; scratch.read(sc,0,sizeof(sc)); Log::i(""); @@ -130,13 +130,10 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) { VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT); - // drawcall-related parts should be set to zeros + // drawcall-related parts should be set to zeros. TODO: remove vkCmdFillBuffer(impl, indirect.impl, 0, VK_WHOLE_SIZE, 0); - // {0, 1, 1, } - IVec3 meshletBufInit = {0,1,1}; - vkCmdUpdateBuffer(impl ,meshlets.impl, 0, sizeof(meshletBufInit), &meshletBufInit); - // var.grow - vkCmdFillBuffer(impl, scratch.impl, 0, sizeof(uint32_t), 0); + // meshlet counters + vkCmdFillBuffer(impl, meshlets.impl, 0, sizeof(uint32_t)*2, 0); barrier(impl, VK_PIPELINE_STAGE_TRANSFER_BIT, @@ -146,8 +143,11 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) { } void VMeshletHelper::sortPass(VkCommandBuffer impl, uint32_t meshCallsCount) { + if(meshCallsCount==0) + return; + // Issue: sync for indirect buffer will ruin pipelining, by serializing all renderpasses + // prefix summ pass - // Issue: sync for indirect buffer will ruing pipelining, by serializing all renderpasses barrier(impl, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, @@ -156,18 +156,19 @@ void VMeshletHelper::sortPass(VkCommandBuffer impl, uint32_t meshCallsCount) { vkCmdBindPipeline(impl,VK_PIPELINE_BIND_POINT_COMPUTE,prefixSum.handler->impl); vkCmdBindDescriptorSets(impl,VK_PIPELINE_BIND_POINT_COMPUTE, prefixSum.handler->pipelineLayout, 0, 1,&engSet, 0,nullptr); + vkCmdPushConstants(impl,prefixSum.handler->pipelineLayout,VK_SHADER_STAGE_COMPUTE_BIT,0,4,&meshCallsCount); vkCmdDispatch(impl, 1,1,1); // one threadgroup for prefix pass // compactage pass barrier(impl, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT); vkCmdBindPipeline(impl,VK_PIPELINE_BIND_POINT_COMPUTE,compactage.handler->impl); vkCmdBindDescriptorSets(impl,VK_PIPELINE_BIND_POINT_COMPUTE, compactage.handler->pipelineLayout, 0, 1,&compSet, 0,nullptr); - vkCmdDispatchIndirect(impl, meshlets.impl, 0); + vkCmdDispatchIndirect(impl, meshlets.impl, sizeof(uint32_t)*2); // ready for draw barrier(impl, diff --git a/Engine/shaders/mesh_compactage.comp b/Engine/shaders/mesh_compactage.comp index 64d46e47..97467367 100644 --- a/Engine/shaders/mesh_compactage.comp +++ b/Engine/shaders/mesh_compactage.comp @@ -20,62 +20,51 @@ layout(binding = 0, std430) buffer EngineInternal0 { IndirectCmd cmd[]; } indirect; -layout(binding = 1, std430) readonly buffer EngineInternal3 { +layout(binding = 1, std430) readonly buffer EngineInternal1 { + uint varGrow; uint grow; + uint dispatchX; uint dispatchY; uint dispatchZ; uint desc[]; } mesh; layout(binding = 2, std430) readonly buffer EngineInternal2 { - uint grow; uint heap[]; } var; -layout(binding = 3, std430) buffer EngineInternal4 { +layout(binding = 3, std430) buffer EngineInternal3 { uint heap[]; } compacted; -shared uint iboOffset; -shared uint vboOffset; - void main() { - if(gl_WorkGroupID.x>mesh.desc.length()) - return; + uint mestId = gl_GlobalInvocationID.x*3; - const uint index = gl_LocalInvocationID.x; + [[branch]] + if(gl_GlobalInvocationID.x>=mesh.grow) + return; - uint mestId = gl_WorkGroupID.x*3; uint self = mesh.desc[mestId+0]; uint indPtr = mesh.desc[mestId+1]; uint desc = mesh.desc[mestId+2]; - uint indSize = (desc ) & 0x3FF; - uint maxVertex = (desc >> 10 ) & 0xFF; - uint varSize = (desc >> 18u); - uint vMem = maxVertex*varSize; - uint vertPtr = indPtr + indSize; - - [[branch]] - if(index==0) { - uint firstIndex = indirect.cmd[self].firstIndex; - iboOffset = atomicAdd(indirect.cmd[self].indexCount, indSize) + firstIndex; - vboOffset = atomicAdd(indirect.cmd[self].vboOffset, vMem); - } + uint indSize = (desc ) & 0x3FF; + uint maxVertex = (desc >> 10 ) & 0xFF; + uint varSize = (desc >> 18u); + uint vMem = maxVertex*varSize; + uint vertPtr = indPtr + indSize; - barrier(); + uint firstIndex = indirect.cmd[self].firstIndex; + uint iboOffset = atomicAdd(indirect.cmd[self].indexCount, indSize) + firstIndex; + uint vboOffset = atomicAdd(indirect.cmd[self].vboOffset, vMem); - uint b = ((index+0)*vMem)/gl_WorkGroupSize.x; - uint e = ((index+1)*vMem)/gl_WorkGroupSize.x; [[loop]] - for(uint i=b; i flatCpu(flat.size()/4); device.readBytes(flat,flatCpu.data(),flat.size()); - EXPECT_EQ(meshCpu[0],5); - for(uint32_t i=0; i> 10 ) & 0xFF;