diff --git a/Engine/gapi/spirv/meshconverter.cpp b/Engine/gapi/spirv/meshconverter.cpp
index 73dc3488..479bbf35 100644
--- a/Engine/gapi/spirv/meshconverter.cpp
+++ b/Engine/gapi/spirv/meshconverter.cpp
@@ -602,8 +602,8 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
   const uint32_t _runtimearr_cmd      = code.OpTypeRuntimeArray(fn, IndirectCommand);
 
   const uint32_t EngineInternal0      = code.OpTypeStruct (fn, {_runtimearr_cmd});
-  const uint32_t EngineInternal1      = code.OpTypeStruct (fn, {uint_t, uint_t, uint_t, _runtimearr_uint});
-  const uint32_t EngineInternal2      = code.OpTypeStruct (fn, {uint_t, _runtimearr_uint});
+  const uint32_t EngineInternal1      = code.OpTypeStruct (fn, {uint_t, uint_t, uint_t, uint_t, uint_t, _runtimearr_uint});
+  const uint32_t EngineInternal2      = code.OpTypeStruct (fn, {_runtimearr_uint});
 
   const uint32_t _ptr_Uniform_EngineInternal0 = code.OpTypePointer(fn,spv::StorageClassUniform, EngineInternal0);
   const uint32_t _ptr_Uniform_EngineInternal1 = code.OpTypePointer(fn,spv::StorageClassUniform, EngineInternal1);
@@ -613,6 +613,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
   const uint32_t const1   = code.OpConstant(fn,uint_t,1);
   const uint32_t const2   = code.OpConstant(fn,uint_t,2);
   const uint32_t const3   = code.OpConstant(fn,uint_t,3);
+  const uint32_t const5   = code.OpConstant(fn,uint_t,5);
   const uint32_t const10  = code.OpConstant(fn,uint_t,10);
   const uint32_t const18  = code.OpConstant(fn,uint_t,18);
   const uint32_t const128 = code.OpConstant(fn,uint_t,128);
@@ -662,13 +663,13 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
   fn.insert(spv::OpDecorate, {EngineInternal1, spv::DecorationBufferBlock});
   fn.insert(spv::OpDecorate, {vEngine1, spv::DecorationDescriptorSet, 1});
   fn.insert(spv::OpDecorate, {vEngine1, spv::DecorationBinding, 1});
-  for(uint32_t i=0; i<4; ++i)
+  for(uint32_t i=0; i<6; ++i)
     fn.insert(spv::OpMemberDecorate, {EngineInternal1, i, spv::DecorationOffset, i*4});
 
   fn.insert(spv::OpDecorate, {EngineInternal2, spv::DecorationBufferBlock});
   fn.insert(spv::OpDecorate, {vEngine2, spv::DecorationDescriptorSet, 1});
   fn.insert(spv::OpDecorate, {vEngine2, spv::DecorationBinding, 2});
-  for(uint32_t i=0; i<2; ++i)
+  for(uint32_t i=0; i<1; ++i)
     fn.insert(spv::OpMemberDecorate, {EngineInternal2, i, spv::DecorationOffset, i*4});
 
   fn = code.findSectionEnd(libspirv::Bytecode::S_Debug);
@@ -684,14 +685,15 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
   fn.insert(spv::OpName,       EngineInternal0,    "EngineInternal0");
 
   fn.insert(spv::OpName,       EngineInternal1,    "EngineInternal1");
-  fn.insert(spv::OpMemberName, EngineInternal1, 0, "grow");
-  fn.insert(spv::OpMemberName, EngineInternal1, 1, "dispatchY");
-  fn.insert(spv::OpMemberName, EngineInternal1, 2, "dispatchZ");
-  fn.insert(spv::OpMemberName, EngineInternal1, 3, "desc");
+  fn.insert(spv::OpMemberName, EngineInternal1, 0, "varGrow");
+  fn.insert(spv::OpMemberName, EngineInternal1, 1, "grow");
+  fn.insert(spv::OpMemberName, EngineInternal1, 2, "dispatchX");
+  fn.insert(spv::OpMemberName, EngineInternal1, 3, "dispatchY");
+  fn.insert(spv::OpMemberName, EngineInternal1, 4, "dispatchZ");
+  fn.insert(spv::OpMemberName, EngineInternal1, 5, "desc");
 
   fn.insert(spv::OpName,       EngineInternal2,    "EngineInternal2");
-  fn.insert(spv::OpMemberName, EngineInternal2, 0, "grow");
-  fn.insert(spv::OpMemberName, EngineInternal2, 1, "heap");
+  fn.insert(spv::OpMemberName, EngineInternal2, 0, "heap");
 
   // engine-level main
   fn = code.end();
@@ -797,14 +799,15 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
   const uint32_t heapAllocSz = code.fetchAddBound();
   fn.insert(spv::OpIAdd, {uint_t, heapAllocSz, maxVar, indSize});
 
+  // uint heapDest  = atomicAdd(mesh.varGrow, indSize + maxVar);
   const uint32_t ptrHeapDest = code.fetchAddBound();
-  fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeapDest, vEngine2, const0});
+  fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeapDest, vEngine1, const0});
   const uint32_t heapDest  = code.fetchAddBound();
   fn.insert(spv::OpAtomicIAdd, {uint_t, heapDest, ptrHeapDest, const1/*scope*/, const0/*semantices*/, heapAllocSz});
 
   // uint meshDest = atomicAdd(mesh.grow, 1)*3;
   const uint32_t ptrMeshDest = code.fetchAddBound();
-  fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrMeshDest, vEngine1, const0});
+  fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrMeshDest, vEngine1, const1});
   const uint32_t meshDestRaw  = code.fetchAddBound();
   fn.insert(spv::OpAtomicIAdd, {uint_t, meshDestRaw, ptrMeshDest, const1/*scope*/, const0/*semantices*/, const1});
 
@@ -823,7 +826,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
     const uint32_t rDst = code.fetchAddBound();
     fn.insert(spv::OpIAdd, {uint_t, rDst, rI, heapDest});
     const uint32_t ptrHeap = code.fetchAddBound();
-    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const1, rDst});
+    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const0, rDst});
 
     const uint32_t ptrIndicesNV = code.fetchAddBound();
     fn.insert(spv::OpAccessChain, {_ptr_Workgroup_uint, ptrIndicesNV, idPrimitiveIndicesNV, rI});
@@ -860,7 +863,7 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
         block.insert(spv::OpIAdd, {uint_t, rDst, rAt, constants[seq]});
         ++seq;
         const uint32_t ptrHeap = code.fetchAddBound();
-        block.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const1, rDst});
+        block.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap, vEngine2, const0, rDst});
 
         // NOTE: ids is pointer to array of X, we need only X
         const uint32_t varPtr = code.fetchAddBound();
@@ -898,19 +901,19 @@ void MeshConverter::injectCountingPass(const uint32_t idMainFunc) {
     fn.insert(spv::OpLoad, {uint_t, workIdX, ptrWorkGroupID});
 
     const uint32_t ptrHeap0 = code.fetchAddBound();
-    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap0, vEngine1, const3, meshDest});
+    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap0, vEngine1, const5, meshDest});
     fn.insert(spv::OpStore, {ptrHeap0, workIdX});
 
     const uint32_t dest1 = code.fetchAddBound();
     fn.insert(spv::OpIAdd, {uint_t, dest1, meshDest, const1});
     const uint32_t ptrHeap1 = code.fetchAddBound();
-    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap1, vEngine1, const3, dest1});
+    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap1, vEngine1, const5, dest1});
     fn.insert(spv::OpStore, {ptrHeap1, heapDest});
 
     const uint32_t dest2 = code.fetchAddBound();
     fn.insert(spv::OpIAdd, {uint_t, dest2, meshDest, const2});
     const uint32_t ptrHeap2 = code.fetchAddBound();
-    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap2, vEngine1, const3, dest2});
+    fn.insert(spv::OpAccessChain, {_ptr_Uniform_uint, ptrHeap2, vEngine1, const5, dest2});
 
     const uint32_t tmp0 = code.fetchAddBound();
     fn.insert(spv::OpShiftLeftLogical, {uint_t, tmp0, maxVertex, const10});
diff --git a/Engine/gapi/vulkan/vdevice.cpp b/Engine/gapi/vulkan/vdevice.cpp
index 9531d159..f7a146a7 100644
--- a/Engine/gapi/vulkan/vdevice.cpp
+++ b/Engine/gapi/vulkan/vdevice.cpp
@@ -3,11 +3,8 @@
 #include "vdevice.h"
 
 #include "vcommandbuffer.h"
-#include "vcommandpool.h"
 #include "vfence.h"
 #include "vswapchain.h"
-#include "vbuffer.h"
-#include "vtexture.h"
 #include "vmeshlethelper.h"
 #include "system/api/x11api.h"
 
@@ -514,7 +511,7 @@ void VDevice::submit(VCommandBuffer& cmd, VFence* sync) {
     submitInfo.sType                  = VK_STRUCTURE_TYPE_SUBMIT_INFO_2_KHR;
     submitInfo.commandBufferInfoCount = uint32_t(cmd.chunks.size());
     submitInfo.pCommandBufferInfos    = flat.get();
-    submitInfo.waitSemaphoreInfoCount = waitCnt;
+    submitInfo.waitSemaphoreInfoCount = uint32_t(waitCnt);
     submitInfo.pWaitSemaphoreInfos    = wait2.get();
 
     graphicsQueue->submit(1,&submitInfo,fence,vkQueueSubmit2);
@@ -536,7 +533,7 @@ void VDevice::submit(VCommandBuffer& cmd, VFence* sync) {
     submitInfo.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
     submitInfo.commandBufferCount = uint32_t(cmd.chunks.size());
     submitInfo.pCommandBuffers    = flat.get();
-    submitInfo.waitSemaphoreCount = waitCnt;
+    submitInfo.waitSemaphoreCount = uint32_t(waitCnt);
     submitInfo.pWaitSemaphores    = wait.get();
     submitInfo.pWaitDstStageMask  = waitStages.get();
 
diff --git a/Engine/gapi/vulkan/vmeshlethelper.cpp b/Engine/gapi/vulkan/vmeshlethelper.cpp
index 5b094f9b..fbff0832 100644
--- a/Engine/gapi/vulkan/vmeshlethelper.cpp
+++ b/Engine/gapi/vulkan/vmeshlethelper.cpp
@@ -98,20 +98,20 @@ void VMeshletHelper::drawIndirect(VkCommandBuffer impl, uint32_t id) {
 
 void VMeshletHelper::initRP(VkCommandBuffer impl) {
   if(false) {
-    VkDrawIndexedIndirectCommand cmd = {};
+    VkDrawIndexedIndirectCommand cmd[3] = {};
     indirect.read(&cmd,0,sizeof(cmd));
 
     IVec3 cmdSz = {};
-    meshlets.read(&cmdSz,0,sizeof(cmdSz));
+    meshlets.read(&cmdSz,2*4,sizeof(cmdSz));
 
-    IVec3 desc = {};
-    meshlets.read(&desc,3*4,sizeof(desc));
+    IVec3 desc[3] = {};
+    meshlets.read(&desc,5*4,sizeof(desc));
 
-    uint32_t indSize   = (desc.z       ) & 0x3FF;
-    uint32_t maxVertex = (desc.z >> 10 ) & 0xFF;
-    uint32_t varSize   = (desc.z >> 18u);
+    uint32_t indSize   = (desc[0].z       ) & 0x3FF;
+    uint32_t maxVertex = (desc[0].z >> 10 ) & 0xFF;
+    uint32_t varSize   = (desc[0].z >> 18u);
 
-    uint32_t ibo[3] = {};
+    uint32_t ibo[3*3] = {};
     compacted.read(ibo,0,sizeof(ibo));
 
     float    vbo[11*3] = {};
@@ -119,7 +119,7 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) {
     compacted.read(vbo, 3*4,sizeof(vbo));
     compacted.read(vboI,3*4,sizeof(vboI));
 
-    float sc[11*3+3] = {};
+    float sc[12*3+3] = {};
     scratch.read(sc,0,sizeof(sc));
 
     Log::i("");
@@ -130,13 +130,10 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) {
           VK_PIPELINE_STAGE_TRANSFER_BIT,
           VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
           VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT);
-  // drawcall-related parts should be set to zeros
+  // drawcall-related parts should be set to zeros. TODO: remove
   vkCmdFillBuffer(impl, indirect.impl, 0, VK_WHOLE_SIZE, 0);
-  // {0, 1, 1, <undefined>}
-  IVec3 meshletBufInit = {0,1,1};
-  vkCmdUpdateBuffer(impl ,meshlets.impl, 0, sizeof(meshletBufInit), &meshletBufInit);
-  // var.grow
-  vkCmdFillBuffer(impl, scratch.impl, 0, sizeof(uint32_t), 0);
+  // reset meshlet counters: varGrow and grow (the first two uints of the meshlet buffer)
+  vkCmdFillBuffer(impl, meshlets.impl, 0, sizeof(uint32_t)*2, 0);
 
   barrier(impl,
           VK_PIPELINE_STAGE_TRANSFER_BIT,
@@ -146,8 +143,11 @@ void VMeshletHelper::initRP(VkCommandBuffer impl) {
   }
 
 void VMeshletHelper::sortPass(VkCommandBuffer impl, uint32_t meshCallsCount) {
+  if(meshCallsCount==0)
+    return;
+  // Issue: synchronizing on the indirect buffer ruins pipelining by serializing all render passes
+
   // prefix summ pass
-  // Issue: sync for indirect buffer will ruing pipelining, by serializing all renderpasses
   barrier(impl,
           VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
           VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
@@ -156,18 +156,19 @@ void VMeshletHelper::sortPass(VkCommandBuffer impl, uint32_t meshCallsCount) {
   vkCmdBindPipeline(impl,VK_PIPELINE_BIND_POINT_COMPUTE,prefixSum.handler->impl);
   vkCmdBindDescriptorSets(impl,VK_PIPELINE_BIND_POINT_COMPUTE, prefixSum.handler->pipelineLayout,
                           0, 1,&engSet, 0,nullptr);
+  vkCmdPushConstants(impl,prefixSum.handler->pipelineLayout,VK_SHADER_STAGE_COMPUTE_BIT,0,4,&meshCallsCount);
   vkCmdDispatch(impl, 1,1,1); // one threadgroup for prefix pass
 
   // compactage pass
   barrier(impl,
-          VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
           VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
-          VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+          VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+          VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT,
           VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT);
   vkCmdBindPipeline(impl,VK_PIPELINE_BIND_POINT_COMPUTE,compactage.handler->impl);
   vkCmdBindDescriptorSets(impl,VK_PIPELINE_BIND_POINT_COMPUTE, compactage.handler->pipelineLayout,
                           0, 1,&compSet, 0,nullptr);
-  vkCmdDispatchIndirect(impl, meshlets.impl, 0);
+  vkCmdDispatchIndirect(impl, meshlets.impl, sizeof(uint32_t)*2);
 
   // ready for draw
   barrier(impl,
diff --git a/Engine/shaders/mesh_compactage.comp b/Engine/shaders/mesh_compactage.comp
index 64d46e47..97467367 100644
--- a/Engine/shaders/mesh_compactage.comp
+++ b/Engine/shaders/mesh_compactage.comp
@@ -20,62 +20,51 @@ layout(binding = 0, std430) buffer EngineInternal0 {
   IndirectCmd cmd[];
   } indirect;
 
-layout(binding = 1, std430) readonly buffer EngineInternal3 {
+layout(binding = 1, std430) readonly buffer EngineInternal1 {
+  uint    varGrow;
   uint    grow;
+  uint    dispatchX;
   uint    dispatchY;
   uint    dispatchZ;
   uint    desc[];
   } mesh;
 
 layout(binding = 2, std430) readonly buffer EngineInternal2 {
-  uint    grow;
   uint    heap[];
   } var;
 
-layout(binding = 3, std430) buffer EngineInternal4 {
+layout(binding = 3, std430) buffer EngineInternal3 {
   uint    heap[];
   } compacted;
 
-shared uint iboOffset;
-shared uint vboOffset;
-
 void main() {
-  if(gl_WorkGroupID.x>mesh.desc.length())
-    return;
+  uint mestId     = gl_GlobalInvocationID.x*3;
 
-  const uint index = gl_LocalInvocationID.x;
+  [[branch]]
+  if(gl_GlobalInvocationID.x>=mesh.grow)
+    return;
 
-  uint mestId     = gl_WorkGroupID.x*3;
   uint self       = mesh.desc[mestId+0];
   uint indPtr     = mesh.desc[mestId+1];
   uint desc       = mesh.desc[mestId+2];
 
-  uint indSize   = (desc       ) & 0x3FF;
-  uint maxVertex = (desc >> 10 ) & 0xFF;
-  uint varSize   = (desc >> 18u);
-  uint vMem      = maxVertex*varSize;
-  uint vertPtr   = indPtr + indSize;
-
-  [[branch]]
-  if(index==0) {
-    uint firstIndex = indirect.cmd[self].firstIndex;
-    iboOffset = atomicAdd(indirect.cmd[self].indexCount, indSize) + firstIndex;
-    vboOffset = atomicAdd(indirect.cmd[self].vboOffset,  vMem);
-    }
+  uint indSize    = (desc       ) & 0x3FF;
+  uint maxVertex  = (desc >> 10 ) & 0xFF;
+  uint varSize    = (desc >> 18u);
+  uint vMem       = maxVertex*varSize;
+  uint vertPtr    = indPtr + indSize;
 
-  barrier();
+  uint firstIndex = indirect.cmd[self].firstIndex;
+  uint iboOffset  = atomicAdd(indirect.cmd[self].indexCount, indSize) + firstIndex;
+  uint vboOffset  = atomicAdd(indirect.cmd[self].vboOffset,  vMem);
 
-  uint b     = ((index+0)*vMem)/gl_WorkGroupSize.x;
-  uint e     = ((index+1)*vMem)/gl_WorkGroupSize.x;
   [[loop]]
-  for(uint i=b; i<e; ++i) {
+  for(uint i=0; i<vMem; ++i) {
     compacted.heap[vboOffset+i] = var.heap[vertPtr+i];
     }
 
-  b = ((index+0)*indSize)/gl_WorkGroupSize.x;
-  e = ((index+1)*indSize)/gl_WorkGroupSize.x;
   [[loop]]
-  for(uint i=b; i<e; ++i) {
+  for(uint i=0; i<indSize; ++i) {
     compacted.heap[iboOffset+i] = vboOffset + var.heap[indPtr+i]*varSize;
     }
   }
diff --git a/Engine/shaders/mesh_prefix_pass.comp b/Engine/shaders/mesh_prefix_pass.comp
index 2ee56ccb..5db07e45 100644
--- a/Engine/shaders/mesh_prefix_pass.comp
+++ b/Engine/shaders/mesh_prefix_pass.comp
@@ -22,15 +22,16 @@ layout(binding = 0, std430) buffer EngineInternal0
 
 layout(binding = 1, std430) buffer EngineInternal1
 {
+    uint varGrow;
     uint grow;
+    uint dispatchX;
     uint dispatchY;
     uint dispatchZ;
     uint desc[];
-} _116;
+} mesh;
 
 layout(binding = 2, std430) buffer EngineInternal2
 {
-    uint grow;
     uint heap[];
 } var;
 
@@ -40,9 +41,13 @@ shared uint maxInd;
 shared uint partialSummIbo[gl_WorkGroupSize.x];
 shared uint partialSummVbo[gl_WorkGroupSize.x];
 
+layout(push_constant, std140) uniform UboPush {
+  uint indirectCmdCount;
+  };
+
 void main() {
   uint index = gl_LocalInvocationID.x;
-  uint len   = indirect.cmd.length();
+  uint len   = indirectCmdCount;
 
   uint b = ((index+0)*len)/gl_WorkGroupSize.x;
   uint e = ((index+1)*len)/gl_WorkGroupSize.x;
@@ -66,8 +71,13 @@ void main() {
     prefixIbo += partialSummIbo[i];
     prefixVbo += partialSummVbo[i];
     }
+
+  [[branch]]
   if(index==255) {
-    maxInd = prefixIbo + partialSummIbo[index];
+    maxInd         = prefixIbo + partialSummIbo[index];
+    mesh.dispatchX = (mesh.grow+64-1)/64;
+    mesh.dispatchY = 1;
+    mesh.dispatchZ = 1;
     }
 
   memoryBarrierShared();
diff --git a/Tests/shader/mesh_compactage.comp b/Tests/shader/mesh_compactage.comp
index 6bef59c2..d68a6803 100644
--- a/Tests/shader/mesh_compactage.comp
+++ b/Tests/shader/mesh_compactage.comp
@@ -6,10 +6,7 @@ struct IndirectCmd {
   uint    firstIndex;    // prefix sum
   int     vertexOffset;  // can be abused to offset into var_buffer
   uint    firstInstance; // caps: should be zero
-
-  uint    self;
   uint    vboOffset;
-  uint    padd1;
   };
 
 layout(set = 0, binding = 10, std430) buffer EngineInternal0 {
@@ -17,6 +14,7 @@ layout(set = 0, binding = 10, std430) buffer EngineInternal0 {
   } indirect;
 
 layout(set = 0, binding = 11, std430) readonly buffer EngineInternal3 {
+  uint    varGrow;
   uint    grow;
   uint    dispatchY;
   uint    dispatchZ;
@@ -24,13 +22,12 @@ layout(set = 0, binding = 11, std430) readonly buffer EngineInternal3 {
   } mesh;
 
 layout(set = 0, binding = 12, std430) buffer EngineInternal2 {
-  uint    grow;
   uint    heap[];
   } var;
 
 layout(set = 0, binding = 1, std430) buffer EngineInternal4 {
   uint    heap[];
-  } varFlat;
+  } compacted;
 
 layout(local_size_x = 1) in;
 
@@ -54,10 +51,10 @@ void main() {
   uint vboOffset  = atomicAdd(indirect.cmd[self].vboOffset,  vMem);
 
   for(uint i=0; i<vMem; ++i) {
-    varFlat.heap[vboOffset+i] = var.heap[indDest+indSize+i];
+    compacted.heap[vboOffset+i] = var.heap[indDest+indSize+i];
     }
 
   for(uint i=0; i<indSize; ++i) {
-    varFlat.heap[iboOffset+i] = vboOffset + var.heap[indDest+i]*varSize;
+    compacted.heap[iboOffset+i] = vboOffset + var.heap[indDest+i]*varSize;
     }
   }
diff --git a/Tests/shader/mesh_prefix_sum.comp b/Tests/shader/mesh_prefix_sum.comp
index 3d40116c..aeb70e13 100644
--- a/Tests/shader/mesh_prefix_sum.comp
+++ b/Tests/shader/mesh_prefix_sum.comp
@@ -6,18 +6,22 @@ struct IndirectCmd {
   uint    firstIndex;    // prefix sum
   int     vertexOffset;  // can be abused to offset into var_buffer
   uint    firstInstance; // caps: should be zero
-
-  uint    self;
   uint    vboOffset;
-  uint    padd1;
   };
 
-layout(binding = 0, std430) buffer EngineInternal0 {
+layout(set = 0, binding = 0, std430) buffer EngineInternal0 {
   IndirectCmd cmd[];
   } indirect;
 
-layout(set = 0, binding = 1, std430) buffer EngineInternal2 {
+layout(set = 0, binding = 1, std430) readonly buffer EngineInternal3 {
+  uint    varGrow;
   uint    grow;
+  uint    dispatchY;
+  uint    dispatchZ;
+  uint    desc[];
+  } mesh;
+
+layout(set = 0, binding = 2, std430) buffer EngineInternal2 {
   uint    heap[];
   } var;
 
diff --git a/Tests/shader/simple_test.mesh.comp b/Tests/shader/simple_test.mesh.comp
index e28ce091..b66b3a5d 100644
--- a/Tests/shader/simple_test.mesh.comp
+++ b/Tests/shader/simple_test.mesh.comp
@@ -37,22 +37,28 @@ shared PerVertexData v_out[max_vertices];
 shared uint te_PrimitiveIndicesNV[3];
 shared uint te_PrimitiveCountNV;
 
-// Injected interface block, should be at set=1
-layout(set = 0, binding = 10, std430) buffer EngineInternal0
-{
-   uint    indexCount;
-   uint    instanceCount;
-   uint    firstIndex;    // prefix sum
-   int     vertexOffset;  // can be abused to offset into var_buffer
-   uint    firstInstance; // caps: should be zero
+// NOTE: 'self' should eventually be passed as the dispatch base-Y parameter instead of a push constant
+layout(std140, push_constant) uniform Push {
+  uint self;
+  };
 
-   uint    self;
-   uint    padd0;
-   uint    padd1;
-} indirect;
+struct IndirectCmd {
+  uint    indexCount;
+  uint    instanceCount;
+  uint    firstIndex;    // prefix sum
+  int     vertexOffset;  // can be abused to offset into var_buffer
+  uint    firstInstance; // caps: should be zero
+  uint    vboOffset;
+  };
+
+// Injected interface block, should be at set=1
+layout(set = 0, binding = 10, std430) buffer EngineInternal0 {
+  IndirectCmd cmd[];
+  } indirect;
 
 layout(set = 0, binding = 11, std430) buffer EngineInternal1
 {
+  uint    varGrow;
   uint    grow;
   uint    dispatchY;
   uint    dispatchZ;
@@ -61,7 +67,6 @@ layout(set = 0, binding = 11, std430) buffer EngineInternal1
 
 layout(set = 0, binding = 12, std430) buffer EngineInternal2
 {
-  uint    grow;
   uint    heap[];
 } var;
 
@@ -113,10 +118,10 @@ void main() {
   uint maxVertex = indMax+1;
   uint maxVar    = maxVertex*varSize;
 
-  atomicAdd(indirect.indexCount,    indSize);
-  atomicAdd(indirect.instanceCount, maxVar); // abuse: vbo size
-  uint heapDest  = atomicAdd(var.grow, indSize + maxVar);
-  uint meshDest  = atomicAdd(mesh.grow, 1)*3;
+  atomicAdd(indirect.cmd[self].indexCount,    indSize);
+  atomicAdd(indirect.cmd[self].instanceCount, maxVar); // abuse: vbo size
+  uint heapDest  = atomicAdd(mesh.varGrow, indSize + maxVar);
+  uint meshDest  = atomicAdd(mesh.grow,    1)*3;
 
   uint varDest   = heapDest + indSize;
 
@@ -144,7 +149,7 @@ void main() {
     }
 
   // Writeout meshlet descriptor
-  mesh.desc[meshDest+0] = indirect.self; // owner
-  mesh.desc[meshDest+1] = heapDest;       // ptr
+  mesh.desc[meshDest+0] = self;     // owner
+  mesh.desc[meshDest+1] = heapDest; // ptr
   mesh.desc[meshDest+2] = (varSize<<18u) | (maxVertex<<10u) | (indSize);
   }
diff --git a/Tests/tests/gapi/gapi_test_common.h b/Tests/tests/gapi/gapi_test_common.h
index 2ca1603e..c9825392 100644
--- a/Tests/tests/gapi/gapi_test_common.h
+++ b/Tests/tests/gapi/gapi_test_common.h
@@ -1509,11 +1509,9 @@ void MeshComputePrototype(const char* outImg) {
     uint32_t    firstIndex    = 0;
     int32_t     vertexOffset  = 0;
     uint32_t    firstInstance = 0;
-    uint32_t    self          = 0;
+
     uint32_t    vboOffset     = 0;
-    uint32_t    padd1         = 0;
     };
-  static_assert(sizeof(VkDrawIndexedIndirectCommand)==32);
 
   enum : uint32_t {
     B_Indirect = 10,
@@ -1525,29 +1523,18 @@ void MeshComputePrototype(const char* outImg) {
     const char* msDev = nullptr;
 
     GraphicsApi api{ApiFlags::Validation};
-    auto dev = api.devices();
-    for(auto& i:dev)
-      if(i.meshlets.meshShader)
-        msDev = i.name;
-    if(msDev==nullptr)
-      return;
 
     Device device(api,msDev);
     auto vbo  = device.vbo(vboData,3);
 
     VkDrawIndexedIndirectCommand ix[2] = {};
-    ix[0].self = 0;
-    ix[1].self = 1;
     auto indirect = device.ssbo(ix,sizeof(ix));
 
     auto var  = device.ssbo(nullptr, 4*16*1024);     // big buffer for meshlet data
     auto flat = device.ssbo(nullptr, var.size());    // double buffer
     auto mesh = device.ssbo(nullptr, 4*256);         // buffer meshlet descriptors
 
-    const uint32_t zero = 0;
-    var.update(&zero,0,4);
-
-    const IVec3 msz = {0,1,1};
+    const uint32_t msz[4] = {0,0,1,1};
     mesh.update(&msz,0,sizeof(msz));
 
     auto shaderMs = device.shader("shader/simple_test.mesh.comp.sprv");
@@ -1561,7 +1548,7 @@ void MeshComputePrototype(const char* outImg) {
 
     auto ubo1  = device.descriptors(psoMs);
     ubo1.set(0,  vbo);
-    ubo1.set(B_Indirect, indirect, sizeof(VkDrawIndexedIndirectCommand));
+    ubo1.set(B_Indirect, indirect);
     ubo1.set(B_Meshlet,  mesh);
     ubo1.set(B_Var,      var);
 
@@ -1569,7 +1556,8 @@ void MeshComputePrototype(const char* outImg) {
     auto psoSum = device.pipeline(shSum);
     auto uboSum = device.descriptors(psoSum);
     uboSum.set(0, indirect);
-    uboSum.set(1, var);
+    uboSum.set(1, mesh);
+    uboSum.set(2, var);
 
     auto shCompactage  = device.shader("shader/mesh_compactage.comp.sprv");
     auto psoCompactage = device.pipeline(shCompactage);
@@ -1582,9 +1570,11 @@ void MeshComputePrototype(const char* outImg) {
     auto cmd = device.commandBuffer();
     {
       auto enc = cmd.startEncoding(device);
-      enc.setUniforms(psoMs,ubo0);
+      uint32_t id = 0;
+      enc.setUniforms(psoMs,ubo0,&id,sizeof(id));
       enc.dispatch(3, 1,1);
-      enc.setUniforms(psoMs,ubo1);
+      id = 1;
+      enc.setUniforms(psoMs,ubo1,&id,sizeof(id));
       enc.dispatch(2, 1,1);
       // ^ 3+2 meshlets
 
@@ -1610,11 +1600,11 @@ void MeshComputePrototype(const char* outImg) {
     std::vector<uint32_t> flatCpu(flat.size()/4);
     device.readBytes(flat,flatCpu.data(),flat.size());
 
-    EXPECT_EQ(meshCpu[0],5);
-    for(uint32_t i=0; i<meshCpu[0]; ++i) {
-      uint32_t self = meshCpu[3+i*3+0];
-      uint32_t heap = meshCpu[3+i*3+1];
-      uint32_t desc = meshCpu[3+i*3+2];
+    EXPECT_EQ(meshCpu[1],5);
+    for(uint32_t i=0; i<meshCpu[1]; ++i) {
+      uint32_t self = meshCpu[4+i*3+0];
+      uint32_t heap = meshCpu[4+i*3+1];
+      uint32_t desc = meshCpu[4+i*3+2];
 
       uint32_t indSize   = (desc       ) & 0x3FF;
       uint32_t maxVertex = (desc >> 10 ) & 0xFF;