From 1e3f5dd9322f1eb1ffd816ed7e5f59e54362e504 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 20 Nov 2024 16:44:36 +0100 Subject: [PATCH 01/11] Decide on the patchable parameters for the TLAS and BLAS builds. Note that pointer/build param encoding stuff shouldn't be in the CPU side but don't touch anything. Also fix a typo, change the SRange to a std::span, and add default SPIR-V optimizer if none provided to asset converter. --- include/nbl/asset/IAccelerationStructure.h | 2 +- include/nbl/asset/ICPUAccelerationStructure.h | 14 +-- include/nbl/video/utilities/CAssetConverter.h | 94 ++++++++++++++++++- 3 files changed, 99 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 94e53238b2..46d8a5c4c6 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -155,7 +155,7 @@ class ITopLevelAccelerationStructure : public AccelerationStructure PREFER_FAST_BUILD_BIT = 0x1u<<3u, LOW_MEMORY_BIT = 0x1u<<4u, // Synthetic flag we use to indicate `VkAccelerationStructureGeometryInstancesDataKHR::arrayOfPointers` - INSTANCE_DATA_IS_POINTERS_TYPE_ENCODED_LSB = 0x1u<<5u, + INSTANCE_DATA_IS_POINTERS_TYPE_ENCODED_LSB = 0x1u<<5u, // this flag really shouldn't be settable outside of `video::IGPU` // Provided by VK_NV_ray_tracing_motion_blur, but we always override and deduce from creation flag because of // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-dstAccelerationStructure-04927 //MOTION_BIT = 0x1u<<5u, diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index dfc96ce01a..87dcee99ab 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -337,17 +337,17 @@ class ICPUTopLevelAccelerationStructure final : public ITopLevelAccelerationStru std::variant instance = StaticInstance{}; }; - core::SRange getInstances() + std::span getInstances() { if (!isMutable() || !m_instances) - return {nullptr,nullptr}; - return {m_instances->begin(),m_instances->end()}; + return {}; + return {m_instances->data(),m_instances->size()}; } - core::SRange getInstances() const + std::span getInstances() const { if (!m_instances) - return {nullptr,nullptr}; - return {m_instances->begin(),m_instances->end()}; + return {}; + return {m_instances->data(),m_instances->size()}; } bool setInstances(core::smart_refctd_dynamic_array&& _instances) { @@ -367,7 +367,7 @@ class ICPUTopLevelAccelerationStructure final : public ITopLevelAccelerationStru } //! 
- constexpr static inline auto AssetType = ET_BOTOM_LEVEL_ACCELERATION_STRUCTURE; + constexpr static inline auto AssetType = ET_TOP_LEVEL_ACCELERATION_STRUCTURE; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index cc3e0a93b9..452a52f3da 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -38,7 +38,8 @@ class CAssetConverter : public core::IReferenceCounted asset::ICPUSampler, asset::ICPUShader, asset::ICPUBuffer, - // acceleration structures, + asset::ICPUBottomLevelAccelerationStructure, + asset::ICPUTopLevelAccelerationStructure, asset::ICPUImage, asset::ICPUBufferView, asset::ICPUImageView, @@ -71,6 +72,14 @@ class CAssetConverter : public core::IReferenceCounted { if (!params.valid()) return nullptr; + #ifndef _NBL_DEBUG + if (!params.optimizer) + { + using pass_e = asset::ISPIRVOptimizer::E_OPTIMIZER_PASS; + // shall we do others? + params.optimizer = core::make_smart_rectd_ptr({EOP_STRIP_DEBUG_INFO}); + } + #endif return core::smart_refctd_ptr(new CAssetConverter(std::move(params)),core::dont_grab); } // When getting dependents, the creation parameters of GPU objects will be produced and patched appropriately. @@ -149,6 +158,71 @@ class CAssetConverter : public core::IReferenceCounted return {true,retval}; } }; + struct NBL_API2 acceleration_structure_patch_base + { + public: + enum class BuildPreference : uint8_t + { + None = 0, + FastTrace = 1, + FastBuild = 2, + Invalid = 3 + }; + + //! select build flags + uint8_t allowUpdate : 1 = false; + uint8_t allowCompaction : 1 = false; + uint8_t allowDataAccess : 1 = false; + BuildPreference preference : 2 = BuildPreference::Invalid; + uint8_t lowMemory : 1 = false; + //! 
things that control the build + uint8_t hostBuild : 1 = false; + uint8_t compactAfterBuild : 1 = false; + + protected: + bool valid(const ILogicalDevice* device); + + template + std::pair combine_impl(const CRTP& _this, const CRTP& other) const + { + if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + return {false,_this}; + CRTP retval = _this; + retval.allowUpdate |= other.allowUpdate; + retval.allowCompaction |= other.allowCompaction; + retval.allowDataAccess |= other.allowDataAccess; + retval.lowMemory |= other.lowMemory; + retval.hostBuild |= other.hostBuild; + retval.compactAfterBuild |= other.compactAfterBuild; + return {true,retval}; + } + }; + template<> + struct NBL_API2 patch_impl_t : acceleration_structure_patch_base + { + public: + PATCH_IMPL_BOILERPLATE(asset::ICPUBottomLevelAccelerationStructure); + + protected: + using build_flags_t = asset::ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + inline std::pair combine(const this_t& other) const + { + return combine_impl(*this,other); + } + }; + template<> + struct NBL_API2 patch_impl_t : acceleration_structure_patch_base + { + public: + PATCH_IMPL_BOILERPLATE(asset::ICPUTopLevelAccelerationStructure); + + protected: + using build_flags_t = asset::ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + inline std::pair combine(const this_t& other) const + { + return combine_impl(*this,other); + } + }; template<> struct NBL_API2 patch_impl_t { @@ -458,6 +532,8 @@ class CAssetConverter : public core::IReferenceCounted virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; + virtual const patch_t* operator()(const lookup_t&) const = 0; + virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; @@ -577,6 +653,8 @@ class CAssetConverter : public core::IReferenceCounted bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); + bool operator()(lookup_t); + bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); @@ -829,6 +907,8 @@ class CAssetConverter : public core::IReferenceCounted IUtilities* utilities = nullptr; // optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore std::span extraSignalSemaphores = {}; + // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build + CAsyncSingleBufferSubAllocatorST<>* scratchForASBuild = nullptr; // specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; @@ -853,6 +933,11 @@ class CAssetConverter : public core::IReferenceCounted // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 inline core::bitflag getRequiredQueueFlags() const {return m_queueFlags;} + // just enough memory to build the Acceleration Structures one by one waiting for each build to complete inbetween + inline uint64_t getMinASBuildScratchSize() const {return m_minASBuildScratchSize;} + // enough memory to build and compact the 
all Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) + inline uint64_t getMaxASBuildScratchSize() const {return m_maxASBuildScratchSize;} + // inline operator bool() const {return bool(m_converter);} @@ -924,12 +1009,15 @@ class CAssetConverter : public core::IReferenceCounted using conversion_requests_t = core::vector>; using convertible_asset_types = core::type_list< asset::ICPUBuffer, - asset::ICPUImage/*, + asset::ICPUImage, asset::ICPUBottomLevelAccelerationStructure, - asset::ICPUTopLevelAccelerationStructure*/ + asset::ICPUTopLevelAccelerationStructure >; core::tuple_transform_t m_conversionRequests; + // + uint64_t m_minASBuildScratchSize = 0; + uint64_t m_maxASBuildScratchSize = 0; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; }; From f065d7c3b9b19c7e5958b598bb08069fdbf796d0 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 20 Nov 2024 16:45:08 +0100 Subject: [PATCH 02/11] start going through the implementation --- src/nbl/video/utilities/CAssetConverter.cpp | 200 +++++++++++++++++++- 1 file changed, 196 insertions(+), 4 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index cbbfe09a7c..95e21848e5 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -40,6 +40,7 @@ bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* dev return true; } + CAssetConverter::patch_impl_t::patch_impl_t(const ICPUShader* shader) : stage(shader->getStage()) {} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) { @@ -99,6 +100,67 @@ bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* devi return true; } +bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDevice* device) +{ + // note that we don't check the validity of things we don't patch, all the instance and geometry data, but it will be checked by the driver anyway during creation/build + if (preference==BuildPreference::Invalid) // unititialized or just wrong + return false; + // just make the flags agree/canonicalize + allowCompaction = allowCompaction || compactAfterBuild; + // don't make invalid, just soft fail + const auto& limits = device->getPhysicalDevice()->getLimits(); + if (allowDataAccess) // TODO: && !limits.rayTracingPositionFetch) + allowDataAccess = false; + const auto& features = device->getEnabledFeatures(); + if (hostBuild && !features.accelerationStructureHostCommands) + hostBuild = false; + return true; +} +CAssetConverter::patch_impl_t::patch_impl_t(const ICPUBottomLevelAccelerationStructure* blas) +{ + const auto flags = blas->getBuildFlags(); + // straight up invalid + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT | build_flags_t::PREFER_FAST_BUILD_BIT)) + return; + + allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); + allowCompaction = flags.hasFlags(build_flags_t::ALLOW_COMPACTION_BIT); + allowDataAccess = flags.hasFlags(build_flags_t::ALLOW_DATA_ACCESS_KHR); + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT)) + preference = BuildPreference::FastTrace; + else if (flags.hasFlags(build_flags_t::PREFER_FAST_BUILD_BIT)) + preference = BuildPreference::FastBuild; + else + preference = BuildPreference::None; + lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); +} +bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) +{ + return acceleration_structure_patch_base::valid(device); +} 
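// A minimal usage sketch (an editorial illustration, not an added line of this patch), assuming a
// previously set-up `cpuBlas` asset pointer; the constructor above packs the asset's BUILD_FLAGS
// into the public patch bitfields, and `acceleration_structure_patch_base::valid()` later
// canonicalizes them (`compactAfterBuild` implies `allowCompaction`, host builds get demoted when
// `accelerationStructureHostCommands` is not enabled, and contradictory PREFER_FAST_* flags leave
// `preference` at `BuildPreference::Invalid` so the patch is rejected):
//
//	CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure> patch(cpuBlas);
//	patch.compactAfterBuild = true; // hypothetical request, `valid()` will also raise `allowCompaction`
//	const bool rejected = patch.preference==CAssetConverter::acceleration_structure_patch_base::BuildPreference::Invalid;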
+CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* blas) +{ + const auto flags = blas->getBuildFlags(); + // straight up invalid + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) + return; + + allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); + allowCompaction = flags.hasFlags(build_flags_t::ALLOW_COMPACTION_BIT); + allowDataAccess = false; + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT)) + preference = BuildPreference::FastTrace; + else if (flags.hasFlags(build_flags_t::PREFER_FAST_BUILD_BIT)) + preference = BuildPreference::FastBuild; + else + preference = BuildPreference::None; + lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); +} +bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) +{ + return acceleration_structure_patch_base::valid(device); +} + // smol utility function template void deduceMetaUsages(Patch& patch, const core::bitflag usages, const E_FORMAT originalFormat, const bool hasDepthAspect=true) @@ -330,6 +392,24 @@ class AssetVisitor : public CRTP patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_TEXEL_BUFFER_BIT; return descend(dep,std::move(patch)); } + inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) + { + const auto blasInstances = instance.asset->getInstances(); + if (blasInstances.empty()) // TODO: is it valid to have a BLASless TLAS? + return false; + for (size_t i=0; i patch = {blas}; + if (userPatch.allowDataAccess) // TODO: check if all BLAS within TLAS need to have the flag ON vs OFF or only some + patch.allowDataAccess = true; + if (!descend(blas,std::move(patch),i)) + return false; + } + return true; + } inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto& params = instance.asset->getCreationParameters(); @@ -556,8 +636,10 @@ class AssetVisitor : public CRTP } case IDescriptor::EC_ACCELERATION_STRUCTURE: { - _NBL_TODO(); - [[fallthrough]]; + auto tlas = static_cast(untypedDesc); + if (!descend(tlas,{tlas},type,binding,el)) + return false; + break; } default: assert(false); @@ -845,6 +927,8 @@ class PatchOverride final : public CAssetConverter::CHashCache::IPatchOverride inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} + inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} + inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} @@ -955,6 +1039,68 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t loo hasher.update(&patchedParams,sizeof(patchedParams)) << lookup.asset->getContentHash(); return true; } +bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) +{ + // extras from the patch + hasher << lookup.patch->hostBuild; + hasher << lookup.patch->compactAfterBuild; + // overriden flags + using build_flags_t = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + constexpr build_flags_t OverridableMask = 
build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; + auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); + if (lookup.patch->lowMemory) + patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; + if (lookup.patch->allowDataAccess) + patchedBuildFlags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; + if (lookup.patch->allowCompaction) + patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (lookup.patch->allowUpdate) + patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (lookup.patch->preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + hasher << patchedBuildFlags; +// TODO: hash the geometry metadata thats not already in the dependents (c.f. Renderpass, Descriptor Set) + return true; +} +bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) +{ + // extras from the patch + hasher << lookup.patch->hostBuild; + hasher << lookup.patch->compactAfterBuild; + // overriden flags + using build_flags_t = ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; + auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); + if (lookup.patch->lowMemory) + patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; + if (lookup.patch->allowCompaction) + patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (lookup.patch->allowUpdate) + patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (lookup.patch->preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + hasher << patchedBuildFlags; +// TODO: hash the instance metadata thats not already in the dependents (c.f. 
Renderpass, Descriptor Set) + return true; +} bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) { // failed promotion @@ -1366,8 +1512,8 @@ void CAssetConverter::CHashCache::eraseStale(const IPatchOverride* patchOverride rehash.operator()(); rehash.operator()(); rehash.operator()(); -// rehash.operator()(); -// rehash.operator()(); + rehash.operator()(); + rehash.operator()(); // only once all the descriptor types have been hashed, we can hash sets rehash.operator()(); // naturally any pipeline depends on shaders and pipeline cache @@ -1417,6 +1563,52 @@ class GetDependantVisitBase template class GetDependantVisit; +template<> +class GetDependantVisit : public GetDependantVisitBase +{ + public: + SBufferRange underlying = {}; + + protected: + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + underlying = { + .offset = user.asset->getOffsetInBuffer(), + .size = user.asset->getByteSize(), + .buffer = std::move(depObj) + }; + return underlying.isValid(); + } +}; +template<> +class GetDependantVisit : public GetDependantVisitBase +{ + public: + SBufferRange underlying = {}; + + protected: + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + underlying = { + .offset = user.asset->getOffsetInBuffer(), + .size = user.asset->getByteSize(), + .buffer = std::move(depObj) + }; + return underlying.isValid(); + } +}; template<> class GetDependantVisit : public GetDependantVisitBase { From 3689118cd968c02258208ffe6439b08451f31c3b Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 11:17:02 +0100 Subject: [PATCH 03/11] const correctness --- include/nbl/video/ILogicalDevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 276ed74501..bd47f6650c 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -402,7 +402,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( const core::bitflag flags, const bool motionBlur, - const std::span geometries, + const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { @@ -412,7 +412,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - if (!IGPUBottomLevelAccelerationStructure::validBuildFlags(flags, m_enabledFeatures)) + if (!IGPUBottomLevelAccelerationStructure::validBuildFlags(flags,m_enabledFeatures)) { NBL_LOG_ERROR("Invalid build flags"); return {}; From 215ee50a7541d66447cb76f9f04bbfe939514cab Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 11:18:15 +0100 Subject: [PATCH 04/11] shared ownership needs to be settled for the buffers backing acceelration structures --- include/nbl/video/utilities/CAssetConverter.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 452a52f3da..816d956d5f 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -203,8 +203,10 
@@ class CAssetConverter : public core::IReferenceCounted public: PATCH_IMPL_BOILERPLATE(asset::ICPUBottomLevelAccelerationStructure); - protected: using build_flags_t = asset::ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + core::bitflag getBuildFlags(const asset::ICPUBottomLevelAccelerationStructure* blas) const; + + protected: inline std::pair combine(const this_t& other) const { return combine_impl(*this,other); @@ -216,8 +218,10 @@ class CAssetConverter : public core::IReferenceCounted public: PATCH_IMPL_BOILERPLATE(asset::ICPUTopLevelAccelerationStructure); - protected: using build_flags_t = asset::ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + core::bitflag getBuildFlags(const asset::ICPUTopLevelAccelerationStructure* tlas) const; + + protected: inline std::pair combine(const this_t& other) const { return combine_impl(*this,other); @@ -795,6 +799,16 @@ class CAssetConverter : public core::IReferenceCounted return {}; } + // this a weird signature, but its for an acceleration structure backing IGPUBuffer + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBottomLevelAccelerationStructure* blas, const patch_t& patch) const + { + return {}; + } + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUTopLevelAccelerationStructure* tlas, const patch_t& patch) const + { + return {}; + } + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUImage* buffer, const patch_t& patch) const { return {}; From 19e3e57d56d8ed845b80ead7f3caaf97dbef5b32 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 11:18:30 +0100 Subject: [PATCH 05/11] fix typos --- include/nbl/asset/ICPUAccelerationStructure.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 87dcee99ab..c2779151d4 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -47,7 +47,7 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati return {m_geometryPrimitiveCount->begin(),m_geometryPrimitiveCount->end()}; return {}; } - inline std::span getGeometryPrimitiveCounts(const size_t geomIx) const + inline std::span getGeometryPrimitiveCounts() const { if (m_geometryPrimitiveCount) return {m_geometryPrimitiveCount->begin(),m_geometryPrimitiveCount->end()}; @@ -79,7 +79,7 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati { if (!isMutable()) return false; - m_buildFlags &= BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + m_buildFlags &= ~BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; m_geometryPrimitiveCount = std::move(ranges); m_triangleGeoms = std::move(geometries); m_AABBGeoms = nullptr; From 100ae7d121165752d3c37a07176f404878f3f94d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 16:28:13 +0100 Subject: [PATCH 06/11] Realize that compacted acceleration structures need an allocator for their storage. 
Change more stuff to span in `ICPUBottomLevelAccelerationStructure` Use a semantically better typedef/alias in `ILogicalDevice::createBottomLevelAccelerationStructure` --- include/nbl/asset/ICPUAccelerationStructure.h | 8 ++++---- include/nbl/video/ILogicalDevice.h | 2 +- include/nbl/video/utilities/CAssetConverter.h | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index c2779151d4..dcf2975b78 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -87,17 +87,17 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati } // - inline core::SRange> getAABBGeometries() + inline std::span> getAABBGeometries() { if (!isMutable() || !m_AABBGeoms) return {nullptr,nullptr}; - return {m_AABBGeoms->begin(),m_AABBGeoms->end()}; + return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } - inline core::SRange> getAABBGeometries() const + inline std::span> getAABBGeometries() const { if (!m_AABBGeoms) return {nullptr,nullptr}; - return {m_AABBGeoms->begin(),m_AABBGeoms->end()}; + return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline bool setGeometries(core::smart_refctd_dynamic_array>&& geometries, core::smart_refctd_dynamic_array&& ranges) { diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index bd47f6650c..39325bd6dd 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -358,7 +358,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a sampler object to use with ImageViews virtual core::smart_refctd_ptr createSampler(const IGPUSampler::SParams& _params) = 0; // acceleration structures - inline core::smart_refctd_ptr createBottomLevelAccelerationStructure(IGPUAccelerationStructure::SCreationParams&& params) + inline core::smart_refctd_ptr createBottomLevelAccelerationStructure(IGPUBottomLevelAccelerationStructure::SCreationParams&& params) { if (invalidCreationParams(params)) { diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 816d956d5f..57ddec91a9 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -885,6 +885,7 @@ class CAssetConverter : public core::IReferenceCounted { // By default the last to queue to touch a GPU object will own it after any transfer or compute operations are complete. // If you want to record a pipeline barrier that will release ownership to another family, override this. 
+ // The overload for the IGPUBuffer may be called with a hash belonging to a Acceleration Structure, this means that its the storage buffer backing the AS virtual inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) { return IQueue::FamilyIgnored; @@ -923,6 +924,8 @@ class CAssetConverter : public core::IReferenceCounted std::span extraSignalSemaphores = {}; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build CAsyncSingleBufferSubAllocatorST<>* scratchForASBuild = nullptr; + // + IDeviceMemoryAllocator* compactedASAllocator = nullptr; // specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; @@ -951,6 +954,8 @@ class CAssetConverter : public core::IReferenceCounted inline uint64_t getMinASBuildScratchSize() const {return m_minASBuildScratchSize;} // enough memory to build and compact the all Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) inline uint64_t getMaxASBuildScratchSize() const {return m_maxASBuildScratchSize;} + // if returns NONE means there are no acceleration structures to build + inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;} // inline operator bool() const {return bool(m_converter);} @@ -1016,8 +1021,17 @@ class CAssetConverter : public core::IReferenceCounted core::smart_refctd_ptr canonical; // gpu object to transfer canonical's data to or build it from asset_traits::video_t* gpuObj; - // only relevant for images - uint16_t recomputeMips = 0; + union + { + // only relevant for images + uint16_t recomputeMips = 0; + // + struct ASBuildParams + { + uint8_t host : 1; + uint8_t compact : 1; + } asBuildParams; + }; }; template using conversion_requests_t = core::vector>; @@ -1032,6 +1046,7 @@ class CAssetConverter : public core::IReferenceCounted // uint64_t m_minASBuildScratchSize = 0; uint64_t m_maxASBuildScratchSize = 0; + core::bitflag m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; }; From 0763416af703ed5550a636bfc753444b849a6c04 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 16:56:24 +0100 Subject: [PATCH 07/11] start attempts at AS creation --- src/nbl/video/utilities/CAssetConverter.cpp | 421 +++++++++++++++----- 1 file changed, 317 insertions(+), 104 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 95e21848e5..8c6ee875aa 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -105,13 +105,16 @@ bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDev // note that we don't check the validity of things we don't patch, all the instance and geometry data, but it will be checked by the driver anyway during creation/build if (preference==BuildPreference::Invalid) // unititialized or just wrong return false; + // potentially skip a lot of work and allocations + const auto& features = device->getEnabledFeatures(); + if (!features.accelerationStructure) + return false; // just make the flags agree/canonicalize allowCompaction = allowCompaction || compactAfterBuild; // don't make invalid, just soft fail const auto& limits = device->getPhysicalDevice()->getLimits(); 
if (allowDataAccess) // TODO: && !limits.rayTracingPositionFetch) allowDataAccess = false; - const auto& features = device->getEnabledFeatures(); if (hostBuild && !features.accelerationStructureHostCommands) hostBuild = false; return true; @@ -120,7 +123,7 @@ CAssetConverter::patch_impl_t::patch_impl_ { const auto flags = blas->getBuildFlags(); // straight up invalid - if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT | build_flags_t::PREFER_FAST_BUILD_BIT)) + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) return; allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); @@ -134,14 +137,41 @@ CAssetConverter::patch_impl_t::patch_impl_ preference = BuildPreference::None; lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); } +auto CAssetConverter::patch_impl_t::getBuildFlags(const ICPUBottomLevelAccelerationStructure* blas) const -> core::bitflag +{ + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; + auto flags = blas->getBuildFlags()&(~OverridableMask); + if (lowMemory) + flags |= build_flags_t::LOW_MEMORY_BIT; + if (allowDataAccess) + flags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; + if (allowCompaction) + flags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (allowUpdate) + flags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + flags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + flags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + return flags; +} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) { return acceleration_structure_patch_base::valid(device); } -CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* blas) +CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* tlas) { - const auto flags = blas->getBuildFlags(); + const auto flags = tlas->getBuildFlags(); // straight up invalid + if (tlas->getInstances().empty()) + return; if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) return; @@ -156,6 +186,29 @@ CAssetConverter::patch_impl_t::patch_impl_t(c preference = BuildPreference::None; lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); } +auto CAssetConverter::patch_impl_t::getBuildFlags(const ICPUTopLevelAccelerationStructure* tlas) const -> core::bitflag +{ + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; + auto flags = tlas->getBuildFlags()&(~OverridableMask); + if (lowMemory) + flags |= build_flags_t::LOW_MEMORY_BIT; + if (allowCompaction) + flags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (allowUpdate) + flags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + flags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + flags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + return flags; +} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* 
device) { return acceleration_structure_patch_base::valid(device); @@ -380,18 +433,7 @@ class AssetVisitor : public CRTP } private: - inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) - { - const auto* dep = instance.asset->getUnderlyingBuffer(); - if (!dep) - return false; - CAssetConverter::patch_t patch = {dep}; - if (userPatch.utbo) - patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_UNIFORM_TEXEL_BUFFER_BIT; - if (userPatch.stbo) - patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_TEXEL_BUFFER_BIT; - return descend(dep,std::move(patch)); - } + // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto blasInstances = instance.asset->getInstances(); @@ -410,6 +452,18 @@ class AssetVisitor : public CRTP } return true; } + inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) + { + const auto* dep = instance.asset->getUnderlyingBuffer(); + if (!dep) + return false; + CAssetConverter::patch_t patch = {dep}; + if (userPatch.utbo) + patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_UNIFORM_TEXEL_BUFFER_BIT; + if (userPatch.stbo) + patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_TEXEL_BUFFER_BIT; + return descend(dep,std::move(patch)); + } inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto& params = instance.asset->getCreationParameters(); @@ -1045,60 +1099,42 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; // overriden flags - using build_flags_t = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; - constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; - auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); - if (lookup.patch->lowMemory) - patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; - if (lookup.patch->allowDataAccess) - patchedBuildFlags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; - if (lookup.patch->allowCompaction) - patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; - if (lookup.patch->allowUpdate) - patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; - switch (lookup.patch->preference) - { - case acceleration_structure_patch_base::BuildPreference::FastTrace: - patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; - break; - case acceleration_structure_patch_base::BuildPreference::FastBuild: - patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; - break; - default: - break; - } - hasher << patchedBuildFlags; -// TODO: hash the geometry metadata thats not already in the dependents (c.f. 
Renderpass, Descriptor Set) + hasher << lookup.patch->getBuildFlags(lookup.asset); + // finally the contents +//TODO: hasher << lookup.asset->getContentHash(); return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) { + const auto* asset = lookup.asset; + // + AssetVisitor> visitor = { + *this, + {asset,static_cast(patchOverride)->uniqueCopyGroupID}, + *lookup.patch + }; + if (!visitor()) + return false; // extras from the patch hasher << lookup.patch->hostBuild; hasher << lookup.patch->compactAfterBuild; // overriden flags - using build_flags_t = ICPUTopLevelAccelerationStructure::BUILD_FLAGS; - constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; - auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); - if (lookup.patch->lowMemory) - patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; - if (lookup.patch->allowCompaction) - patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; - if (lookup.patch->allowUpdate) - patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; - switch (lookup.patch->preference) + hasher << lookup.patch->getBuildFlags(lookup.asset); + const auto instances = asset->getInstances(); + // important two passes to not give identical data due to variable length polymorphic array being hashed + for (const auto& instance : instances) + hasher << instance.getType(); + for (const auto& instance : instances) { - case acceleration_structure_patch_base::BuildPreference::FastTrace: - patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; - break; - case acceleration_structure_patch_base::BuildPreference::FastBuild: - patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; - break; - default: - break; + std::visit([&hasher](const auto& typedInstance)->void + { + using instance_t = std::decay_t; + // the BLAS pointers (the BLAS contents already get hashed via asset visitor and `getDependent`, its only the metadate we need to hash + hasher.update(&typedInstance,offsetof(instance_t,base)+offsetof(ICPUTopLevelAccelerationStructure::Instance,blas)); + }, + instance.instance + ); } - hasher << patchedBuildFlags; -// TODO: hash the instance metadata thats not already in the dependents (c.f. 
Renderpass, Descriptor Set) return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) @@ -1563,50 +1599,25 @@ class GetDependantVisitBase template class GetDependantVisit; -template<> -class GetDependantVisit : public GetDependantVisitBase -{ - public: - SBufferRange underlying = {}; - - protected: - bool descend_impl( - const instance_t& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch - ) - { - auto depObj = getDependant(dep,soloPatch); - if (!depObj) - return false; - underlying = { - .offset = user.asset->getOffsetInBuffer(), - .size = user.asset->getByteSize(), - .buffer = std::move(depObj) - }; - return underlying.isValid(); - } -}; template<> class GetDependantVisit : public GetDependantVisitBase { public: - SBufferRange underlying = {}; + // because of the deferred building of TLASes and lack of lifetime tracking between them, nothing to do on some passes + //core::smart_refctd_ptr* const outBLASes; protected: bool descend_impl( const instance_t& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, + const uint32_t instanceIndex // not the custom index, its literally just an ordinal in `getInstances()` ) { - auto depObj = getDependant(dep,soloPatch); + auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - underlying = { - .offset = user.asset->getOffsetInBuffer(), - .size = user.asset->getByteSize(), - .buffer = std::move(depObj) - }; - return underlying.isValid(); + // outBLASes[instanceIndex] = std::move(depObj); + return true; } }; template<> @@ -2040,6 +2051,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult case ICPUImageView::AssetType: visit.operator()(entry); break; + case ICPUTopLevelAccelerationStructure::AssetType: + visit.operator()(entry); + break; // these assets have no dependants, should have never been pushed on the stack default: assert(false); @@ -2155,12 +2169,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult using memory_backed_ptr_variant_t = std::variant*,asset_cached_t*>; core::map> allocationRequests; // for this we require that the data storage for the dfsCaches' nodes does not change - auto requestAllocation = [&inputs,device,&allocationRequests](asset_cached_t* pGpuObj)->bool + auto requestAllocation = [&inputs,device,&allocationRequests](asset_cached_t* pGpuObj, const uint32_t memoryTypeConstraint=~0u)->bool { auto* gpuObj = pGpuObj->get(); const IDeviceMemoryBacked::SDeviceMemoryRequirements& memReqs = gpuObj->getMemoryReqs(); // this shouldn't be possible - assert(memReqs.memoryTypeBits); + assert(memReqs.memoryTypeBits&memoryTypeConstraint); // allocate right away those that need their own allocation if (memReqs.requiresDedicatedAllocation) { @@ -2176,17 +2190,34 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // make the creation conditional upon allocation success MemoryRequirementBin reqBin = { - .compatibileMemoryTypeBits = memReqs.memoryTypeBits, + .compatibileMemoryTypeBits = memReqs.memoryTypeBits&memoryTypeConstraint, // we ignore this for now, because we can't know how many `DeviceMemory` objects we have left to make, so just join everything by default //.refersDedicatedAllocation = memReqs.prefersDedicatedAllocation }; if constexpr (std::is_same_v,IGPUBuffer>) - reqBin.needsDeviceAddress = 
gpuObj->getCreationParams().usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT); + { + const auto usage = gpuObj->getCreationParams().usage; + reqBin.needsDeviceAddress = usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT); + // stops us needing weird awful code to ensure buffer storing AS has alignment of at least 256 + assert(!usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT) || memReqs.alignmentLog2>=8); + } allocationRequests[reqBin].emplace_back(pGpuObj); } return true; }; + // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation + struct DeferredASCreationParams + { + asset_cached_t storage; + size_t scratchSize : 62 = 0; + size_t motionBlur : 1 = false; + size_t compactAfterBuild : 1 = false; + size_t inputSize = 0; + uint32_t maxInstanceCount = 0; + }; + core::vector accelerationStructureParams[2]; + // Deduplication, Creation and Propagation auto dedupCreateProp = [&]()->void { @@ -2206,7 +2237,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult contentHash = retval.getHashCache()->hash( {instance.asset,&created.patch}, &patchOverride, - /*.mistrustLevel = */1 + /*.mistrustLevel =*/ 1 ); // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) if (contentHash==CHashCache::NoContentHash) @@ -2356,6 +2387,133 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } + if constexpr (std::is_same_v || std::is_same_v) + { + using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; + const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); + const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); + + constexpr bool IsTLAS = std::is_same_v; + accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); + for (auto& entry : conversionRequests) + for (auto i=0ull; iusesMotion(); + // we will need to temporarily store the build input buffers somewhere + size_t inputSize = 0; + ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; + { + const auto buildFlags = patch.getBuildFlags(as); + if constexpr (IsTLAS) + { + AssetVisitor> visitor = { + {visitBase}, + {asset,uniqueCopyGroupID}, + patch + }; + if (!visitor()) + continue; + const auto instanceCount = as->getInstances().size(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); + inputSize = (motionBlur ? 
sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + } + else + { + const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); + // the code here is not pretty, but DRY-ing is of this is for later + if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + const auto geoms = as->getAABBGeometries(); + if (patch.hostBuild) + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + } + else + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; + } + } + else + { + core::map allocationsPerStride; + const auto geoms = as->getTriangleGeometries(); + if (patch.hostBuild) + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + } + else + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + { + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + allocationsPerStride[sizeof(uint16_t)] += triCount*3; + break; + case E_INDEX_TYPE::EIT_32BIT: + allocationsPerStride[sizeof(uint32_t)] += triCount*3; + break; + default: + break; + } + size_t bytesPerVertex = geom.vertexStride; + if (geom.vertexData[1]) + bytesPerVertex += bytesPerVertex; + allocationsPerStride[geom.vertexStride] += geom.maxVertex; + } + } + for (const auto& entry : allocationsPerStride) + inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; + } + } + } + if (!sizes) + continue; + // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure + IGPUBuffer::SCreationParams params = {}; + constexpr size_t MinASBufferAlignment = 256u; + params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); + params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + // concurrent ownership if any + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); + params.queueFamilyIndexCount = queueFamilies.size(); + params.queueFamilyIndices = queueFamilies.data(); + // we need to save the buffer in a side-channel for later + auto& out = accelerationStructureParams[IsTLAS][baseOffset+entry.second.firstCopyIx+i]; + out = { + .storage = device->createBuffer(std::move(params)), + .scratchSize = sizes.buildScratchSize, + .motionBlur = motionBlur, + .compactAfterBuild = patch.compactAfterBuild, + .inputSize = inputSize + 
}; + if (out.storage) + requestAllocation(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes); + } + } if constexpr (std::is_same_v) { for (auto& entry : conversionRequests) @@ -2792,9 +2950,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - auto& stagingCache = std::get>(retval.m_stagingCaches); - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures + if constexpr (!std::is_same_v && !std::is_same_v) + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void { + auto& stagingCache = std::get>(retval.m_stagingCaches); // already found in read cache and not converted if (created.gpuObj) return; @@ -2865,7 +3025,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const uint16_t recomputeMips = created.patch.recomputeMips; getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); } - // TODO: BLAS and TLAS requests +// TODO: BLAS and TLAS requests } ); }; @@ -2873,8 +3033,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. // If two Asset chains are independent then we order them from most catastrophic failure to least. dedupCreateProp.operator()(); + dedupCreateProp.operator()(); + dedupCreateProp.operator()(); dedupCreateProp.operator()(); -// TODO: add backing buffers (not assets) for BLAS and TLAS builds // Allocate Memory { auto getAsBase = [](const memory_backed_ptr_variant_t& var) -> const IDeviceMemoryBacked* @@ -3069,7 +3230,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto& req : reqBin.second) { const auto asBacked = getAsBase(req); - if (asBacked->getBoundMemory().isValid()) + if (asBacked->getBoundMemory().isValid()) // TODO: maybe change to an assert? Our `failures` should really kinda make sure we never continue; switch (req.index()) { @@ -3087,8 +3248,60 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } allocationRequests.clear(); } -// dedupCreateProp.operator()(); -// dedupCreateProp.operator()(); + // Deal with Deferred Creation of Acceleration structures + { + for (auto asLevel=0; asLevel<2; asLevel++) + { + // each of these stages must have a barrier inbetween + size_t scratchSizeFullParallelBuild = 0; + size_t scratchSizeFullParallelCompact = 0; + // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created + for (const auto& deferredParams : accelerationStructureParams[asLevel]) + { + // buffer failed to create/allocate + if (!deferredParams.storage.get()) + continue; + IGPUAccelerationStructure::SCreationParams baseParams; + { + auto* buf = deferredParams.storage.get(); + const auto bufSz = buf->getSize(); + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)}, + .flags = deferredParams.motionBlur ? 
create_f::MOTION_BIT:create_f::NONE + }; + } + smart_refctd_ptr as; + if (asLevel) + { + as = device->createBottomLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + } + else + { + as = device->createTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + } + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize; + // sizes for building 1-by-1 vs parallel, note that + retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize); + scratchSizeFullParallelBuild += buildSize; + if (deferredParams.compactAfterBuild) + scratchSizeFullParallelCompact += deferredParams.scratchSize; + // triangles, AABBs or Instance Transforms will need to be supplied from VRAM + // TODO: also mark somehow that we'll need a BUILD INPUT READ ONLY BUFFER WITH XFER usage + if (deferredParams.inputSize) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + } + // + retval.m_maxASBuildScratchSize = core::max(core::max(scratchSizeFullParallelBuild,scratchSizeFullParallelCompact),retval.m_maxASBuildScratchSize); + } + // + if (retval.m_minASBuildScratchSize) + { + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + retval.m_maxASBuildScratchSize = core::max(core::max(scratchSizeFullParallelBLASBuild,scratchSizeFullParallelBLASCompact),core::max(scratchSizeFullParallelTLASBuild,scratchSizeFullParallelTLASCompact)); + } + } dedupCreateProp.operator()(); dedupCreateProp.operator()(); dedupCreateProp.operator()(); From 11a141caef7c28c072004ed18df53d8f031def13 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 21:22:59 +0100 Subject: [PATCH 08/11] realize that scratches need to be provided separately for Device and Host builds --- include/nbl/video/utilities/CAssetConverter.h | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 57ddec91a9..069b049f2a 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -923,8 +923,9 @@ class CAssetConverter : public core::IReferenceCounted // optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore std::span extraSignalSemaphores = {}; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build - CAsyncSingleBufferSubAllocatorST<>* scratchForASBuild = nullptr; - // + CAsyncSingleBufferSubAllocatorST* scratchForDeviceASBuild = nullptr; + std::pmr::memory_resource* scratchForHostASBuild = nullptr; + // needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds IDeviceMemoryAllocator* compactedASAllocator = nullptr; // specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set uint32_t sampledImageBindingCount = 1<<10; @@ -950,12 +951,16 @@ class CAssetConverter : public core::IReferenceCounted // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 inline core::bitflag getRequiredQueueFlags() const {return m_queueFlags;} - // just enough memory to build the Acceleration Structures one by one waiting for each 
build to complete inbetween - inline uint64_t getMinASBuildScratchSize() const {return m_minASBuildScratchSize;} - // enough memory to build and compact the all Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) - inline uint64_t getMaxASBuildScratchSize() const {return m_maxASBuildScratchSize;} - // if returns NONE means there are no acceleration structures to build + // This is just enough memory to build the Acceleration Structures one by one waiting for each Device Build to complete inbetween. If 0 there are no Device AS Builds or Compactions to perform. + inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const {return m_minASBuildScratchSize[forHostOps];} + // Enough memory to build and compact all the Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) + inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const {return m_maxASBuildScratchSize[forHostOps];} + // What usage flags your scratch buffer must have, if returns NONE means are no Device AS Builds to perform. inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;} + // tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild` + inline bool willHostASBuild() const {return m_willHostBuildSomeAS;} + // tells you if you need to provide a valid `SConvertParams::compactedASAllocator` + inline bool willCompactAS() const {return m_willHostBuildSomeAS;} // inline operator bool() const {return bool(m_converter);} @@ -1028,6 +1033,7 @@ class CAssetConverter : public core::IReferenceCounted // struct ASBuildParams { + // TODO: buildFlags uint8_t host : 1; uint8_t compact : 1; } asBuildParams; @@ -1044,9 +1050,12 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_conversionRequests; // - uint64_t m_minASBuildScratchSize = 0; - uint64_t m_maxASBuildScratchSize = 0; + uint64_t m_minASBuildScratchSize[2] = {0,0}; + uint64_t m_maxASBuildScratchSize[2] = {0,0}; core::bitflag m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE; + uint8_t m_willHostBuildSomeAS : 1 = false; + uint8_t m_willCompactSomeAS : 1 = false; + // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; }; From 667522460ca80ae47b9f26cf2ed8a471dc583462 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 21:24:42 +0100 Subject: [PATCH 09/11] starting writing the building code, realize we need to bucket the Device and Host build requests separately --- src/nbl/video/utilities/CAssetConverter.cpp | 128 ++++++++++++++++++-- 1 file changed, 119 insertions(+), 9 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 8c6ee875aa..b313ed7d24 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3377,7 +3377,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { if (reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT) && (!params.utilities || params.utilities->getLogicalDevice()!=device)) { - logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!", system::ILogger::ELL_ERROR); + logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!",system::ILogger::ELL_ERROR); return retval; } @@ -3406,6 +3406,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // If the transfer queue will be used, the transfer 
Intended Submit Info must be valid and utilities must be provided auto reqTransferQueueCaps = IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Depth/Stencil transfers need Graphics Capabilities, so make sure the queue chosen for transfers also has them! if (reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::GRAPHICS_BIT)) reqTransferQueueCaps |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; if (invalidIntended(reqTransferQueueCaps,params.transfer)) @@ -3428,7 +3429,52 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + // check things necessary for building Acceleration Structures + using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; + if (reservations.m_ASBuildScratchUsages!=buffer_usage_f::EUF_NONE) + { + if (!params.scratchForDeviceASBuild) + { + logger.log("An Acceleration Structure will be built on Device but no scratch allocator provided!",system::ILogger::ELL_ERROR); + return retval; + } + // TODO: do the build input buffers also need `EUF_STORAGE_BUFFER_BIT` ? + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; + // we may use the staging buffer directly to skip an extra copy on small enough geometries + if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); + return retval; + } + constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; + // we use the scratch allocator both for scratch and uploaded geometry data + if (!params.scratchForDeviceASBuild->getBuffer()->getCreationParams().usage.hasFlags(asBuildScratchFlags|asBuildInputFlags)) + { + logger.log("An Acceleration Structure will be built on Device but scratch buffer doesn't have required usage flags!",system::ILogger::ELL_ERROR); + return retval; + } + const auto& addrAlloc = params.scratchForDeviceASBuild->getAddressAllocator(); + // could have used an address allocator trait to work this out, same verbosity + if (addrAlloc.get_allocated_size()+addrAlloc.get_free_size()(const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* { auto& stagingCache = std::get>(reservations.m_stagingCaches); @@ -3547,9 +3593,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto computeFamily = shouldDoSomeCompute ? params.compute->queue->getFamilyIndex():IQueue::FamilyIgnored; // whenever transfer needs to do a submit overflow because it ran out of memory for streaming an image, we can already submit the recorded mip-map compute shader dispatches auto computeCmdBuf = shouldDoSomeCompute ? 
params.compute->getCommandBufferForRecording():nullptr; - auto drainCompute = [¶ms,shouldDoSomeCompute,&computeCmdBuf](const std::span extraSignal={})->auto + auto drainCompute = [¶ms,&computeCmdBuf](const std::span extraSignal={})->auto { - if (!shouldDoSomeCompute || computeCmdBuf->cmdbuf->empty()) + if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty()) return IQueue::RESULT::SUCCESS; // before we overflow submit we need to inject extra wait semaphores auto& waitSemaphoreSpan = params.compute->waitSemaphores; @@ -3568,6 +3614,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal); if (res!=IQueue::RESULT::SUCCESS) return res; + // set to empty so we don't grow over and over again + waitSemaphoreSpan = {}; if (!params.compute->beginNextCommandBuffer(computeCmdBuf)) return IQueue::RESULT::OTHER_ERROR; return res; @@ -4039,7 +4087,65 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul imagesToUpload.clear(); } - // TODO: build BLASes and TLASes + // BLAS builds + auto& blasToBuild = std::get>(reservations.m_conversionRequests); + if (const auto blasCount = blasToBuild.size(); blasCount) + { + constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + + core::vector buildInfos; buildInfos.reserve(blasCount); + core::vector rangeInfo; rangeInfo.reserve(blasCount); + core::vector> triangles; + core::vector> aabbs; + { + size_t totalTriGeoCount = 0; + size_t totalAABBGeoCount = 0; + for (auto& item : blasToBuild) + { + const size_t geoCount = item.canonical->getGeometryCount(); + if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) + totalAABBGeoCount += geoCount; + else + totalTriGeoCount += geoCount; + } + triangles.reserve(totalTriGeoCount); + triangles.reserve(totalAABBGeoCount); + } + for (auto& item : blasToBuild) + { + auto* as = item.gpuObj; + auto pFoundHash = findInStaging.operator()(as); + if (item.asBuildParams.host) + { + auto dOp = device->createDeferredOperation(); + // + if (!device->buildAccelerationStructure(dOp.get(),info,range)) + { + markFailureInStaging(gpuObj,pFoundHash); + continue; + } + } + else + { + auto& buildInfo = buildInfo.emplace_back({ + .buildFlags = item.buildFlags, + .geometryCount = item.canonical->getGeometryCount(), + // this is not an update + .srcAS = nullptr, + .dstAS = as.get() + }); + if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) + buildInfo.aabbs = nullptr; + else + buildInfo.triangles = nullptr; + computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); + } + } + } + + // TLAS builds + auto& tlasToBuild = std::get>(reservations.m_conversionRequests); + if (!tlasToBuild.empty()) { } @@ -4100,6 +4206,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set bool depsMissing = false; // only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) + if constexpr (std::is_same_v) + { + // there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job + } if constexpr (std::is_same_v) depsMissing = missingDependent.operator()(item.first->getUnderlyingBuffer()); if constexpr (std::is_same_v) @@ -4141,8 +4251,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - _NBL_TODO(); - [[fallthrough]]; + depsMissing = missingDependent.operator()(static_cast(untypedDesc)); + break; default: assert(false); depsMissing = true; @@ -4170,8 +4280,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // again, need to go bottom up so we can check dependencies being successes mergeCache.operator()(); mergeCache.operator()(); -// mergeCache.operator()(); -// mergeCache.operator()(); + mergeCache.operator()(); + mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); From 8f43fef5a2daf1640ed0b8328d2e2f76a6741d40 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 22 Nov 2024 13:36:20 +0100 Subject: [PATCH 10/11] Figure out the TLAS/BLAS compaction logic and swap in cache. Also update comments about what ends up in `m_gpuObjects` --- include/nbl/asset/IDescriptorSetLayout.h | 10 +++ include/nbl/video/utilities/CAssetConverter.h | 49 ++++++----- src/nbl/video/utilities/CAssetConverter.cpp | 84 +++++++++++++++---- 3 files changed, 105 insertions(+), 38 deletions(-) diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 34e7d2ccc4..5437a09330 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -147,6 +147,16 @@ class IDescriptorSetLayoutBase : public virtual core::IReferenceCounted // TODO: return getStorageOffset(index); } + // Weird functions for exceptional situations + inline storage_range_index_t findBindingStorageIndex(const storage_offset_t offset) const + { + const auto found = std::upper_bound(m_storageOffsets, m_storageOffsets+m_count, offset, [](storage_range_index_t a, storage_range_index_t b) -> bool {return a.data < b.data; }); + const auto ix = m_storageOffsets - found; + if (ix>=m_count) + return {}; + return storage_range_index_t(ix); + } + inline uint32_t getTotalCount() const { return (m_count == 0ull) ? 0u : m_storageOffsets[m_count - 1].data; } private: diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 069b049f2a..8789c63079 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -965,7 +965,8 @@ class CAssetConverter : public core::IReferenceCounted // inline operator bool() const {return bool(m_converter);} - // until `convert` is called, this will only contain valid entries for items already found in `SInput::readCache` + // Until `convert` is called, the Buffers and Images are not filled with content and Acceleration Structures are not built, unless found in the `SInput::readCache` + // WARNING: The Acceleration Structure Pointer WILL CHANGE after calling `convert` if its patch dictates that it will be compacted! 
(since AS can't resize) // TODO: we could also return per-object semaphore values when object is ready for use (would have to propagate two semaphores up through dependants) template std::span> getGPUObjects() const {return std::get>(m_gpuObjects);} @@ -1020,34 +1021,36 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_stagingCaches; // need a more explicit list of GPU objects that need device-assisted conversion template - struct ConversionRequest + struct SConversionRequestBase { // canonical asset (the one that provides content) core::smart_refctd_ptr canonical; // gpu object to transfer canonical's data to or build it from asset_traits::video_t* gpuObj; - union - { - // only relevant for images - uint16_t recomputeMips = 0; - // - struct ASBuildParams - { - // TODO: buildFlags - uint8_t host : 1; - uint8_t compact : 1; - } asBuildParams; - }; }; - template - using conversion_requests_t = core::vector>; - using convertible_asset_types = core::type_list< - asset::ICPUBuffer, - asset::ICPUImage, - asset::ICPUBottomLevelAccelerationStructure, - asset::ICPUTopLevelAccelerationStructure - >; - core::tuple_transform_t m_conversionRequests; + using SConvReqBuffer = SConversionRequestBase; + core::vector m_bufferConversions; + struct SConvReqImage : SConversionRequestBase + { + bool recomputeMips = 0; + }; + core::vector m_imageConversions; + template// requires std::is_base_of_v + struct SConvReqAccelerationStructure : SConversionRequestBase + { + constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; + inline bool compact() const {return compactedASWriteOffset!=WontCompact;} + + using build_f = typename CPUAccelerationStructure::BUILD_FLAGS; + inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} + inline build_f getBuildFlags() const {return static_cast(buildFlags);} + + + uint64_t compactedASWriteOffset : 48 = WontCompact; + uint64_t buildFlags : 16 = static_cast(build_f::NONE); + }; + core::vector> m_blasConversions[2]; + core::vector> m_tlasConversions[2]; // uint64_t m_minASBuildScratchSize[2] = {0,0}; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b313ed7d24..dcf27099a5 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3016,14 +3016,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult return; } } - // this is super annoying, was hoping metaprogramming with `has_type` would actually work - auto getConversionRequests = [&]()->auto&{return std::get>(retval.m_conversionRequests);}; + // if constexpr (std::is_same_v) - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; + retval.m_bufferConversions.emplace_back({core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); if constexpr (std::is_same_v) { const uint16_t recomputeMips = created.patch.recomputeMips; - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); + retval.m_imageConversions.emplace_back({core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); } // TODO: BLAS and TLAS requests } @@ -3337,7 +3336,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } const auto& found = dfsCache.nodes[metadata[i].patchIndex.value]; // write it out to the results - if (const auto& gpuObj=found.gpuObj; gpuObj) // found from the `input.readCache` + if 
(const auto& gpuObj=found.gpuObj; gpuObj) { results[i] = gpuObj; // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` @@ -3372,6 +3371,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto device = m_params.device; const auto reqQueueFlags = reservations.getRequiredQueueFlags(); + // compacted TLASes need to be substituted in cache and Descriptor Sets + core::unordered_map> compactedTLASMap; // Anything to do? if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { @@ -3536,7 +3537,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // upload Buffers - auto& buffersToUpload = std::get>(reservations.m_conversionRequests); + auto& buffersToUpload = reservations.m_bufferConversions; { core::vector> ownershipTransfers; ownershipTransfers.reserve(buffersToUpload.size()); @@ -3630,7 +3631,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul origXferStallCallback(tillScratchResettable); }; - auto& imagesToUpload = std::get>(reservations.m_conversionRequests); + auto& imagesToUpload = reservations.m_imageConversions; if (!imagesToUpload.empty()) { // @@ -4088,7 +4089,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } // BLAS builds - auto& blasToBuild = std::get>(reservations.m_conversionRequests); + core::unordered_map> compactedBLASMap; + auto& blasToBuild = reservations.m_blasConversions[0]; if (const auto blasCount = blasToBuild.size(); blasCount) { constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; @@ -4111,6 +4113,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul triangles.reserve(totalTriGeoCount); triangles.reserve(totalAABBGeoCount); } +#if 0 for (auto& item : blasToBuild) { auto* as = item.gpuObj; @@ -4141,13 +4144,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); } } +#endif } // TLAS builds - auto& tlasToBuild = std::get>(reservations.m_conversionRequests); + auto& tlasToBuild = reservations.m_tlasConversions[0]; if (!tlasToBuild.empty()) { } + compactedBLASMap.clear(); const bool computeSubmitIsNeeded = submitsNeeded.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); // first submit transfer @@ -4183,6 +4188,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted + core::vector tlasRewrites; tlasRewrites.reserve(compactedTLASMap.size()); + core::vector tlasInfos; tlasInfos.reserve(compactedTLASMap.size()); // want to check if deps successfully exist auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool { @@ -4200,13 +4208,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& cache = std::get>(m_caches); cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + constexpr bool IsTLAS = std::is_same_v; for (auto& item : stagingCache) if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped { // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set bool depsMissing = false; // only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) - if constexpr (std::is_same_v) + if constexpr (IsTLAS) { // there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job } @@ -4225,6 +4234,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (samplers[i]) depsMissing = missingDependent.operator()(samplers[i].get()); } + const auto tlasRewriteOldSize = tlasRewrites.size(); for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) { const auto type = static_cast(i); @@ -4251,8 +4261,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - depsMissing = missingDependent.operator()(static_cast(untypedDesc)); + { + const auto* tlas = static_cast(untypedDesc); + depsMissing = missingDependent.operator()(tlas); + if (!depsMissing) + { + auto found = compactedTLASMap.find(tlas); + if (found==compactedTLASMap.end()) + break; + // written TLAS got compacted, so queue the descriptor for update + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; + tlasRewrites.push_back({ + .set = item.first, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = i-firstElementOffset, + .count = 1, // write them one by one, no point optimizing + .info = nullptr // for now + }); + tlasInfos.emplace_back().desc = smart_refctd_ptr(found->second); + } break; + } default: assert(false); depsMissing = true; @@ -4260,21 +4292,34 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } } + // don't bother overwriting a Descriptor Set that won't be marked as successfully converted (inserted into write cache) + if (depsMissing) + { + tlasRewrites.resize(tlasRewriteOldSize); + tlasInfos.resize(tlasRewriteOldSize); + } } + auto* pGpuObj = item.first; if (depsMissing) { const auto* hashAsU64 = reinterpret_cast(item.second.value.data); - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, item.first->getObjectDebugName()); + logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, pGpuObj->getObjectDebugName()); // wipe self, to let users know item.second.value = {}; continue; } - if (!params.writeCache(item.second)) + if (!params.writeCache(item.second)) // TODO: let the user know the pointer too? 
continue; + if constexpr (IsTLAS) + { + auto found = compactedTLASMap.find(pGpuObj); + if (found!=compactedTLASMap.end()) + pGpuObj = found->second.get(); + } asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(item.first); + cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); + cache.m_reverseMap.emplace(pGpuObj,item.second); cache.m_forwardMap.emplace(item.second,std::move(cached)); - cache.m_reverseMap.emplace(item.first,item.second); } }; // again, need to go bottom up so we can check dependencies being successes @@ -4293,6 +4338,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); + // deal with rewriting the TLASes with compacted ones + { + compactedTLASMap.clear(); + auto* infoIt = tlasInfos.data(); + for (auto& write : tlasRewrites) + write.info = infoIt++; + if (!tlasRewrites.empty()) + device->updateDescriptorSets(tlasRewrites,{}); + } // mergeCache.operator()(); // no submit was necessary, so should signal the extra semaphores from the host From a4307f4c6034bb31289ed92f10b2aa5bc7bef860 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 25 Nov 2024 17:13:48 +0100 Subject: [PATCH 11/11] fix a nasty possible threading bug with IDeferredOperation and a bunch of typos --- include/nbl/asset/ICPUAccelerationStructure.h | 4 ++-- include/nbl/asset/IDescriptorSetLayout.h | 4 ++-- include/nbl/video/IDeferredOperation.h | 4 ++++ include/nbl/video/ILogicalDevice.h | 2 ++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index dcf2975b78..1c83afc783 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -90,13 +90,13 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati inline std::span> getAABBGeometries() { if (!isMutable() || !m_AABBGeoms) - return {nullptr,nullptr}; + return {}; return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline std::span> getAABBGeometries() const { if (!m_AABBGeoms) - return {nullptr,nullptr}; + return {}; return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline bool setGeometries(core::smart_refctd_dynamic_array>&& geometries, core::smart_refctd_dynamic_array&& ranges) diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 5437a09330..803253e8b3 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -150,8 +150,8 @@ class IDescriptorSetLayoutBase : public virtual core::IReferenceCounted // TODO: // Weird functions for exceptional situations inline storage_range_index_t findBindingStorageIndex(const storage_offset_t offset) const { - const auto found = std::upper_bound(m_storageOffsets, m_storageOffsets+m_count, offset, [](storage_range_index_t a, storage_range_index_t b) -> bool {return a.data < b.data; }); - const auto ix = m_storageOffsets - found; + const auto found = std::upper_bound(m_storageOffsets,m_storageOffsets+m_count,offset,[](storage_offset_t a, storage_offset_t b)->bool{return a.data=m_count) return {}; return storage_range_index_t(ix); diff --git a/include/nbl/video/IDeferredOperation.h b/include/nbl/video/IDeferredOperation.h index 0f85048f75..44f4d4221d 100644 --- a/include/nbl/video/IDeferredOperation.h +++ b/include/nbl/video/IDeferredOperation.h @@ -30,7 +30,10 @@ class IDeferredOperation : public IBackendObject { const auto retval = 
execute_impl();
 		if (retval==STATUS::COMPLETED || retval==STATUS::_ERROR)
+		{
+			std::lock_guard lock(vectorLock);
 			m_resourceTracking.clear();
+		}
 		return retval;
 	}
@@ -66,6 +69,7 @@ class IDeferredOperation : public IBackendObject
 	private:
 		friend class ILogicalDevice;
 		// when we improve allocators, etc. we'll stop using STL containers here
+		std::mutex vectorLock;
 		core::vector> m_resourceTracking;
 };

diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h
index 39325bd6dd..ee7d31bdba 100644
--- a/include/nbl/video/ILogicalDevice.h
+++ b/include/nbl/video/ILogicalDevice.h
@@ -46,6 +46,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
 
 		//! Basic getters
+		inline system::ILogger* getLogger() const {return m_logger.get();}
+
 		inline const IPhysicalDevice* getPhysicalDevice() const { return m_physicalDevice; }
 
 		inline const SPhysicalDeviceFeatures& getEnabledFeatures() const { return m_enabledFeatures; }
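
For context on how the split Device/Host scratch queries introduced in PATCH 08 are meant to be consumed, here is a minimal caller-side sketch. It assumes `reservations` is the SReserveResult returned by CAssetConverter::reserve; the budget constant and the buffer-creation step are illustrative application code, not part of the converter.

    // sizing the Device build scratch; pass `true` instead for Host build scratch
    const bool forHostOps = false;
    const uint64_t minScratch = reservations.getMinASBuildScratchSize(forHostOps);
    if (minScratch)
    {
        const uint64_t budget = 64ull<<20; // example VRAM budget for scratch
        const uint64_t maxScratch = reservations.getMaxASBuildScratchSize(forHostOps);
        // never below the minimum for a one-by-one build, no point above the build-everything-at-once worst case
        const uint64_t scratchSize = std::min(std::max(budget,minScratch),maxScratch);
        // the buffer backing SConvertParams::scratchForDeviceASBuild must be created with these usages
        const auto usages = reservations.getASBuildScratchUsages();
        // ... create the buffer of `scratchSize` bytes and the suballocator wrapping it ...
    }
    // these tell the caller which optional SConvertParams members must be valid
    const bool needHostScratch = reservations.willHostASBuild();   // -> scratchForHostASBuild
    const bool needCompactedAlloc = reservations.willCompactAS();  // -> compactedASAllocator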
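
The usage-flag checks added to convert_impl in PATCH 09 imply that the device scratch doubles as upload storage for geometry, so it needs the union of the scratch and build-input usages. A sketch of creating such a buffer, assuming the usual ILogicalDevice::createBuffer(SCreationParams&&) entry point; the exact field names should be double-checked against the headers.

    using usage_f = IGPUBuffer::E_USAGE_FLAGS;
    // exactly what the new checks in convert_impl test for
    const core::bitflag<usage_f> asBuildInputFlags =
        usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|usage_f::EUF_TRANSFER_DST_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT;
    const core::bitflag<usage_f> asBuildScratchFlags = usage_f::EUF_STORAGE_BUFFER_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT;

    IGPUBuffer::SCreationParams creationParams = {};
    creationParams.size = scratchSize;
    // the scratch is also used to stage build inputs, hence the union of both flag sets
    creationParams.usage = asBuildScratchFlags|asBuildInputFlags;
    auto scratchBuffer = device->createBuffer(std::move(creationParams)); // signature assumed, verify against ILogicalDevice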
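
The BLAS loop sketched in PATCH 09 records what amounts to one vkCmdBuildAccelerationStructuresKHR call per batch. For reference, this is the raw Vulkan shape of a single triangle-geometry BLAS build; `blas`, `cmdbuf`, the device addresses and counts are placeholders, and this is the Vulkan-level equivalent rather than the Nbl wrapper API.

    VkAccelerationStructureGeometryKHR geometry = {
        .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,
        .geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR,
        .geometry = {.triangles = {
            .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR,
            .vertexFormat = VK_FORMAT_R32G32B32_SFLOAT,
            .vertexData = {.deviceAddress = vertexBufferAddress}, // placeholder
            .vertexStride = sizeof(float)*3,
            .maxVertex = vertexCount-1,
            .indexType = VK_INDEX_TYPE_UINT32,
            .indexData = {.deviceAddress = indexBufferAddress}, // placeholder
        }},
    };
    VkAccelerationStructureBuildGeometryInfoKHR buildInfo = {
        .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR,
        .type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR,
        .flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR,
        .mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR,
        .srcAccelerationStructure = VK_NULL_HANDLE, // not an update
        .dstAccelerationStructure = blas, // placeholder
        .geometryCount = 1,
        .pGeometries = &geometry,
        .scratchData = {.deviceAddress = scratchAddress}, // placeholder
    };
    const VkAccelerationStructureBuildRangeInfoKHR rangeInfo = {.primitiveCount = triangleCount};
    const VkAccelerationStructureBuildRangeInfoKHR* pRangeInfos = &rangeInfo;
    vkCmdBuildAccelerationStructuresKHR(cmdbuf, 1, &buildInfo, &pRangeInfos);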
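
The compaction bookkeeping introduced in PATCH 10 (compactedASWriteOffset, compactedBLASMap/compactedTLASMap and the cache swap) prepares for the standard Vulkan compaction sequence: query the compacted size after the build, create a smaller acceleration structure, then copy with the COMPACT mode. A bare-bones sketch at the Vulkan level, with query-pool reset, barriers and submission omitted; all handles are placeholders.

    // 1) after building `builtAS`, with a barrier making the build visible to the query:
    vkCmdWriteAccelerationStructuresPropertiesKHR(cmdbuf, 1, &builtAS,
        VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR, queryPool, 0);
    // 2) submit, wait, then read the result on the host:
    VkDeviceSize compactedSize = 0;
    vkGetQueryPoolResults(device, queryPool, 0, 1, sizeof(compactedSize), &compactedSize,
        sizeof(compactedSize), VK_QUERY_RESULT_WAIT_BIT|VK_QUERY_RESULT_64_BIT);
    // 3) create a new acceleration structure of `compactedSize` bytes, then copy-compact into it:
    const VkCopyAccelerationStructureInfoKHR copyInfo = {
        .sType = VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,
        .src = builtAS,
        .dst = compactedAS, // placeholder for the smaller AS
        .mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR,
    };
    vkCmdCopyAccelerationStructureKHR(cmdbuf, &copyInfo);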
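
findBindingStorageIndex maps a flattened descriptor storage offset back to the binding whose range contains it; it is an upper_bound over the cumulative per-binding offsets, and the subtraction order fixed in PATCH 11 is what recovers the binding index. The same lookup on plain arrays, as a standalone illustration with made-up counts:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // cumulative end-offsets of 3 bindings with 4, 1 and 6 descriptors respectively
        const uint32_t storageOffsets[] = {4,5,11};
        const uint32_t offset = 7; // flat index of the descriptor we want to locate

        // first binding whose cumulative end-offset is greater than `offset`
        const auto* found = std::upper_bound(std::begin(storageOffsets),std::end(storageOffsets),offset);
        const auto bindingIx = found-std::begin(storageOffsets); // note the subtraction order
        const uint32_t firstElementOffset = bindingIx ? storageOffsets[bindingIx-1]:0u;
        std::printf("binding %lld, array element %u\n",static_cast<long long>(bindingIx),offset-firstElementOffset);
        return 0;
    }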
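
The IDeferredOperation race fixed in PATCH 11 comes from several threads polling the same deferred operation and racing on m_resourceTracking.clear(); the fix serializes the clear with a mutex. The same pattern in isolation, using only the standard library; class and member names here are illustrative, not the engine's.

    #include <memory>
    #include <mutex>
    #include <vector>

    class DeferredOperation
    {
        public:
            // multiple threads may poll this concurrently
            bool poll()
            {
                const bool complete = pollImpl();
                if (complete)
                {
                    // without the lock, two threads finishing at the same time would race on the vector
                    std::lock_guard<std::mutex> lock(m_trackingLock);
                    m_resourceTracking.clear();
                }
                return complete;
            }

        private:
            bool pollImpl() {return true;} // stand-in for the driver-side join/poll call
            std::mutex m_trackingLock;
            std::vector<std::shared_ptr<void>> m_resourceTracking;
    };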