diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 94e53238b..46d8a5c4c 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -155,7 +155,7 @@ class ITopLevelAccelerationStructure : public AccelerationStructure PREFER_FAST_BUILD_BIT = 0x1u<<3u, LOW_MEMORY_BIT = 0x1u<<4u, // Synthetic flag we use to indicate `VkAccelerationStructureGeometryInstancesDataKHR::arrayOfPointers` - INSTANCE_DATA_IS_POINTERS_TYPE_ENCODED_LSB = 0x1u<<5u, + INSTANCE_DATA_IS_POINTERS_TYPE_ENCODED_LSB = 0x1u<<5u, // this flag really shouldn't be settable outside of `video::IGPU` // Provided by VK_NV_ray_tracing_motion_blur, but we always override and deduce from creation flag because of // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-dstAccelerationStructure-04927 //MOTION_BIT = 0x1u<<5u, diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index dfc96ce01..1c83afc78 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -47,7 +47,7 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati return {m_geometryPrimitiveCount->begin(),m_geometryPrimitiveCount->end()}; return {}; } - inline std::span getGeometryPrimitiveCounts(const size_t geomIx) const + inline std::span getGeometryPrimitiveCounts() const { if (m_geometryPrimitiveCount) return {m_geometryPrimitiveCount->begin(),m_geometryPrimitiveCount->end()}; @@ -79,7 +79,7 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati { if (!isMutable()) return false; - m_buildFlags &= BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + m_buildFlags &= ~BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; m_geometryPrimitiveCount = std::move(ranges); m_triangleGeoms = std::move(geometries); m_AABBGeoms = nullptr; @@ -87,17 +87,17 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati } // - inline core::SRange> getAABBGeometries() + inline std::span> getAABBGeometries() { if (!isMutable() || !m_AABBGeoms) - return {nullptr,nullptr}; - return {m_AABBGeoms->begin(),m_AABBGeoms->end()}; + return {}; + return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } - inline core::SRange> getAABBGeometries() const + inline std::span> getAABBGeometries() const { if (!m_AABBGeoms) - return {nullptr,nullptr}; - return {m_AABBGeoms->begin(),m_AABBGeoms->end()}; + return {}; + return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline bool setGeometries(core::smart_refctd_dynamic_array>&& geometries, core::smart_refctd_dynamic_array&& ranges) { @@ -337,17 +337,17 @@ class ICPUTopLevelAccelerationStructure final : public ITopLevelAccelerationStru std::variant instance = StaticInstance{}; }; - core::SRange getInstances() + std::span getInstances() { if (!isMutable() || !m_instances) - return {nullptr,nullptr}; - return {m_instances->begin(),m_instances->end()}; + return {}; + return {m_instances->data(),m_instances->size()}; } - core::SRange getInstances() const + std::span getInstances() const { if (!m_instances) - return {nullptr,nullptr}; - return {m_instances->begin(),m_instances->end()}; + return {}; + return {m_instances->data(),m_instances->size()}; } bool setInstances(core::smart_refctd_dynamic_array&& _instances) { @@ -367,7 +367,7 @@ class ICPUTopLevelAccelerationStructure final : public ITopLevelAccelerationStru } //! 
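Not part of the patch: a standalone sketch of why the `setGeometries` fix above matters, i.e. `m_buildFlags &= ~GEOMETRY_TYPE_IS_AABB_BIT` clears just the AABB bit, whereas the old `&= GEOMETRY_TYPE_IS_AABB_BIT` would have masked away every other build flag. The enum values below are illustrative stand-ins, not Nabla's.

```cpp
#include <cstdint>
#include <cassert>

// minimal stand-in for the build flag bits referenced above (values are illustrative)
enum BUILD_FLAGS : uint32_t
{
	ALLOW_UPDATE_BIT          = 0x1u<<0u,
	ALLOW_COMPACTION_BIT      = 0x1u<<1u,
	GEOMETRY_TYPE_IS_AABB_BIT = 0x1u<<2u
};

int main()
{
	uint32_t buildFlags = ALLOW_UPDATE_BIT|GEOMETRY_TYPE_IS_AABB_BIT;
	// old code: `buildFlags &= GEOMETRY_TYPE_IS_AABB_BIT` keeps ONLY the AABB bit,
	// dropping ALLOW_UPDATE_BIT and leaving the AABB bit set for triangle geometry
	// fixed code: clear just the AABB bit when triangle geometries get assigned
	buildFlags &= ~GEOMETRY_TYPE_IS_AABB_BIT;
	assert(buildFlags==ALLOW_UPDATE_BIT);
	return 0;
}
```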
-		constexpr static inline auto AssetType = ET_BOTOM_LEVEL_ACCELERATION_STRUCTURE;
+		constexpr static inline auto AssetType = ET_TOP_LEVEL_ACCELERATION_STRUCTURE;
 		inline IAsset::E_TYPE getAssetType() const override { return AssetType; }
 		inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override
diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h
index 34e7d2ccc..803253e8b 100644
--- a/include/nbl/asset/IDescriptorSetLayout.h
+++ b/include/nbl/asset/IDescriptorSetLayout.h
@@ -147,6 +147,16 @@ class IDescriptorSetLayoutBase : public virtual core::IReferenceCounted // TODO:
 			return getStorageOffset(index);
 		}
+		// Weird functions for exceptional situations
+		inline storage_range_index_t findBindingStorageIndex(const storage_offset_t offset) const
+		{
+			const auto found = std::upper_bound(m_storageOffsets,m_storageOffsets+m_count,offset,[](storage_offset_t a, storage_offset_t b)->bool{return a.data<b.data;});
+			const auto ix = std::distance(m_storageOffsets,found);
+			if (ix>=m_count)
+				return {};
+			return storage_range_index_t(ix);
+		}
+
 		inline uint32_t getTotalCount() const { return (m_count == 0ull) ? 0u : m_storageOffsets[m_count - 1].data; }
 	private:
diff --git a/include/nbl/video/IDeferredOperation.h b/include/nbl/video/IDeferredOperation.h
index 0f85048f7..44f4d4221 100644
--- a/include/nbl/video/IDeferredOperation.h
+++ b/include/nbl/video/IDeferredOperation.h
@@ -30,7 +30,10 @@ class IDeferredOperation : public IBackendObject
 		{
 			const auto retval = execute_impl();
 			if (retval==STATUS::COMPLETED || retval==STATUS::_ERROR)
+			{
+				std::lock_guard lock(vectorLock);
 				m_resourceTracking.clear();
+			}
 			return retval;
 		}
@@ -66,6 +69,7 @@ class IDeferredOperation : public IBackendObject
 	private:
 		friend class ILogicalDevice;
 		// when we improve allocators, etc. we'll stop using STL containers here
+		std::mutex vectorLock;
 		core::vector> m_resourceTracking;
 };
diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h
index 276ed7450..ee7d31bdb 100644
--- a/include/nbl/video/ILogicalDevice.h
+++ b/include/nbl/video/ILogicalDevice.h
@@ -46,6 +46,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
 		//! Basic getters
+		inline system::ILogger* getLogger() const {return m_logger.get();}
+
 		inline const IPhysicalDevice* getPhysicalDevice() const { return m_physicalDevice; }
 		inline const SPhysicalDeviceFeatures& getEnabledFeatures() const { return m_enabledFeatures; }
@@ -358,7 +360,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
 		// Create a sampler object to use with ImageViews
 		virtual core::smart_refctd_ptr createSampler(const IGPUSampler::SParams& _params) = 0;
 		// acceleration structures
-		inline core::smart_refctd_ptr createBottomLevelAccelerationStructure(IGPUAccelerationStructure::SCreationParams&& params)
+		inline core::smart_refctd_ptr createBottomLevelAccelerationStructure(IGPUBottomLevelAccelerationStructure::SCreationParams&& params)
 		{
 			if (invalidCreationParams(params))
 			{
@@ -402,7 +404,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
 		inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(
 			const core::bitflag flags, const bool motionBlur,
-			const std::span geometries,
+			const std::span geometries,
 			const uint32_t* const pMaxPrimitiveCounts
 		) const
 		{
@@ -412,7 +414,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
 				return {};
 			}
-			if (!IGPUBottomLevelAccelerationStructure::validBuildFlags(flags, m_enabledFeatures))
+			if (!IGPUBottomLevelAccelerationStructure::validBuildFlags(flags,m_enabledFeatures))
 			{
 				NBL_LOG_ERROR("Invalid build flags");
 				return {};
diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h
index cc3e0a93b..8789c6307 100644
--- a/include/nbl/video/utilities/CAssetConverter.h
+++ b/include/nbl/video/utilities/CAssetConverter.h
@@ -38,7 +38,8 @@ class CAssetConverter : public core::IReferenceCounted
 				asset::ICPUSampler,
 				asset::ICPUShader,
 				asset::ICPUBuffer,
-				// acceleration structures,
+				asset::ICPUBottomLevelAccelerationStructure,
+				asset::ICPUTopLevelAccelerationStructure,
 				asset::ICPUImage,
 				asset::ICPUBufferView,
 				asset::ICPUImageView,
@@ -71,6 +72,14 @@ class CAssetConverter : public core::IReferenceCounted
 				{
 					if (!params.valid())
 						return nullptr;
+					#ifndef _NBL_DEBUG
+					if (!params.optimizer)
+					{
+						using pass_e = asset::ISPIRVOptimizer::E_OPTIMIZER_PASS;
+						// shall we do others?
+						params.optimizer = core::make_smart_refctd_ptr({pass_e::EOP_STRIP_DEBUG_INFO});
+					}
+					#endif
 					return core::smart_refctd_ptr(new CAssetConverter(std::move(params)),core::dont_grab);
 				}
 				// When getting dependents, the creation parameters of GPU objects will be produced and patched appropriately.
@@ -149,6 +158,75 @@ class CAssetConverter : public core::IReferenceCounted
 						return {true,retval};
 					}
 			};
+			struct NBL_API2 acceleration_structure_patch_base
+			{
+				public:
+					enum class BuildPreference : uint8_t
+					{
+						None = 0,
+						FastTrace = 1,
+						FastBuild = 2,
+						Invalid = 3
+					};
+
+					//! select build flags
+					uint8_t allowUpdate : 1 = false;
+					uint8_t allowCompaction : 1 = false;
+					uint8_t allowDataAccess : 1 = false;
+					BuildPreference preference : 2 = BuildPreference::Invalid;
+					uint8_t lowMemory : 1 = false;
+					//!
things that control the build + uint8_t hostBuild : 1 = false; + uint8_t compactAfterBuild : 1 = false; + + protected: + bool valid(const ILogicalDevice* device); + + template + std::pair combine_impl(const CRTP& _this, const CRTP& other) const + { + if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + return {false,_this}; + CRTP retval = _this; + retval.allowUpdate |= other.allowUpdate; + retval.allowCompaction |= other.allowCompaction; + retval.allowDataAccess |= other.allowDataAccess; + retval.lowMemory |= other.lowMemory; + retval.hostBuild |= other.hostBuild; + retval.compactAfterBuild |= other.compactAfterBuild; + return {true,retval}; + } + }; + template<> + struct NBL_API2 patch_impl_t : acceleration_structure_patch_base + { + public: + PATCH_IMPL_BOILERPLATE(asset::ICPUBottomLevelAccelerationStructure); + + using build_flags_t = asset::ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + core::bitflag getBuildFlags(const asset::ICPUBottomLevelAccelerationStructure* blas) const; + + protected: + inline std::pair combine(const this_t& other) const + { + return combine_impl(*this,other); + } + }; + template<> + struct NBL_API2 patch_impl_t : acceleration_structure_patch_base + { + public: + PATCH_IMPL_BOILERPLATE(asset::ICPUTopLevelAccelerationStructure); + + using build_flags_t = asset::ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + core::bitflag getBuildFlags(const asset::ICPUTopLevelAccelerationStructure* tlas) const; + + protected: + inline std::pair combine(const this_t& other) const + { + return combine_impl(*this,other); + } + }; template<> struct NBL_API2 patch_impl_t { @@ -458,6 +536,8 @@ class CAssetConverter : public core::IReferenceCounted virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; + virtual const patch_t* operator()(const lookup_t&) const = 0; + virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; @@ -577,6 +657,8 @@ class CAssetConverter : public core::IReferenceCounted bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); + bool operator()(lookup_t); + bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); @@ -717,6 +799,16 @@ class CAssetConverter : public core::IReferenceCounted return {}; } + // this a weird signature, but its for an acceleration structure backing IGPUBuffer + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBottomLevelAccelerationStructure* blas, const patch_t& patch) const + { + return {}; + } + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUTopLevelAccelerationStructure* tlas, const patch_t& patch) const + { + return {}; + } + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUImage* buffer, const patch_t& patch) const { return {}; @@ -793,6 +885,7 @@ class CAssetConverter : public core::IReferenceCounted { // By default the last to queue to touch a GPU object will own it after any transfer or compute operations are complete. // If you want to record a pipeline barrier that will release ownership to another family, override this. 
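Not part of the patch: a self-contained mock of the merge rule that `acceleration_structure_patch_base::combine_impl` above encodes -- permissive flags OR together, but the two patches must agree on a valid `BuildPreference` or the combine fails (forcing two distinct GPU objects). The types and field subset below are hypothetical stand-ins, not the converter's own.

```cpp
#include <cstdint>
#include <utility>
#include <cassert>

enum class BuildPreference : uint8_t { None, FastTrace, FastBuild, Invalid };

struct MockASPatch
{
	bool allowUpdate = false;
	bool allowCompaction = false;
	bool lowMemory = false;
	BuildPreference preference = BuildPreference::Invalid;

	// mirrors the OR-merge with preference-agreement check described above
	std::pair<bool,MockASPatch> combine(const MockASPatch& other) const
	{
		if (preference!=other.preference || preference==BuildPreference::Invalid)
			return {false,*this};
		MockASPatch retval = *this;
		retval.allowUpdate |= other.allowUpdate;
		retval.allowCompaction |= other.allowCompaction;
		retval.lowMemory |= other.lowMemory;
		return {true,retval};
	}
};

int main()
{
	MockASPatch a{.allowUpdate=true,.preference=BuildPreference::FastTrace};
	MockASPatch b{.allowCompaction=true,.preference=BuildPreference::FastTrace};
	auto [ok,merged] = a.combine(b);
	assert(ok && merged.allowUpdate && merged.allowCompaction);
	// disagreement on preference makes the patches un-mergeable
	MockASPatch c{.preference=BuildPreference::FastBuild};
	assert(!a.combine(c).first);
	return 0;
}
```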
+				// The overload for the IGPUBuffer may be called with a hash belonging to an Acceleration Structure; this means it's the storage buffer backing the AS
 				virtual inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom)
 				{
 					return IQueue::FamilyIgnored;
@@ -829,6 +922,11 @@ class CAssetConverter : public core::IReferenceCounted
 				IUtilities* utilities = nullptr;
 				// optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore
 				std::span extraSignalSemaphores = {};
+				// specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build
+				CAsyncSingleBufferSubAllocatorST* scratchForDeviceASBuild = nullptr;
+				std::pmr::memory_resource* scratchForHostASBuild = nullptr;
+				// needs to service allocations without limit, unlike the above where failure will just force a flush and the performing of already queued-up builds
+				IDeviceMemoryAllocator* compactedASAllocator = nullptr;
 				// specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set
 				uint32_t sampledImageBindingCount = 1<<10;
 				uint32_t storageImageBindingCount = 11<<10;
@@ -853,10 +951,22 @@ class CAssetConverter : public core::IReferenceCounted
 			// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739
 			inline core::bitflag getRequiredQueueFlags() const {return m_queueFlags;}
+			// This is just enough memory to build the Acceleration Structures one by one, waiting for each Device Build to complete in between. If 0, there are no Device AS Builds or Compactions to perform.
+			inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const {return m_minASBuildScratchSize[forHostOps];}
+			// Enough memory to build and compact all the Acceleration Structures at once, obviously respecting the order of BLAS (build->compact) -> TLAS (build->compact)
+			inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const {return m_maxASBuildScratchSize[forHostOps];}
+			// What usage flags your scratch buffer must have; if this returns NONE, there are no Device AS Builds to perform.
+			inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;}
+			// tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild`
+			inline bool willHostASBuild() const {return m_willHostBuildSomeAS;}
+			// tells you if you need to provide a valid `SConvertParams::compactedASAllocator`
+			inline bool willCompactAS() const {return m_willCompactSomeAS;}
+
 			//
 			inline operator bool() const {return bool(m_converter);}
-			// until `convert` is called, this will only contain valid entries for items already found in `SInput::readCache`
+			// Until `convert` is called, the Buffers and Images are not filled with content and Acceleration Structures are not built, unless found in the `SInput::readCache`
+			// WARNING: The Acceleration Structure Pointer WILL CHANGE after calling `convert` if its patch dictates that it will be compacted!
(since AS can't resize) // TODO: we could also return per-object semaphore values when object is ready for use (would have to propagate two semaphores up through dependants) template std::span> getGPUObjects() const {return std::get>(m_gpuObjects);} @@ -911,24 +1021,43 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_stagingCaches; // need a more explicit list of GPU objects that need device-assisted conversion template - struct ConversionRequest + struct SConversionRequestBase { // canonical asset (the one that provides content) core::smart_refctd_ptr canonical; // gpu object to transfer canonical's data to or build it from asset_traits::video_t* gpuObj; - // only relevant for images - uint16_t recomputeMips = 0; }; - template - using conversion_requests_t = core::vector>; - using convertible_asset_types = core::type_list< - asset::ICPUBuffer, - asset::ICPUImage/*, - asset::ICPUBottomLevelAccelerationStructure, - asset::ICPUTopLevelAccelerationStructure*/ - >; - core::tuple_transform_t m_conversionRequests; + using SConvReqBuffer = SConversionRequestBase; + core::vector m_bufferConversions; + struct SConvReqImage : SConversionRequestBase + { + bool recomputeMips = 0; + }; + core::vector m_imageConversions; + template// requires std::is_base_of_v + struct SConvReqAccelerationStructure : SConversionRequestBase + { + constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; + inline bool compact() const {return compactedASWriteOffset!=WontCompact;} + + using build_f = typename CPUAccelerationStructure::BUILD_FLAGS; + inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} + inline build_f getBuildFlags() const {return static_cast(buildFlags);} + + + uint64_t compactedASWriteOffset : 48 = WontCompact; + uint64_t buildFlags : 16 = static_cast(build_f::NONE); + }; + core::vector> m_blasConversions[2]; + core::vector> m_tlasConversions[2]; + + // + uint64_t m_minASBuildScratchSize[2] = {0,0}; + uint64_t m_maxASBuildScratchSize[2] = {0,0}; + core::bitflag m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE; + uint8_t m_willHostBuildSomeAS : 1 = false; + uint8_t m_willCompactSomeAS : 1 = false; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index cbbfe09a7..dcf27099a 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -40,6 +40,7 @@ bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* dev return true; } + CAssetConverter::patch_impl_t::patch_impl_t(const ICPUShader* shader) : stage(shader->getStage()) {} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) { @@ -99,6 +100,120 @@ bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* devi return true; } +bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDevice* device) +{ + // note that we don't check the validity of things we don't patch, all the instance and geometry data, but it will be checked by the driver anyway during creation/build + if (preference==BuildPreference::Invalid) // unititialized or just wrong + return false; + // potentially skip a lot of work and allocations + const auto& features = device->getEnabledFeatures(); + if (!features.accelerationStructure) + return false; + // just make the flags agree/canonicalize + allowCompaction = allowCompaction || compactAfterBuild; + // don't make invalid, just soft fail + const 
auto& limits = device->getPhysicalDevice()->getLimits(); + if (allowDataAccess) // TODO: && !limits.rayTracingPositionFetch) + allowDataAccess = false; + if (hostBuild && !features.accelerationStructureHostCommands) + hostBuild = false; + return true; +} +CAssetConverter::patch_impl_t::patch_impl_t(const ICPUBottomLevelAccelerationStructure* blas) +{ + const auto flags = blas->getBuildFlags(); + // straight up invalid + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) + return; + + allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); + allowCompaction = flags.hasFlags(build_flags_t::ALLOW_COMPACTION_BIT); + allowDataAccess = flags.hasFlags(build_flags_t::ALLOW_DATA_ACCESS_KHR); + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT)) + preference = BuildPreference::FastTrace; + else if (flags.hasFlags(build_flags_t::PREFER_FAST_BUILD_BIT)) + preference = BuildPreference::FastBuild; + else + preference = BuildPreference::None; + lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); +} +auto CAssetConverter::patch_impl_t::getBuildFlags(const ICPUBottomLevelAccelerationStructure* blas) const -> core::bitflag +{ + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; + auto flags = blas->getBuildFlags()&(~OverridableMask); + if (lowMemory) + flags |= build_flags_t::LOW_MEMORY_BIT; + if (allowDataAccess) + flags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; + if (allowCompaction) + flags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (allowUpdate) + flags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + flags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + flags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + return flags; +} +bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) +{ + return acceleration_structure_patch_base::valid(device); +} +CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* tlas) +{ + const auto flags = tlas->getBuildFlags(); + // straight up invalid + if (tlas->getInstances().empty()) + return; + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) + return; + + allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); + allowCompaction = flags.hasFlags(build_flags_t::ALLOW_COMPACTION_BIT); + allowDataAccess = false; + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT)) + preference = BuildPreference::FastTrace; + else if (flags.hasFlags(build_flags_t::PREFER_FAST_BUILD_BIT)) + preference = BuildPreference::FastBuild; + else + preference = BuildPreference::None; + lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); +} +auto CAssetConverter::patch_impl_t::getBuildFlags(const ICPUTopLevelAccelerationStructure* tlas) const -> core::bitflag +{ + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; + auto flags = tlas->getBuildFlags()&(~OverridableMask); + if (lowMemory) + flags |= build_flags_t::LOW_MEMORY_BIT; + if (allowCompaction) + flags |= 
build_flags_t::ALLOW_COMPACTION_BIT; + if (allowUpdate) + flags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + flags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + flags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + return flags; +} +bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) +{ + return acceleration_structure_patch_base::valid(device); +} + // smol utility function template void deduceMetaUsages(Patch& patch, const core::bitflag usages, const E_FORMAT originalFormat, const bool hasDepthAspect=true) @@ -318,6 +433,25 @@ class AssetVisitor : public CRTP } private: + // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` + inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) + { + const auto blasInstances = instance.asset->getInstances(); + if (blasInstances.empty()) // TODO: is it valid to have a BLASless TLAS? + return false; + for (size_t i=0; i patch = {blas}; + if (userPatch.allowDataAccess) // TODO: check if all BLAS within TLAS need to have the flag ON vs OFF or only some + patch.allowDataAccess = true; + if (!descend(blas,std::move(patch),i)) + return false; + } + return true; + } inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto* dep = instance.asset->getUnderlyingBuffer(); @@ -556,8 +690,10 @@ class AssetVisitor : public CRTP } case IDescriptor::EC_ACCELERATION_STRUCTURE: { - _NBL_TODO(); - [[fallthrough]]; + auto tlas = static_cast(untypedDesc); + if (!descend(tlas,{tlas},type,binding,el)) + return false; + break; } default: assert(false); @@ -845,6 +981,8 @@ class PatchOverride final : public CAssetConverter::CHashCache::IPatchOverride inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} + inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} + inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} @@ -955,6 +1093,50 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t loo hasher.update(&patchedParams,sizeof(patchedParams)) << lookup.asset->getContentHash(); return true; } +bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) +{ + // extras from the patch + hasher << lookup.patch->hostBuild; + hasher << lookup.patch->compactAfterBuild; + // overriden flags + hasher << lookup.patch->getBuildFlags(lookup.asset); + // finally the contents +//TODO: hasher << lookup.asset->getContentHash(); + return true; +} +bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) +{ + const auto* asset = lookup.asset; + // + AssetVisitor> visitor = { + *this, + {asset,static_cast(patchOverride)->uniqueCopyGroupID}, + *lookup.patch + }; + if (!visitor()) + return false; + // extras from the patch + hasher << 
lookup.patch->hostBuild; + hasher << lookup.patch->compactAfterBuild; + // overriden flags + hasher << lookup.patch->getBuildFlags(lookup.asset); + const auto instances = asset->getInstances(); + // important two passes to not give identical data due to variable length polymorphic array being hashed + for (const auto& instance : instances) + hasher << instance.getType(); + for (const auto& instance : instances) + { + std::visit([&hasher](const auto& typedInstance)->void + { + using instance_t = std::decay_t; + // the BLAS pointers (the BLAS contents already get hashed via asset visitor and `getDependent`, its only the metadate we need to hash + hasher.update(&typedInstance,offsetof(instance_t,base)+offsetof(ICPUTopLevelAccelerationStructure::Instance,blas)); + }, + instance.instance + ); + } + return true; +} bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) { // failed promotion @@ -1366,8 +1548,8 @@ void CAssetConverter::CHashCache::eraseStale(const IPatchOverride* patchOverride rehash.operator()(); rehash.operator()(); rehash.operator()(); -// rehash.operator()(); -// rehash.operator()(); + rehash.operator()(); + rehash.operator()(); // only once all the descriptor types have been hashed, we can hash sets rehash.operator()(); // naturally any pipeline depends on shaders and pipeline cache @@ -1417,6 +1599,27 @@ class GetDependantVisitBase template class GetDependantVisit; +template<> +class GetDependantVisit : public GetDependantVisitBase +{ + public: + // because of the deferred building of TLASes and lack of lifetime tracking between them, nothing to do on some passes + //core::smart_refctd_ptr* const outBLASes; + + protected: + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, + const uint32_t instanceIndex // not the custom index, its literally just an ordinal in `getInstances()` + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + // outBLASes[instanceIndex] = std::move(depObj); + return true; + } +}; template<> class GetDependantVisit : public GetDependantVisitBase { @@ -1848,6 +2051,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult case ICPUImageView::AssetType: visit.operator()(entry); break; + case ICPUTopLevelAccelerationStructure::AssetType: + visit.operator()(entry); + break; // these assets have no dependants, should have never been pushed on the stack default: assert(false); @@ -1963,12 +2169,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult using memory_backed_ptr_variant_t = std::variant*,asset_cached_t*>; core::map> allocationRequests; // for this we require that the data storage for the dfsCaches' nodes does not change - auto requestAllocation = [&inputs,device,&allocationRequests](asset_cached_t* pGpuObj)->bool + auto requestAllocation = [&inputs,device,&allocationRequests](asset_cached_t* pGpuObj, const uint32_t memoryTypeConstraint=~0u)->bool { auto* gpuObj = pGpuObj->get(); const IDeviceMemoryBacked::SDeviceMemoryRequirements& memReqs = gpuObj->getMemoryReqs(); // this shouldn't be possible - assert(memReqs.memoryTypeBits); + assert(memReqs.memoryTypeBits&memoryTypeConstraint); // allocate right away those that need their own allocation if (memReqs.requiresDedicatedAllocation) { @@ -1984,17 +2190,34 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // make the creation conditional upon allocation success MemoryRequirementBin 
reqBin = { - .compatibileMemoryTypeBits = memReqs.memoryTypeBits, + .compatibileMemoryTypeBits = memReqs.memoryTypeBits&memoryTypeConstraint, // we ignore this for now, because we can't know how many `DeviceMemory` objects we have left to make, so just join everything by default //.refersDedicatedAllocation = memReqs.prefersDedicatedAllocation }; if constexpr (std::is_same_v,IGPUBuffer>) - reqBin.needsDeviceAddress = gpuObj->getCreationParams().usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT); + { + const auto usage = gpuObj->getCreationParams().usage; + reqBin.needsDeviceAddress = usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT); + // stops us needing weird awful code to ensure buffer storing AS has alignment of at least 256 + assert(!usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT) || memReqs.alignmentLog2>=8); + } allocationRequests[reqBin].emplace_back(pGpuObj); } return true; }; + // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation + struct DeferredASCreationParams + { + asset_cached_t storage; + size_t scratchSize : 62 = 0; + size_t motionBlur : 1 = false; + size_t compactAfterBuild : 1 = false; + size_t inputSize = 0; + uint32_t maxInstanceCount = 0; + }; + core::vector accelerationStructureParams[2]; + // Deduplication, Creation and Propagation auto dedupCreateProp = [&]()->void { @@ -2014,7 +2237,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult contentHash = retval.getHashCache()->hash( {instance.asset,&created.patch}, &patchOverride, - /*.mistrustLevel = */1 + /*.mistrustLevel =*/ 1 ); // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) if (contentHash==CHashCache::NoContentHash) @@ -2164,6 +2387,133 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } + if constexpr (std::is_same_v || std::is_same_v) + { + using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; + const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); + const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); + + constexpr bool IsTLAS = std::is_same_v; + accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); + for (auto& entry : conversionRequests) + for (auto i=0ull; iusesMotion(); + // we will need to temporarily store the build input buffers somewhere + size_t inputSize = 0; + ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; + { + const auto buildFlags = patch.getBuildFlags(as); + if constexpr (IsTLAS) + { + AssetVisitor> visitor = { + {visitBase}, + {asset,uniqueCopyGroupID}, + patch + }; + if (!visitor()) + continue; + const auto instanceCount = as->getInstances().size(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); + inputSize = (motionBlur ? 
sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + } + else + { + const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); + // the code here is not pretty, but DRY-ing is of this is for later + if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + const auto geoms = as->getAABBGeometries(); + if (patch.hostBuild) + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + } + else + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; + } + } + else + { + core::map allocationsPerStride; + const auto geoms = as->getTriangleGeometries(); + if (patch.hostBuild) + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + } + else + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + { + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + allocationsPerStride[sizeof(uint16_t)] += triCount*3; + break; + case E_INDEX_TYPE::EIT_32BIT: + allocationsPerStride[sizeof(uint32_t)] += triCount*3; + break; + default: + break; + } + size_t bytesPerVertex = geom.vertexStride; + if (geom.vertexData[1]) + bytesPerVertex += bytesPerVertex; + allocationsPerStride[geom.vertexStride] += geom.maxVertex; + } + } + for (const auto& entry : allocationsPerStride) + inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; + } + } + } + if (!sizes) + continue; + // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure + IGPUBuffer::SCreationParams params = {}; + constexpr size_t MinASBufferAlignment = 256u; + params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); + params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + // concurrent ownership if any + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); + params.queueFamilyIndexCount = queueFamilies.size(); + params.queueFamilyIndices = queueFamilies.data(); + // we need to save the buffer in a side-channel for later + auto& out = accelerationStructureParams[IsTLAS][baseOffset+entry.second.firstCopyIx+i]; + out = { + .storage = device->createBuffer(std::move(params)), + .scratchSize = sizes.buildScratchSize, + .motionBlur = motionBlur, + .compactAfterBuild = patch.compactAfterBuild, + .inputSize = inputSize + 
}; + if (out.storage) + requestAllocation(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes); + } + } if constexpr (std::is_same_v) { for (auto& entry : conversionRequests) @@ -2600,9 +2950,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - auto& stagingCache = std::get>(retval.m_stagingCaches); - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures + if constexpr (!std::is_same_v && !std::is_same_v) + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void { + auto& stagingCache = std::get>(retval.m_stagingCaches); // already found in read cache and not converted if (created.gpuObj) return; @@ -2664,16 +3016,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult return; } } - // this is super annoying, was hoping metaprogramming with `has_type` would actually work - auto getConversionRequests = [&]()->auto&{return std::get>(retval.m_conversionRequests);}; + // if constexpr (std::is_same_v) - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; + retval.m_bufferConversions.emplace_back({core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); if constexpr (std::is_same_v) { const uint16_t recomputeMips = created.patch.recomputeMips; - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); + retval.m_imageConversions.emplace_back({core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); } - // TODO: BLAS and TLAS requests +// TODO: BLAS and TLAS requests } ); }; @@ -2681,8 +3032,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. // If two Asset chains are independent then we order them from most catastrophic failure to least. dedupCreateProp.operator()(); + dedupCreateProp.operator()(); + dedupCreateProp.operator()(); dedupCreateProp.operator()(); -// TODO: add backing buffers (not assets) for BLAS and TLAS builds // Allocate Memory { auto getAsBase = [](const memory_backed_ptr_variant_t& var) -> const IDeviceMemoryBacked* @@ -2877,7 +3229,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto& req : reqBin.second) { const auto asBacked = getAsBase(req); - if (asBacked->getBoundMemory().isValid()) + if (asBacked->getBoundMemory().isValid()) // TODO: maybe change to an assert? 
Our `failures` should really kinda make sure we never
 					continue;
 				switch (req.index())
 				{
@@ -2895,8 +3247,60 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 			}
 			allocationRequests.clear();
 		}
-//		dedupCreateProp.operator()();
-//		dedupCreateProp.operator()();
+		// Deal with Deferred Creation of Acceleration Structures
+		{
+			for (auto asLevel=0; asLevel<2; asLevel++)
+			{
+				// each of these stages must have a barrier in between
+				size_t scratchSizeFullParallelBuild = 0;
+				size_t scratchSizeFullParallelCompact = 0;
+				// we collect the stats AFTER making sure that the BLAS / TLAS can actually be created
+				for (const auto& deferredParams : accelerationStructureParams[asLevel])
+				{
+					// buffer failed to create/allocate
+					if (!deferredParams.storage.get())
+						continue;
+					IGPUAccelerationStructure::SCreationParams baseParams;
+					{
+						auto* buf = deferredParams.storage.get();
+						const auto bufSz = buf->getSize();
+						using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS;
+						baseParams = {
+							.bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)},
+							.flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE
+						};
+					}
+					smart_refctd_ptr as;
+					if (asLevel)
+					{
+						as = device->createTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount});
+					}
+					else
+					{
+						as = device->createBottomLevelAccelerationStructure({baseParams});
+					}
+					// note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build
+					const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize;
+					// sizes for building 1-by-1 vs parallel
+					retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize);
+					scratchSizeFullParallelBuild += buildSize;
+					if (deferredParams.compactAfterBuild)
+						scratchSizeFullParallelCompact += deferredParams.scratchSize;
+					// triangles, AABBs or Instance Transforms will need to be supplied from VRAM
+					// TODO: also mark somehow that we'll need a BUILD INPUT READ ONLY BUFFER WITH XFER usage
+					if (deferredParams.inputSize)
+						retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
+				}
+				//
+				retval.m_maxASBuildScratchSize = core::max(core::max(scratchSizeFullParallelBuild,scratchSizeFullParallelCompact),retval.m_maxASBuildScratchSize);
+			}
+			//
+			if (retval.m_minASBuildScratchSize)
+			{
+				retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
+			}
+		}
 		dedupCreateProp.operator()();
 		dedupCreateProp.operator()();
 		dedupCreateProp.operator()();
@@ -2932,7 +3336,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 			}
 			const auto& found = dfsCache.nodes[metadata[i].patchIndex.value];
 			// write it out to the results
-			if (const auto& gpuObj=found.gpuObj; gpuObj) // found from the `input.readCache`
+			if (const auto& gpuObj=found.gpuObj; gpuObj)
 			{
 				results[i] = gpuObj;
 				// if something with this content hash is in the stagingCache, then it must match the `found->gpuObj`
@@ -2967,12 +3371,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul
 	auto device = m_params.device;
 	const auto reqQueueFlags = reservations.getRequiredQueueFlags();
+	// compacted TLASes need to be substituted in cache and Descriptor Sets
+	core::unordered_map> compactedTLASMap;
 	// Anything to do?
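Not part of the patch: a standalone sketch of the scratch-size accounting the deferred-creation loop above performs -- the "min" figure ends up being the largest single build (inputs plus scratch, built one at a time) while the "max" figure is what a fully parallel build or compaction pass of one AS level would need. The types below are illustrative, not the converter's.

```cpp
#include <cstdint>
#include <vector>
#include <algorithm>

struct ASBuildCost { uint64_t inputSize; uint64_t scratchSize; bool compactAfterBuild; };
struct ScratchBudget { uint64_t minForOneByOne = 0; uint64_t maxForFullParallel = 0; };

ScratchBudget computeScratchBudget(const std::vector<ASBuildCost>& builds)
{
	ScratchBudget retval = {};
	uint64_t fullParallelBuild = 0, fullParallelCompact = 0;
	for (const auto& b : builds)
	{
		// the build inputs (geometry/instance data) live in the scratch buffer as well
		const uint64_t buildSize = b.inputSize+b.scratchSize;
		retval.minForOneByOne = std::max(buildSize,retval.minForOneByOne);
		fullParallelBuild += buildSize;
		if (b.compactAfterBuild)
			fullParallelCompact += b.scratchSize;
	}
	retval.maxForFullParallel = std::max(fullParallelBuild,fullParallelCompact);
	return retval;
}

int main()
{
	const auto budget = computeScratchBudget({{1<<20,1<<16,true},{1<<21,1<<17,false}});
	return budget.minForOneByOne<=budget.maxForFullParallel ? 0 : 1;
}
```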
if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { if (reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT) && (!params.utilities || params.utilities->getLogicalDevice()!=device)) { - logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!", system::ILogger::ELL_ERROR); + logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!",system::ILogger::ELL_ERROR); return retval; } @@ -3001,6 +3407,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // If the transfer queue will be used, the transfer Intended Submit Info must be valid and utilities must be provided auto reqTransferQueueCaps = IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Depth/Stencil transfers need Graphics Capabilities, so make sure the queue chosen for transfers also has them! if (reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::GRAPHICS_BIT)) reqTransferQueueCaps |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; if (invalidIntended(reqTransferQueueCaps,params.transfer)) @@ -3023,7 +3430,52 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + // check things necessary for building Acceleration Structures + using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; + if (reservations.m_ASBuildScratchUsages!=buffer_usage_f::EUF_NONE) + { + if (!params.scratchForDeviceASBuild) + { + logger.log("An Acceleration Structure will be built on Device but no scratch allocator provided!",system::ILogger::ELL_ERROR); + return retval; + } + // TODO: do the build input buffers also need `EUF_STORAGE_BUFFER_BIT` ? + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; + // we may use the staging buffer directly to skip an extra copy on small enough geometries + if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); + return retval; + } + constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; + // we use the scratch allocator both for scratch and uploaded geometry data + if (!params.scratchForDeviceASBuild->getBuffer()->getCreationParams().usage.hasFlags(asBuildScratchFlags|asBuildInputFlags)) + { + logger.log("An Acceleration Structure will be built on Device but scratch buffer doesn't have required usage flags!",system::ILogger::ELL_ERROR); + return retval; + } + const auto& addrAlloc = params.scratchForDeviceASBuild->getAddressAllocator(); + // could have used an address allocator trait to work this out, same verbosity + if (addrAlloc.get_allocated_size()+addrAlloc.get_free_size()(const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* { auto& stagingCache = std::get>(reservations.m_stagingCaches); @@ -3085,7 +3537,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // upload Buffers - auto& buffersToUpload = std::get>(reservations.m_conversionRequests); + auto& buffersToUpload = reservations.m_bufferConversions; { core::vector> ownershipTransfers; ownershipTransfers.reserve(buffersToUpload.size()); @@ -3142,9 
+3594,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul
 	const auto computeFamily = shouldDoSomeCompute ? params.compute->queue->getFamilyIndex():IQueue::FamilyIgnored;
 	// whenever transfer needs to do a submit overflow because it ran out of memory for streaming an image, we can already submit the recorded mip-map compute shader dispatches
 	auto computeCmdBuf = shouldDoSomeCompute ? params.compute->getCommandBufferForRecording():nullptr;
-	auto drainCompute = [&params,shouldDoSomeCompute,&computeCmdBuf](const std::span extraSignal={})->auto
+	auto drainCompute = [&params,&computeCmdBuf](const std::span extraSignal={})->auto
 	{
-		if (!shouldDoSomeCompute || computeCmdBuf->cmdbuf->empty())
+		if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty())
 			return IQueue::RESULT::SUCCESS;
 		// before we overflow submit we need to inject extra wait semaphores
 		auto& waitSemaphoreSpan = params.compute->waitSemaphores;
@@ -3163,6 +3615,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul
 		IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal);
 		if (res!=IQueue::RESULT::SUCCESS)
 			return res;
+		// set to empty so we don't grow over and over again
+		waitSemaphoreSpan = {};
 		if (!params.compute->beginNextCommandBuffer(computeCmdBuf))
 			return IQueue::RESULT::OTHER_ERROR;
 		return res;
@@ -3177,7 +3631,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul
 		origXferStallCallback(tillScratchResettable);
 	};
-	auto& imagesToUpload = std::get>(reservations.m_conversionRequests);
+	auto& imagesToUpload = reservations.m_imageConversions;
 	if (!imagesToUpload.empty())
 	{
 		//
@@ -3634,9 +4088,71 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul
 		imagesToUpload.clear();
 	}
-	// TODO: build BLASes and TLASes
+	// BLAS builds
+	core::unordered_map> compactedBLASMap;
+	auto& blasToBuild = reservations.m_blasConversions[0];
+	if (const auto blasCount = blasToBuild.size(); blasCount)
+	{
+		constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+
+		core::vector buildInfos; buildInfos.reserve(blasCount);
+		core::vector rangeInfo; rangeInfo.reserve(blasCount);
+		core::vector> triangles;
+		core::vector> aabbs;
+		{
+			size_t totalTriGeoCount = 0;
+			size_t totalAABBGeoCount = 0;
+			for (auto& item : blasToBuild)
+			{
+				const size_t geoCount = item.canonical->getGeometryCount();
+				if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag))
+					totalAABBGeoCount += geoCount;
+				else
+					totalTriGeoCount += geoCount;
+			}
+			triangles.reserve(totalTriGeoCount);
+			aabbs.reserve(totalAABBGeoCount);
+		}
+#if 0
+		for (auto& item : blasToBuild)
+		{
+			auto* as = item.gpuObj;
+			auto pFoundHash = findInStaging.operator()(as);
+			if (item.asBuildParams.host)
+			{
+				auto dOp = device->createDeferredOperation();
+				//
+				if (!device->buildAccelerationStructure(dOp.get(),info,range))
+				{
+					markFailureInStaging(gpuObj,pFoundHash);
+					continue;
+				}
+			}
+			else
+			{
+				auto& buildInfo = buildInfos.emplace_back({
+					.buildFlags = item.buildFlags,
+					.geometryCount = item.canonical->getGeometryCount(),
+					// this is not an update
+					.srcAS = nullptr,
+					.dstAS = as.get()
+				});
+				if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag))
+					buildInfo.aabbs = nullptr;
+				else
+					buildInfo.triangles = nullptr;
+				computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo);
+			}
+		}
+#endif
+	}
+
+	// TLAS builds
+	auto& tlasToBuild = reservations.m_tlasConversions[0];
+	if (!tlasToBuild.empty())
+	{
+	}
+	compactedBLASMap.clear();
 	const bool
computeSubmitIsNeeded = submitsNeeded.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); // first submit transfer @@ -3672,6 +4188,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted + core::vector tlasRewrites; tlasRewrites.reserve(compactedTLASMap.size()); + core::vector tlasInfos; tlasInfos.reserve(compactedTLASMap.size()); // want to check if deps successfully exist auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool { @@ -3689,12 +4208,17 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& cache = std::get>(m_caches); cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + constexpr bool IsTLAS = std::is_same_v; for (auto& item : stagingCache) if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped { // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set bool depsMissing = false; // only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable) + if constexpr (IsTLAS) + { + // there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job + } if constexpr (std::is_same_v) depsMissing = missingDependent.operator()(item.first->getUnderlyingBuffer()); if constexpr (std::is_same_v) @@ -3710,6 +4234,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (samplers[i]) depsMissing = missingDependent.operator()(samplers[i].get()); } + const auto tlasRewriteOldSize = tlasRewrites.size(); for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) { const auto type = static_cast(i); @@ -3736,8 +4261,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - _NBL_TODO(); - [[fallthrough]]; + { + const auto* tlas = static_cast(untypedDesc); + depsMissing = missingDependent.operator()(tlas); + if (!depsMissing) + { + auto found = compactedTLASMap.find(tlas); + if (found==compactedTLASMap.end()) + break; + // written TLAS got compacted, so queue the descriptor for update + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; + tlasRewrites.push_back({ + .set = item.first, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = i-firstElementOffset, + .count = 1, // write them one by one, no point optimizing + .info = nullptr // for now + }); + tlasInfos.emplace_back().desc = smart_refctd_ptr(found->second); + } + break; + } default: assert(false); depsMissing = true; @@ -3745,28 +4292,41 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } } + // don't bother overwriting a Descriptor Set that won't be marked as successfully converted (inserted into write cache) + if (depsMissing) + { + tlasRewrites.resize(tlasRewriteOldSize); + tlasInfos.resize(tlasRewriteOldSize); + } } + auto* pGpuObj = item.first; if (depsMissing) { const auto* hashAsU64 = 
reinterpret_cast(item.second.value.data); - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, item.first->getObjectDebugName()); + logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, pGpuObj->getObjectDebugName()); // wipe self, to let users know item.second.value = {}; continue; } - if (!params.writeCache(item.second)) + if (!params.writeCache(item.second)) // TODO: let the user know the pointer too? continue; + if constexpr (IsTLAS) + { + auto found = compactedTLASMap.find(pGpuObj); + if (found!=compactedTLASMap.end()) + pGpuObj = found->second.get(); + } asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(item.first); + cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); + cache.m_reverseMap.emplace(pGpuObj,item.second); cache.m_forwardMap.emplace(item.second,std::move(cached)); - cache.m_reverseMap.emplace(item.first,item.second); } }; // again, need to go bottom up so we can check dependencies being successes mergeCache.operator()(); mergeCache.operator()(); -// mergeCache.operator()(); -// mergeCache.operator()(); + mergeCache.operator()(); + mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); @@ -3778,6 +4338,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); + // deal with rewriting the TLASes with compacted ones + { + compactedTLASMap.clear(); + auto* infoIt = tlasInfos.data(); + for (auto& write : tlasRewrites) + write.info = infoIt++; + if (!tlasRewrites.empty()) + device->updateDescriptorSets(tlasRewrites,{}); + } // mergeCache.operator()(); // no submit was necessary, so should signal the extra semaphores from the host
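Not part of the patch: a minimal illustration of the write/info fix-up pattern used for the compacted-TLAS descriptor rewrites above -- the `SWriteDescriptorSet::info` pointers are only filled in once `tlasInfos` has stopped growing, because a growing vector may reallocate and invalidate earlier element addresses. The structs below are mock stand-ins for the descriptor-set write types.

```cpp
#include <vector>
#include <cassert>

struct Info { int descriptor = 0; };
struct Write { int binding = 0; const Info* info = nullptr; };

int main()
{
	std::vector<Write> writes;
	std::vector<Info> infos;
	for (int i=0; i<100; i++)
	{
		writes.push_back({.binding=i,.info=nullptr}); // deliberately left null while collecting
		infos.push_back({.descriptor=i});             // may reallocate, invalidating earlier addresses
	}
	// one final pass once `infos` is stable, right before the update call
	const Info* infoIt = infos.data();
	for (auto& w : writes)
		w.info = infoIt++;
	assert(writes[42].info->descriptor==42);
	return 0;
}
```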