From 1e3f5dd9322f1eb1ffd816ed7e5f59e54362e504 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 20 Nov 2024 16:44:36 +0100 Subject: [PATCH 01/11] Decide on the patchable parameters for the TLAS and BLAS builds. Note that pointer/build param encoding stuff shouldn't be in the CPU side but don't touch anything. Also fix a typo, change the SRange to a std::span, and add default SPIR-V optimizer if none provided to asset converter. --- include/nbl/asset/IAccelerationStructure.h | 2 +- include/nbl/asset/ICPUAccelerationStructure.h | 14 +-- include/nbl/video/utilities/CAssetConverter.h | 94 ++++++++++++++++++- 3 files changed, 99 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 94e53238b2..46d8a5c4c6 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -155,7 +155,7 @@ class ITopLevelAccelerationStructure : public AccelerationStructure PREFER_FAST_BUILD_BIT = 0x1u<<3u, LOW_MEMORY_BIT = 0x1u<<4u, // Synthetic flag we use to indicate `VkAccelerationStructureGeometryInstancesDataKHR::arrayOfPointers` - INSTANCE_DATA_IS_POINTERS_TYPE_ENCODED_LSB = 0x1u<<5u, + INSTANCE_DATA_IS_POINTERS_TYPE_ENCODED_LSB = 0x1u<<5u, // this flag really shouldn't be settable outside of `video::IGPU` // Provided by VK_NV_ray_tracing_motion_blur, but we always override and deduce from creation flag because of // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-dstAccelerationStructure-04927 //MOTION_BIT = 0x1u<<5u, diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index dfc96ce01a..87dcee99ab 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -337,17 +337,17 @@ class ICPUTopLevelAccelerationStructure final : public ITopLevelAccelerationStru std::variant instance = StaticInstance{}; }; - core::SRange getInstances() + std::span getInstances() { if (!isMutable() || !m_instances) - return {nullptr,nullptr}; - return {m_instances->begin(),m_instances->end()}; + return {}; + return {m_instances->data(),m_instances->size()}; } - core::SRange getInstances() const + std::span getInstances() const { if (!m_instances) - return {nullptr,nullptr}; - return {m_instances->begin(),m_instances->end()}; + return {}; + return {m_instances->data(),m_instances->size()}; } bool setInstances(core::smart_refctd_dynamic_array&& _instances) { @@ -367,7 +367,7 @@ class ICPUTopLevelAccelerationStructure final : public ITopLevelAccelerationStru } //! 
- constexpr static inline auto AssetType = ET_BOTOM_LEVEL_ACCELERATION_STRUCTURE; + constexpr static inline auto AssetType = ET_TOP_LEVEL_ACCELERATION_STRUCTURE; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index cc3e0a93b9..452a52f3da 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -38,7 +38,8 @@ class CAssetConverter : public core::IReferenceCounted asset::ICPUSampler, asset::ICPUShader, asset::ICPUBuffer, - // acceleration structures, + asset::ICPUBottomLevelAccelerationStructure, + asset::ICPUTopLevelAccelerationStructure, asset::ICPUImage, asset::ICPUBufferView, asset::ICPUImageView, @@ -71,6 +72,14 @@ class CAssetConverter : public core::IReferenceCounted { if (!params.valid()) return nullptr; + #ifndef _NBL_DEBUG + if (!params.optimizer) + { + using pass_e = asset::ISPIRVOptimizer::E_OPTIMIZER_PASS; + // shall we do others? + params.optimizer = core::make_smart_rectd_ptr({EOP_STRIP_DEBUG_INFO}); + } + #endif return core::smart_refctd_ptr(new CAssetConverter(std::move(params)),core::dont_grab); } // When getting dependents, the creation parameters of GPU objects will be produced and patched appropriately. @@ -149,6 +158,71 @@ class CAssetConverter : public core::IReferenceCounted return {true,retval}; } }; + struct NBL_API2 acceleration_structure_patch_base + { + public: + enum class BuildPreference : uint8_t + { + None = 0, + FastTrace = 1, + FastBuild = 2, + Invalid = 3 + }; + + //! select build flags + uint8_t allowUpdate : 1 = false; + uint8_t allowCompaction : 1 = false; + uint8_t allowDataAccess : 1 = false; + BuildPreference preference : 2 = BuildPreference::Invalid; + uint8_t lowMemory : 1 = false; + //! 
things that control the build + uint8_t hostBuild : 1 = false; + uint8_t compactAfterBuild : 1 = false; + + protected: + bool valid(const ILogicalDevice* device); + + template + std::pair combine_impl(const CRTP& _this, const CRTP& other) const + { + if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + return {false,_this}; + CRTP retval = _this; + retval.allowUpdate |= other.allowUpdate; + retval.allowCompaction |= other.allowCompaction; + retval.allowDataAccess |= other.allowDataAccess; + retval.lowMemory |= other.lowMemory; + retval.hostBuild |= other.hostBuild; + retval.compactAfterBuild |= other.compactAfterBuild; + return {true,retval}; + } + }; + template<> + struct NBL_API2 patch_impl_t : acceleration_structure_patch_base + { + public: + PATCH_IMPL_BOILERPLATE(asset::ICPUBottomLevelAccelerationStructure); + + protected: + using build_flags_t = asset::ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + inline std::pair combine(const this_t& other) const + { + return combine_impl(*this,other); + } + }; + template<> + struct NBL_API2 patch_impl_t : acceleration_structure_patch_base + { + public: + PATCH_IMPL_BOILERPLATE(asset::ICPUTopLevelAccelerationStructure); + + protected: + using build_flags_t = asset::ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + inline std::pair combine(const this_t& other) const + { + return combine_impl(*this,other); + } + }; template<> struct NBL_API2 patch_impl_t { @@ -458,6 +532,8 @@ class CAssetConverter : public core::IReferenceCounted virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; + virtual const patch_t* operator()(const lookup_t&) const = 0; + virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; virtual const patch_t* operator()(const lookup_t&) const = 0; @@ -577,6 +653,8 @@ class CAssetConverter : public core::IReferenceCounted bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); + bool operator()(lookup_t); + bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); bool operator()(lookup_t); @@ -829,6 +907,8 @@ class CAssetConverter : public core::IReferenceCounted IUtilities* utilities = nullptr; // optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore std::span extraSignalSemaphores = {}; + // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build + CAsyncSingleBufferSubAllocatorST<>* scratchForASBuild = nullptr; // specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; @@ -853,6 +933,11 @@ class CAssetConverter : public core::IReferenceCounted // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 inline core::bitflag getRequiredQueueFlags() const {return m_queueFlags;} + // just enough memory to build the Acceleration Structures one by one waiting for each build to complete inbetween + inline uint64_t getMinASBuildScratchSize() const {return m_minASBuildScratchSize;} + // enough memory to build and compact the 
all Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) + inline uint64_t getMaxASBuildScratchSize() const {return m_maxASBuildScratchSize;} + // inline operator bool() const {return bool(m_converter);} @@ -924,12 +1009,15 @@ class CAssetConverter : public core::IReferenceCounted using conversion_requests_t = core::vector>; using convertible_asset_types = core::type_list< asset::ICPUBuffer, - asset::ICPUImage/*, + asset::ICPUImage, asset::ICPUBottomLevelAccelerationStructure, - asset::ICPUTopLevelAccelerationStructure*/ + asset::ICPUTopLevelAccelerationStructure >; core::tuple_transform_t m_conversionRequests; + // + uint64_t m_minASBuildScratchSize = 0; + uint64_t m_maxASBuildScratchSize = 0; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; }; From f065d7c3b9b19c7e5958b598bb08069fdbf796d0 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 20 Nov 2024 16:45:08 +0100 Subject: [PATCH 02/11] start going through the implementation --- src/nbl/video/utilities/CAssetConverter.cpp | 200 +++++++++++++++++++- 1 file changed, 196 insertions(+), 4 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index cbbfe09a7c..95e21848e5 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -40,6 +40,7 @@ bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* dev return true; } + CAssetConverter::patch_impl_t::patch_impl_t(const ICPUShader* shader) : stage(shader->getStage()) {} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) { @@ -99,6 +100,67 @@ bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* devi return true; } +bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDevice* device) +{ + // note that we don't check the validity of things we don't patch, all the instance and geometry data, but it will be checked by the driver anyway during creation/build + if (preference==BuildPreference::Invalid) // unititialized or just wrong + return false; + // just make the flags agree/canonicalize + allowCompaction = allowCompaction || compactAfterBuild; + // don't make invalid, just soft fail + const auto& limits = device->getPhysicalDevice()->getLimits(); + if (allowDataAccess) // TODO: && !limits.rayTracingPositionFetch) + allowDataAccess = false; + const auto& features = device->getEnabledFeatures(); + if (hostBuild && !features.accelerationStructureHostCommands) + hostBuild = false; + return true; +} +CAssetConverter::patch_impl_t::patch_impl_t(const ICPUBottomLevelAccelerationStructure* blas) +{ + const auto flags = blas->getBuildFlags(); + // straight up invalid + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT | build_flags_t::PREFER_FAST_BUILD_BIT)) + return; + + allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); + allowCompaction = flags.hasFlags(build_flags_t::ALLOW_COMPACTION_BIT); + allowDataAccess = flags.hasFlags(build_flags_t::ALLOW_DATA_ACCESS_KHR); + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT)) + preference = BuildPreference::FastTrace; + else if (flags.hasFlags(build_flags_t::PREFER_FAST_BUILD_BIT)) + preference = BuildPreference::FastBuild; + else + preference = BuildPreference::None; + lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); +} +bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) +{ + return acceleration_structure_patch_base::valid(device); +} 
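// A minimal usage sketch (an editorial illustration, not an added line of this patch), assuming a
// previously set-up `cpuBlas` asset pointer; the constructor above packs the asset's BUILD_FLAGS
// into the public patch bitfields, and `acceleration_structure_patch_base::valid()` later
// canonicalizes them (`compactAfterBuild` implies `allowCompaction`, host builds get demoted when
// `accelerationStructureHostCommands` is not enabled, and contradictory PREFER_FAST_* flags leave
// `preference` at `BuildPreference::Invalid` so the patch is rejected):
//
//	CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure> patch(cpuBlas);
//	patch.compactAfterBuild = true; // hypothetical request, `valid()` will also raise `allowCompaction`
//	const bool rejected = patch.preference==CAssetConverter::acceleration_structure_patch_base::BuildPreference::Invalid;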
+CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* blas) +{ + const auto flags = blas->getBuildFlags(); + // straight up invalid + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) + return; + + allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); + allowCompaction = flags.hasFlags(build_flags_t::ALLOW_COMPACTION_BIT); + allowDataAccess = false; + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT)) + preference = BuildPreference::FastTrace; + else if (flags.hasFlags(build_flags_t::PREFER_FAST_BUILD_BIT)) + preference = BuildPreference::FastBuild; + else + preference = BuildPreference::None; + lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); +} +bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) +{ + return acceleration_structure_patch_base::valid(device); +} + // smol utility function template void deduceMetaUsages(Patch& patch, const core::bitflag usages, const E_FORMAT originalFormat, const bool hasDepthAspect=true) @@ -330,6 +392,24 @@ class AssetVisitor : public CRTP patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_TEXEL_BUFFER_BIT; return descend(dep,std::move(patch)); } + inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) + { + const auto blasInstances = instance.asset->getInstances(); + if (blasInstances.empty()) // TODO: is it valid to have a BLASless TLAS? + return false; + for (size_t i=0; i patch = {blas}; + if (userPatch.allowDataAccess) // TODO: check if all BLAS within TLAS need to have the flag ON vs OFF or only some + patch.allowDataAccess = true; + if (!descend(blas,std::move(patch),i)) + return false; + } + return true; + } inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto& params = instance.asset->getCreationParameters(); @@ -556,8 +636,10 @@ class AssetVisitor : public CRTP } case IDescriptor::EC_ACCELERATION_STRUCTURE: { - _NBL_TODO(); - [[fallthrough]]; + auto tlas = static_cast(untypedDesc); + if (!descend(tlas,{tlas},type,binding,el)) + return false; + break; } default: assert(false); @@ -845,6 +927,8 @@ class PatchOverride final : public CAssetConverter::CHashCache::IPatchOverride inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} + inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} + inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} inline const patch_t* operator()(const lookup_t& lookup) const override {return impl(lookup);} @@ -955,6 +1039,68 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t loo hasher.update(&patchedParams,sizeof(patchedParams)) << lookup.asset->getContentHash(); return true; } +bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) +{ + // extras from the patch + hasher << lookup.patch->hostBuild; + hasher << lookup.patch->compactAfterBuild; + // overriden flags + using build_flags_t = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + constexpr build_flags_t OverridableMask = 
build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; + auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); + if (lookup.patch->lowMemory) + patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; + if (lookup.patch->allowDataAccess) + patchedBuildFlags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; + if (lookup.patch->allowCompaction) + patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (lookup.patch->allowUpdate) + patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (lookup.patch->preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + hasher << patchedBuildFlags; +// TODO: hash the geometry metadata thats not already in the dependents (c.f. Renderpass, Descriptor Set) + return true; +} +bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) +{ + // extras from the patch + hasher << lookup.patch->hostBuild; + hasher << lookup.patch->compactAfterBuild; + // overriden flags + using build_flags_t = ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; + auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); + if (lookup.patch->lowMemory) + patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; + if (lookup.patch->allowCompaction) + patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (lookup.patch->allowUpdate) + patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (lookup.patch->preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + hasher << patchedBuildFlags; +// TODO: hash the instance metadata thats not already in the dependents (c.f. 
Renderpass, Descriptor Set) + return true; +} bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) { // failed promotion @@ -1366,8 +1512,8 @@ void CAssetConverter::CHashCache::eraseStale(const IPatchOverride* patchOverride rehash.operator()(); rehash.operator()(); rehash.operator()(); -// rehash.operator()(); -// rehash.operator()(); + rehash.operator()(); + rehash.operator()(); // only once all the descriptor types have been hashed, we can hash sets rehash.operator()(); // naturally any pipeline depends on shaders and pipeline cache @@ -1417,6 +1563,52 @@ class GetDependantVisitBase template class GetDependantVisit; +template<> +class GetDependantVisit : public GetDependantVisitBase +{ + public: + SBufferRange underlying = {}; + + protected: + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + underlying = { + .offset = user.asset->getOffsetInBuffer(), + .size = user.asset->getByteSize(), + .buffer = std::move(depObj) + }; + return underlying.isValid(); + } +}; +template<> +class GetDependantVisit : public GetDependantVisitBase +{ + public: + SBufferRange underlying = {}; + + protected: + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + underlying = { + .offset = user.asset->getOffsetInBuffer(), + .size = user.asset->getByteSize(), + .buffer = std::move(depObj) + }; + return underlying.isValid(); + } +}; template<> class GetDependantVisit : public GetDependantVisitBase { From 3689118cd968c02258208ffe6439b08451f31c3b Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 11:17:02 +0100 Subject: [PATCH 03/11] const correctness --- include/nbl/video/ILogicalDevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 276ed74501..bd47f6650c 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -402,7 +402,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( const core::bitflag flags, const bool motionBlur, - const std::span geometries, + const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { @@ -412,7 +412,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - if (!IGPUBottomLevelAccelerationStructure::validBuildFlags(flags, m_enabledFeatures)) + if (!IGPUBottomLevelAccelerationStructure::validBuildFlags(flags,m_enabledFeatures)) { NBL_LOG_ERROR("Invalid build flags"); return {}; From 215ee50a7541d66447cb76f9f04bbfe939514cab Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 11:18:15 +0100 Subject: [PATCH 04/11] shared ownership needs to be settled for the buffers backing acceelration structures --- include/nbl/video/utilities/CAssetConverter.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 452a52f3da..816d956d5f 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -203,8 +203,10 
@@ class CAssetConverter : public core::IReferenceCounted public: PATCH_IMPL_BOILERPLATE(asset::ICPUBottomLevelAccelerationStructure); - protected: using build_flags_t = asset::ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; + core::bitflag getBuildFlags(const asset::ICPUBottomLevelAccelerationStructure* blas) const; + + protected: inline std::pair combine(const this_t& other) const { return combine_impl(*this,other); @@ -216,8 +218,10 @@ class CAssetConverter : public core::IReferenceCounted public: PATCH_IMPL_BOILERPLATE(asset::ICPUTopLevelAccelerationStructure); - protected: using build_flags_t = asset::ICPUTopLevelAccelerationStructure::BUILD_FLAGS; + core::bitflag getBuildFlags(const asset::ICPUTopLevelAccelerationStructure* tlas) const; + + protected: inline std::pair combine(const this_t& other) const { return combine_impl(*this,other); @@ -795,6 +799,16 @@ class CAssetConverter : public core::IReferenceCounted return {}; } + // this a weird signature, but its for an acceleration structure backing IGPUBuffer + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBottomLevelAccelerationStructure* blas, const patch_t& patch) const + { + return {}; + } + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUTopLevelAccelerationStructure* tlas, const patch_t& patch) const + { + return {}; + } + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUImage* buffer, const patch_t& patch) const { return {}; From 19e3e57d56d8ed845b80ead7f3caaf97dbef5b32 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 11:18:30 +0100 Subject: [PATCH 05/11] fix typos --- include/nbl/asset/ICPUAccelerationStructure.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 87dcee99ab..c2779151d4 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -47,7 +47,7 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati return {m_geometryPrimitiveCount->begin(),m_geometryPrimitiveCount->end()}; return {}; } - inline std::span getGeometryPrimitiveCounts(const size_t geomIx) const + inline std::span getGeometryPrimitiveCounts() const { if (m_geometryPrimitiveCount) return {m_geometryPrimitiveCount->begin(),m_geometryPrimitiveCount->end()}; @@ -79,7 +79,7 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati { if (!isMutable()) return false; - m_buildFlags &= BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + m_buildFlags &= ~BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; m_geometryPrimitiveCount = std::move(ranges); m_triangleGeoms = std::move(geometries); m_AABBGeoms = nullptr; From 100ae7d121165752d3c37a07176f404878f3f94d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 16:28:13 +0100 Subject: [PATCH 06/11] Realize that compacted acceleration structures need an allocator for their storage. 
Change more stuff to span in `ICPUBottomLevelAccelerationStructure` Use a semantically better typedef/alias in `ILogicalDevice::createBottomLevelAccelerationStructure` --- include/nbl/asset/ICPUAccelerationStructure.h | 8 ++++---- include/nbl/video/ILogicalDevice.h | 2 +- include/nbl/video/utilities/CAssetConverter.h | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index c2779151d4..dcf2975b78 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -87,17 +87,17 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati } // - inline core::SRange> getAABBGeometries() + inline std::span> getAABBGeometries() { if (!isMutable() || !m_AABBGeoms) return {nullptr,nullptr}; - return {m_AABBGeoms->begin(),m_AABBGeoms->end()}; + return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } - inline core::SRange> getAABBGeometries() const + inline std::span> getAABBGeometries() const { if (!m_AABBGeoms) return {nullptr,nullptr}; - return {m_AABBGeoms->begin(),m_AABBGeoms->end()}; + return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline bool setGeometries(core::smart_refctd_dynamic_array>&& geometries, core::smart_refctd_dynamic_array&& ranges) { diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index bd47f6650c..39325bd6dd 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -358,7 +358,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a sampler object to use with ImageViews virtual core::smart_refctd_ptr createSampler(const IGPUSampler::SParams& _params) = 0; // acceleration structures - inline core::smart_refctd_ptr createBottomLevelAccelerationStructure(IGPUAccelerationStructure::SCreationParams&& params) + inline core::smart_refctd_ptr createBottomLevelAccelerationStructure(IGPUBottomLevelAccelerationStructure::SCreationParams&& params) { if (invalidCreationParams(params)) { diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 816d956d5f..57ddec91a9 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -885,6 +885,7 @@ class CAssetConverter : public core::IReferenceCounted { // By default the last to queue to touch a GPU object will own it after any transfer or compute operations are complete. // If you want to record a pipeline barrier that will release ownership to another family, override this. 
+ // The overload for the IGPUBuffer may be called with a hash belonging to a Acceleration Structure, this means that its the storage buffer backing the AS virtual inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) { return IQueue::FamilyIgnored; @@ -923,6 +924,8 @@ class CAssetConverter : public core::IReferenceCounted std::span extraSignalSemaphores = {}; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build CAsyncSingleBufferSubAllocatorST<>* scratchForASBuild = nullptr; + // + IDeviceMemoryAllocator* compactedASAllocator = nullptr; // specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; @@ -951,6 +954,8 @@ class CAssetConverter : public core::IReferenceCounted inline uint64_t getMinASBuildScratchSize() const {return m_minASBuildScratchSize;} // enough memory to build and compact the all Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) inline uint64_t getMaxASBuildScratchSize() const {return m_maxASBuildScratchSize;} + // if returns NONE means there are no acceleration structures to build + inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;} // inline operator bool() const {return bool(m_converter);} @@ -1016,8 +1021,17 @@ class CAssetConverter : public core::IReferenceCounted core::smart_refctd_ptr canonical; // gpu object to transfer canonical's data to or build it from asset_traits::video_t* gpuObj; - // only relevant for images - uint16_t recomputeMips = 0; + union + { + // only relevant for images + uint16_t recomputeMips = 0; + // + struct ASBuildParams + { + uint8_t host : 1; + uint8_t compact : 1; + } asBuildParams; + }; }; template using conversion_requests_t = core::vector>; @@ -1032,6 +1046,7 @@ class CAssetConverter : public core::IReferenceCounted // uint64_t m_minASBuildScratchSize = 0; uint64_t m_maxASBuildScratchSize = 0; + core::bitflag m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; }; From 0763416af703ed5550a636bfc753444b849a6c04 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 16:56:24 +0100 Subject: [PATCH 07/11] start attempts at AS creation --- src/nbl/video/utilities/CAssetConverter.cpp | 421 +++++++++++++++----- 1 file changed, 317 insertions(+), 104 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 95e21848e5..8c6ee875aa 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -105,13 +105,16 @@ bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDev // note that we don't check the validity of things we don't patch, all the instance and geometry data, but it will be checked by the driver anyway during creation/build if (preference==BuildPreference::Invalid) // unititialized or just wrong return false; + // potentially skip a lot of work and allocations + const auto& features = device->getEnabledFeatures(); + if (!features.accelerationStructure) + return false; // just make the flags agree/canonicalize allowCompaction = allowCompaction || compactAfterBuild; // don't make invalid, just soft fail const auto& limits = device->getPhysicalDevice()->getLimits(); 
if (allowDataAccess) // TODO: && !limits.rayTracingPositionFetch) allowDataAccess = false; - const auto& features = device->getEnabledFeatures(); if (hostBuild && !features.accelerationStructureHostCommands) hostBuild = false; return true; @@ -120,7 +123,7 @@ CAssetConverter::patch_impl_t::patch_impl_ { const auto flags = blas->getBuildFlags(); // straight up invalid - if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT | build_flags_t::PREFER_FAST_BUILD_BIT)) + if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) return; allowUpdate = flags.hasFlags(build_flags_t::ALLOW_UPDATE_BIT); @@ -134,14 +137,41 @@ CAssetConverter::patch_impl_t::patch_impl_ preference = BuildPreference::None; lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); } +auto CAssetConverter::patch_impl_t::getBuildFlags(const ICPUBottomLevelAccelerationStructure* blas) const -> core::bitflag +{ + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; + auto flags = blas->getBuildFlags()&(~OverridableMask); + if (lowMemory) + flags |= build_flags_t::LOW_MEMORY_BIT; + if (allowDataAccess) + flags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; + if (allowCompaction) + flags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (allowUpdate) + flags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + flags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + flags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + return flags; +} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* device) { return acceleration_structure_patch_base::valid(device); } -CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* blas) +CAssetConverter::patch_impl_t::patch_impl_t(const ICPUTopLevelAccelerationStructure* tlas) { - const auto flags = blas->getBuildFlags(); + const auto flags = tlas->getBuildFlags(); // straight up invalid + if (tlas->getInstances().empty()) + return; if (flags.hasFlags(build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT)) return; @@ -156,6 +186,29 @@ CAssetConverter::patch_impl_t::patch_impl_t(c preference = BuildPreference::None; lowMemory = flags.hasFlags(build_flags_t::LOW_MEMORY_BIT); } +auto CAssetConverter::patch_impl_t::getBuildFlags(const ICPUTopLevelAccelerationStructure* tlas) const -> core::bitflag +{ + constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; + auto flags = tlas->getBuildFlags()&(~OverridableMask); + if (lowMemory) + flags |= build_flags_t::LOW_MEMORY_BIT; + if (allowCompaction) + flags |= build_flags_t::ALLOW_COMPACTION_BIT; + if (allowUpdate) + flags |= build_flags_t::ALLOW_UPDATE_BIT; + switch (preference) + { + case acceleration_structure_patch_base::BuildPreference::FastTrace: + flags |= build_flags_t::PREFER_FAST_TRACE_BIT; + break; + case acceleration_structure_patch_base::BuildPreference::FastBuild: + flags |= build_flags_t::PREFER_FAST_BUILD_BIT; + break; + default: + break; + } + return flags; +} bool CAssetConverter::patch_impl_t::valid(const ILogicalDevice* 
device) { return acceleration_structure_patch_base::valid(device); @@ -380,18 +433,7 @@ class AssetVisitor : public CRTP } private: - inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) - { - const auto* dep = instance.asset->getUnderlyingBuffer(); - if (!dep) - return false; - CAssetConverter::patch_t patch = {dep}; - if (userPatch.utbo) - patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_UNIFORM_TEXEL_BUFFER_BIT; - if (userPatch.stbo) - patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_TEXEL_BUFFER_BIT; - return descend(dep,std::move(patch)); - } + // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto blasInstances = instance.asset->getInstances(); @@ -410,6 +452,18 @@ class AssetVisitor : public CRTP } return true; } + inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) + { + const auto* dep = instance.asset->getUnderlyingBuffer(); + if (!dep) + return false; + CAssetConverter::patch_t patch = {dep}; + if (userPatch.utbo) + patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_UNIFORM_TEXEL_BUFFER_BIT; + if (userPatch.stbo) + patch.usage |= IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_TEXEL_BUFFER_BIT; + return descend(dep,std::move(patch)); + } inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto& params = instance.asset->getCreationParameters(); @@ -1045,60 +1099,42 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; // overriden flags - using build_flags_t = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS; - constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT|build_flags_t::ALLOW_DATA_ACCESS_KHR; - auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); - if (lookup.patch->lowMemory) - patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; - if (lookup.patch->allowDataAccess) - patchedBuildFlags |= build_flags_t::ALLOW_DATA_ACCESS_KHR; - if (lookup.patch->allowCompaction) - patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; - if (lookup.patch->allowUpdate) - patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; - switch (lookup.patch->preference) - { - case acceleration_structure_patch_base::BuildPreference::FastTrace: - patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; - break; - case acceleration_structure_patch_base::BuildPreference::FastBuild: - patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; - break; - default: - break; - } - hasher << patchedBuildFlags; -// TODO: hash the geometry metadata thats not already in the dependents (c.f. 
Renderpass, Descriptor Set) + hasher << lookup.patch->getBuildFlags(lookup.asset); + // finally the contents +//TODO: hasher << lookup.asset->getContentHash(); return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) { + const auto* asset = lookup.asset; + // + AssetVisitor> visitor = { + *this, + {asset,static_cast(patchOverride)->uniqueCopyGroupID}, + *lookup.patch + }; + if (!visitor()) + return false; // extras from the patch hasher << lookup.patch->hostBuild; hasher << lookup.patch->compactAfterBuild; // overriden flags - using build_flags_t = ICPUTopLevelAccelerationStructure::BUILD_FLAGS; - constexpr build_flags_t OverridableMask = build_flags_t::LOW_MEMORY_BIT|build_flags_t::PREFER_FAST_TRACE_BIT|build_flags_t::PREFER_FAST_BUILD_BIT|build_flags_t::ALLOW_COMPACTION_BIT|build_flags_t::ALLOW_UPDATE_BIT; - auto patchedBuildFlags = lookup.asset->getBuildFlags()&(~OverridableMask); - if (lookup.patch->lowMemory) - patchedBuildFlags |= build_flags_t::LOW_MEMORY_BIT; - if (lookup.patch->allowCompaction) - patchedBuildFlags |= build_flags_t::ALLOW_COMPACTION_BIT; - if (lookup.patch->allowUpdate) - patchedBuildFlags |= build_flags_t::ALLOW_UPDATE_BIT; - switch (lookup.patch->preference) + hasher << lookup.patch->getBuildFlags(lookup.asset); + const auto instances = asset->getInstances(); + // important two passes to not give identical data due to variable length polymorphic array being hashed + for (const auto& instance : instances) + hasher << instance.getType(); + for (const auto& instance : instances) { - case acceleration_structure_patch_base::BuildPreference::FastTrace: - patchedBuildFlags |= build_flags_t::PREFER_FAST_TRACE_BIT; - break; - case acceleration_structure_patch_base::BuildPreference::FastBuild: - patchedBuildFlags |= build_flags_t::PREFER_FAST_BUILD_BIT; - break; - default: - break; + std::visit([&hasher](const auto& typedInstance)->void + { + using instance_t = std::decay_t; + // the BLAS pointers (the BLAS contents already get hashed via asset visitor and `getDependent`, its only the metadate we need to hash + hasher.update(&typedInstance,offsetof(instance_t,base)+offsetof(ICPUTopLevelAccelerationStructure::Instance,blas)); + }, + instance.instance + ); } - hasher << patchedBuildFlags; -// TODO: hash the instance metadata thats not already in the dependents (c.f. 
Renderpass, Descriptor Set) return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) @@ -1563,50 +1599,25 @@ class GetDependantVisitBase template class GetDependantVisit; -template<> -class GetDependantVisit : public GetDependantVisitBase -{ - public: - SBufferRange underlying = {}; - - protected: - bool descend_impl( - const instance_t& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch - ) - { - auto depObj = getDependant(dep,soloPatch); - if (!depObj) - return false; - underlying = { - .offset = user.asset->getOffsetInBuffer(), - .size = user.asset->getByteSize(), - .buffer = std::move(depObj) - }; - return underlying.isValid(); - } -}; template<> class GetDependantVisit : public GetDependantVisitBase { public: - SBufferRange underlying = {}; + // because of the deferred building of TLASes and lack of lifetime tracking between them, nothing to do on some passes + //core::smart_refctd_ptr* const outBLASes; protected: bool descend_impl( const instance_t& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, + const uint32_t instanceIndex // not the custom index, its literally just an ordinal in `getInstances()` ) { - auto depObj = getDependant(dep,soloPatch); + auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - underlying = { - .offset = user.asset->getOffsetInBuffer(), - .size = user.asset->getByteSize(), - .buffer = std::move(depObj) - }; - return underlying.isValid(); + // outBLASes[instanceIndex] = std::move(depObj); + return true; } }; template<> @@ -2040,6 +2051,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult case ICPUImageView::AssetType: visit.operator()(entry); break; + case ICPUTopLevelAccelerationStructure::AssetType: + visit.operator()(entry); + break; // these assets have no dependants, should have never been pushed on the stack default: assert(false); @@ -2155,12 +2169,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult using memory_backed_ptr_variant_t = std::variant*,asset_cached_t*>; core::map> allocationRequests; // for this we require that the data storage for the dfsCaches' nodes does not change - auto requestAllocation = [&inputs,device,&allocationRequests](asset_cached_t* pGpuObj)->bool + auto requestAllocation = [&inputs,device,&allocationRequests](asset_cached_t* pGpuObj, const uint32_t memoryTypeConstraint=~0u)->bool { auto* gpuObj = pGpuObj->get(); const IDeviceMemoryBacked::SDeviceMemoryRequirements& memReqs = gpuObj->getMemoryReqs(); // this shouldn't be possible - assert(memReqs.memoryTypeBits); + assert(memReqs.memoryTypeBits&memoryTypeConstraint); // allocate right away those that need their own allocation if (memReqs.requiresDedicatedAllocation) { @@ -2176,17 +2190,34 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // make the creation conditional upon allocation success MemoryRequirementBin reqBin = { - .compatibileMemoryTypeBits = memReqs.memoryTypeBits, + .compatibileMemoryTypeBits = memReqs.memoryTypeBits&memoryTypeConstraint, // we ignore this for now, because we can't know how many `DeviceMemory` objects we have left to make, so just join everything by default //.refersDedicatedAllocation = memReqs.prefersDedicatedAllocation }; if constexpr (std::is_same_v,IGPUBuffer>) - reqBin.needsDeviceAddress = 
gpuObj->getCreationParams().usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT); + { + const auto usage = gpuObj->getCreationParams().usage; + reqBin.needsDeviceAddress = usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT); + // stops us needing weird awful code to ensure buffer storing AS has alignment of at least 256 + assert(!usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT) || memReqs.alignmentLog2>=8); + } allocationRequests[reqBin].emplace_back(pGpuObj); } return true; }; + // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation + struct DeferredASCreationParams + { + asset_cached_t storage; + size_t scratchSize : 62 = 0; + size_t motionBlur : 1 = false; + size_t compactAfterBuild : 1 = false; + size_t inputSize = 0; + uint32_t maxInstanceCount = 0; + }; + core::vector accelerationStructureParams[2]; + // Deduplication, Creation and Propagation auto dedupCreateProp = [&]()->void { @@ -2206,7 +2237,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult contentHash = retval.getHashCache()->hash( {instance.asset,&created.patch}, &patchOverride, - /*.mistrustLevel = */1 + /*.mistrustLevel =*/ 1 ); // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) if (contentHash==CHashCache::NoContentHash) @@ -2356,6 +2387,133 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } + if constexpr (std::is_same_v || std::is_same_v) + { + using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; + const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); + const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); + + constexpr bool IsTLAS = std::is_same_v; + accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); + for (auto& entry : conversionRequests) + for (auto i=0ull; iusesMotion(); + // we will need to temporarily store the build input buffers somewhere + size_t inputSize = 0; + ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; + { + const auto buildFlags = patch.getBuildFlags(as); + if constexpr (IsTLAS) + { + AssetVisitor> visitor = { + {visitBase}, + {asset,uniqueCopyGroupID}, + patch + }; + if (!visitor()) + continue; + const auto instanceCount = as->getInstances().size(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); + inputSize = (motionBlur ? 
sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + } + else + { + const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); + // the code here is not pretty, but DRY-ing is of this is for later + if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + const auto geoms = as->getAABBGeometries(); + if (patch.hostBuild) + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + } + else + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; + } + } + else + { + core::map allocationsPerStride; + const auto geoms = as->getTriangleGeometries(); + if (patch.hostBuild) + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + } + else + { + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() + }; + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + { + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + allocationsPerStride[sizeof(uint16_t)] += triCount*3; + break; + case E_INDEX_TYPE::EIT_32BIT: + allocationsPerStride[sizeof(uint32_t)] += triCount*3; + break; + default: + break; + } + size_t bytesPerVertex = geom.vertexStride; + if (geom.vertexData[1]) + bytesPerVertex += bytesPerVertex; + allocationsPerStride[geom.vertexStride] += geom.maxVertex; + } + } + for (const auto& entry : allocationsPerStride) + inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; + } + } + } + if (!sizes) + continue; + // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure + IGPUBuffer::SCreationParams params = {}; + constexpr size_t MinASBufferAlignment = 256u; + params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); + params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + // concurrent ownership if any + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); + params.queueFamilyIndexCount = queueFamilies.size(); + params.queueFamilyIndices = queueFamilies.data(); + // we need to save the buffer in a side-channel for later + auto& out = accelerationStructureParams[IsTLAS][baseOffset+entry.second.firstCopyIx+i]; + out = { + .storage = device->createBuffer(std::move(params)), + .scratchSize = sizes.buildScratchSize, + .motionBlur = motionBlur, + .compactAfterBuild = patch.compactAfterBuild, + .inputSize = inputSize + 
}; + if (out.storage) + requestAllocation(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes); + } + } if constexpr (std::is_same_v) { for (auto& entry : conversionRequests) @@ -2792,9 +2950,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - auto& stagingCache = std::get>(retval.m_stagingCaches); - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures + if constexpr (!std::is_same_v && !std::is_same_v) + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void { + auto& stagingCache = std::get>(retval.m_stagingCaches); // already found in read cache and not converted if (created.gpuObj) return; @@ -2865,7 +3025,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const uint16_t recomputeMips = created.patch.recomputeMips; getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); } - // TODO: BLAS and TLAS requests +// TODO: BLAS and TLAS requests } ); }; @@ -2873,8 +3033,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. // If two Asset chains are independent then we order them from most catastrophic failure to least. dedupCreateProp.operator()(); + dedupCreateProp.operator()(); + dedupCreateProp.operator()(); dedupCreateProp.operator()(); -// TODO: add backing buffers (not assets) for BLAS and TLAS builds // Allocate Memory { auto getAsBase = [](const memory_backed_ptr_variant_t& var) -> const IDeviceMemoryBacked* @@ -3069,7 +3230,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto& req : reqBin.second) { const auto asBacked = getAsBase(req); - if (asBacked->getBoundMemory().isValid()) + if (asBacked->getBoundMemory().isValid()) // TODO: maybe change to an assert? Our `failures` should really kinda make sure we never continue; switch (req.index()) { @@ -3087,8 +3248,60 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } allocationRequests.clear(); } -// dedupCreateProp.operator()(); -// dedupCreateProp.operator()(); + // Deal with Deferred Creation of Acceleration structures + { + for (auto asLevel=0; asLevel<2; asLevel++) + { + // each of these stages must have a barrier inbetween + size_t scratchSizeFullParallelBuild = 0; + size_t scratchSizeFullParallelCompact = 0; + // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created + for (const auto& deferredParams : accelerationStructureParams[asLevel]) + { + // buffer failed to create/allocate + if (!deferredParams.storage.get()) + continue; + IGPUAccelerationStructure::SCreationParams baseParams; + { + auto* buf = deferredParams.storage.get(); + const auto bufSz = buf->getSize(); + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)}, + .flags = deferredParams.motionBlur ? 
create_f::MOTION_BIT:create_f::NONE + }; + } + smart_refctd_ptr as; + if (asLevel) + { + as = device->createBottomLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + } + else + { + as = device->createTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + } + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize; + // sizes for building 1-by-1 vs parallel, note that + retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize); + scratchSizeFullParallelBuild += buildSize; + if (deferredParams.compactAfterBuild) + scratchSizeFullParallelCompact += deferredParams.scratchSize; + // triangles, AABBs or Instance Transforms will need to be supplied from VRAM + // TODO: also mark somehow that we'll need a BUILD INPUT READ ONLY BUFFER WITH XFER usage + if (deferredParams.inputSize) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + } + // + retval.m_maxASBuildScratchSize = core::max(core::max(scratchSizeFullParallelBuild,scratchSizeFullParallelCompact),retval.m_maxASBuildScratchSize); + } + // + if (retval.m_minASBuildScratchSize) + { + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + retval.m_maxASBuildScratchSize = core::max(core::max(scratchSizeFullParallelBLASBuild,scratchSizeFullParallelBLASCompact),core::max(scratchSizeFullParallelTLASBuild,scratchSizeFullParallelTLASCompact)); + } + } dedupCreateProp.operator()(); dedupCreateProp.operator()(); dedupCreateProp.operator()(); From 11a141caef7c28c072004ed18df53d8f031def13 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 21:22:59 +0100 Subject: [PATCH 08/11] realize that scratches need to be provided separately for Device and Host builds --- include/nbl/video/utilities/CAssetConverter.h | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 57ddec91a9..069b049f2a 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -923,8 +923,9 @@ class CAssetConverter : public core::IReferenceCounted // optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore std::span extraSignalSemaphores = {}; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build - CAsyncSingleBufferSubAllocatorST<>* scratchForASBuild = nullptr; - // + CAsyncSingleBufferSubAllocatorST* scratchForDeviceASBuild = nullptr; + std::pmr::memory_resource* scratchForHostASBuild = nullptr; + // needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds IDeviceMemoryAllocator* compactedASAllocator = nullptr; // specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set uint32_t sampledImageBindingCount = 1<<10; @@ -950,12 +951,16 @@ class CAssetConverter : public core::IReferenceCounted // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 inline core::bitflag getRequiredQueueFlags() const {return m_queueFlags;} - // just enough memory to build the Acceleration Structures one by one waiting for each 
build to complete inbetween - inline uint64_t getMinASBuildScratchSize() const {return m_minASBuildScratchSize;} - // enough memory to build and compact the all Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) - inline uint64_t getMaxASBuildScratchSize() const {return m_maxASBuildScratchSize;} - // if returns NONE means there are no acceleration structures to build + // This is just enough memory to build the Acceleration Structures one by one waiting for each Device Build to complete inbetween. If 0 there are no Device AS Builds or Compactions to perform. + inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const {return m_minASBuildScratchSize[forHostOps];} + // Enough memory to build and compact all the Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact) + inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const {return m_maxASBuildScratchSize[forHostOps];} + // What usage flags your scratch buffer must have, if returns NONE means are no Device AS Builds to perform. inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;} + // tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild` + inline bool willHostASBuild() const {return m_willHostBuildSomeAS;} + // tells you if you need to provide a valid `SConvertParams::compactedASAllocator` + inline bool willCompactAS() const {return m_willHostBuildSomeAS;} // inline operator bool() const {return bool(m_converter);} @@ -1028,6 +1033,7 @@ class CAssetConverter : public core::IReferenceCounted // struct ASBuildParams { + // TODO: buildFlags uint8_t host : 1; uint8_t compact : 1; } asBuildParams; @@ -1044,9 +1050,12 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_conversionRequests; // - uint64_t m_minASBuildScratchSize = 0; - uint64_t m_maxASBuildScratchSize = 0; + uint64_t m_minASBuildScratchSize[2] = {0,0}; + uint64_t m_maxASBuildScratchSize[2] = {0,0}; core::bitflag m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE; + uint8_t m_willHostBuildSomeAS : 1 = false; + uint8_t m_willCompactSomeAS : 1 = false; + // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; }; From 667522460ca80ae47b9f26cf2ed8a471dc583462 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 21 Nov 2024 21:24:42 +0100 Subject: [PATCH 09/11] starting writing the building code, realize we need to bucket the Device and Host build requests separately --- src/nbl/video/utilities/CAssetConverter.cpp | 128 ++++++++++++++++++-- 1 file changed, 119 insertions(+), 9 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 8c6ee875aa..b313ed7d24 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3377,7 +3377,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { if (reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT) && (!params.utilities || params.utilities->getLogicalDevice()!=device)) { - logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!", system::ILogger::ELL_ERROR); + logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!",system::ILogger::ELL_ERROR); return retval; } @@ -3406,6 +3406,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // If the transfer queue will be used, the transfer 
Intended Submit Info must be valid and utilities must be provided auto reqTransferQueueCaps = IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Depth/Stencil transfers need Graphics Capabilities, so make sure the queue chosen for transfers also has them! if (reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::GRAPHICS_BIT)) reqTransferQueueCaps |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; if (invalidIntended(reqTransferQueueCaps,params.transfer)) @@ -3428,7 +3429,52 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + // check things necessary for building Acceleration Structures + using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; + if (reservations.m_ASBuildScratchUsages!=buffer_usage_f::EUF_NONE) + { + if (!params.scratchForDeviceASBuild) + { + logger.log("An Acceleration Structure will be built on Device but no scratch allocator provided!",system::ILogger::ELL_ERROR); + return retval; + } + // TODO: do the build input buffers also need `EUF_STORAGE_BUFFER_BIT` ? + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; + // we may use the staging buffer directly to skip an extra copy on small enough geometries + if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); + return retval; + } + constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; + // we use the scratch allocator both for scratch and uploaded geometry data + if (!params.scratchForDeviceASBuild->getBuffer()->getCreationParams().usage.hasFlags(asBuildScratchFlags|asBuildInputFlags)) + { + logger.log("An Acceleration Structure will be built on Device but scratch buffer doesn't have required usage flags!",system::ILogger::ELL_ERROR); + return retval; + } + const auto& addrAlloc = params.scratchForDeviceASBuild->getAddressAllocator(); + // could have used an address allocator trait to work this out, same verbosity + if (addrAlloc.get_allocated_size()+addrAlloc.get_free_size()(const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* { auto& stagingCache = std::get>(reservations.m_stagingCaches); @@ -3547,9 +3593,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto computeFamily = shouldDoSomeCompute ? params.compute->queue->getFamilyIndex():IQueue::FamilyIgnored; // whenever transfer needs to do a submit overflow because it ran out of memory for streaming an image, we can already submit the recorded mip-map compute shader dispatches auto computeCmdBuf = shouldDoSomeCompute ? 
params.compute->getCommandBufferForRecording():nullptr; - auto drainCompute = [¶ms,shouldDoSomeCompute,&computeCmdBuf](const std::span extraSignal={})->auto + auto drainCompute = [¶ms,&computeCmdBuf](const std::span extraSignal={})->auto { - if (!shouldDoSomeCompute || computeCmdBuf->cmdbuf->empty()) + if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty()) return IQueue::RESULT::SUCCESS; // before we overflow submit we need to inject extra wait semaphores auto& waitSemaphoreSpan = params.compute->waitSemaphores; @@ -3568,6 +3614,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal); if (res!=IQueue::RESULT::SUCCESS) return res; + // set to empty so we don't grow over and over again + waitSemaphoreSpan = {}; if (!params.compute->beginNextCommandBuffer(computeCmdBuf)) return IQueue::RESULT::OTHER_ERROR; return res; @@ -4039,7 +4087,65 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul imagesToUpload.clear(); } - // TODO: build BLASes and TLASes + // BLAS builds + auto& blasToBuild = std::get>(reservations.m_conversionRequests); + if (const auto blasCount = blasToBuild.size(); blasCount) + { + constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + + core::vector buildInfos; buildInfos.reserve(blasCount); + core::vector rangeInfo; rangeInfo.reserve(blasCount); + core::vector> triangles; + core::vector> aabbs; + { + size_t totalTriGeoCount = 0; + size_t totalAABBGeoCount = 0; + for (auto& item : blasToBuild) + { + const size_t geoCount = item.canonical->getGeometryCount(); + if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) + totalAABBGeoCount += geoCount; + else + totalTriGeoCount += geoCount; + } + triangles.reserve(totalTriGeoCount); + triangles.reserve(totalAABBGeoCount); + } + for (auto& item : blasToBuild) + { + auto* as = item.gpuObj; + auto pFoundHash = findInStaging.operator()(as); + if (item.asBuildParams.host) + { + auto dOp = device->createDeferredOperation(); + // + if (!device->buildAccelerationStructure(dOp.get(),info,range)) + { + markFailureInStaging(gpuObj,pFoundHash); + continue; + } + } + else + { + auto& buildInfo = buildInfo.emplace_back({ + .buildFlags = item.buildFlags, + .geometryCount = item.canonical->getGeometryCount(), + // this is not an update + .srcAS = nullptr, + .dstAS = as.get() + }); + if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) + buildInfo.aabbs = nullptr; + else + buildInfo.triangles = nullptr; + computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); + } + } + } + + // TLAS builds + auto& tlasToBuild = std::get>(reservations.m_conversionRequests); + if (!tlasToBuild.empty()) { } @@ -4100,6 +4206,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set bool depsMissing = false; // only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) + if constexpr (std::is_same_v) + { + // there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job + } if constexpr (std::is_same_v) depsMissing = missingDependent.operator()(item.first->getUnderlyingBuffer()); if constexpr (std::is_same_v) @@ -4141,8 +4251,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - _NBL_TODO(); - [[fallthrough]]; + depsMissing = missingDependent.operator()(static_cast(untypedDesc)); + break; default: assert(false); depsMissing = true; @@ -4170,8 +4280,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // again, need to go bottom up so we can check dependencies being successes mergeCache.operator()(); mergeCache.operator()(); -// mergeCache.operator()(); -// mergeCache.operator()(); + mergeCache.operator()(); + mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); From 8f43fef5a2daf1640ed0b8328d2e2f76a6741d40 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 22 Nov 2024 13:36:20 +0100 Subject: [PATCH 10/11] Figure out the TLAS/BLAS compaction logic and swap in cache. Also update comments about what ends up in `m_gpuObjects` --- include/nbl/asset/IDescriptorSetLayout.h | 10 +++ include/nbl/video/utilities/CAssetConverter.h | 49 ++++++----- src/nbl/video/utilities/CAssetConverter.cpp | 84 +++++++++++++++---- 3 files changed, 105 insertions(+), 38 deletions(-) diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 34e7d2ccc4..5437a09330 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -147,6 +147,16 @@ class IDescriptorSetLayoutBase : public virtual core::IReferenceCounted // TODO: return getStorageOffset(index); } + // Weird functions for exceptional situations + inline storage_range_index_t findBindingStorageIndex(const storage_offset_t offset) const + { + const auto found = std::upper_bound(m_storageOffsets, m_storageOffsets+m_count, offset, [](storage_range_index_t a, storage_range_index_t b) -> bool {return a.data < b.data; }); + const auto ix = m_storageOffsets - found; + if (ix>=m_count) + return {}; + return storage_range_index_t(ix); + } + inline uint32_t getTotalCount() const { return (m_count == 0ull) ? 0u : m_storageOffsets[m_count - 1].data; } private: diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 069b049f2a..8789c63079 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -965,7 +965,8 @@ class CAssetConverter : public core::IReferenceCounted // inline operator bool() const {return bool(m_converter);} - // until `convert` is called, this will only contain valid entries for items already found in `SInput::readCache` + // Until `convert` is called, the Buffers and Images are not filled with content and Acceleration Structures are not built, unless found in the `SInput::readCache` + // WARNING: The Acceleration Structure Pointer WILL CHANGE after calling `convert` if its patch dictates that it will be compacted! 
(since AS can't resize) // TODO: we could also return per-object semaphore values when object is ready for use (would have to propagate two semaphores up through dependants) template std::span> getGPUObjects() const {return std::get>(m_gpuObjects);} @@ -1020,34 +1021,36 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_stagingCaches; // need a more explicit list of GPU objects that need device-assisted conversion template - struct ConversionRequest + struct SConversionRequestBase { // canonical asset (the one that provides content) core::smart_refctd_ptr canonical; // gpu object to transfer canonical's data to or build it from asset_traits::video_t* gpuObj; - union - { - // only relevant for images - uint16_t recomputeMips = 0; - // - struct ASBuildParams - { - // TODO: buildFlags - uint8_t host : 1; - uint8_t compact : 1; - } asBuildParams; - }; }; - template - using conversion_requests_t = core::vector>; - using convertible_asset_types = core::type_list< - asset::ICPUBuffer, - asset::ICPUImage, - asset::ICPUBottomLevelAccelerationStructure, - asset::ICPUTopLevelAccelerationStructure - >; - core::tuple_transform_t m_conversionRequests; + using SConvReqBuffer = SConversionRequestBase; + core::vector m_bufferConversions; + struct SConvReqImage : SConversionRequestBase + { + bool recomputeMips = 0; + }; + core::vector m_imageConversions; + template// requires std::is_base_of_v + struct SConvReqAccelerationStructure : SConversionRequestBase + { + constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; + inline bool compact() const {return compactedASWriteOffset!=WontCompact;} + + using build_f = typename CPUAccelerationStructure::BUILD_FLAGS; + inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} + inline build_f getBuildFlags() const {return static_cast(buildFlags);} + + + uint64_t compactedASWriteOffset : 48 = WontCompact; + uint64_t buildFlags : 16 = static_cast(build_f::NONE); + }; + core::vector> m_blasConversions[2]; + core::vector> m_tlasConversions[2]; // uint64_t m_minASBuildScratchSize[2] = {0,0}; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b313ed7d24..dcf27099a5 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3016,14 +3016,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult return; } } - // this is super annoying, was hoping metaprogramming with `has_type` would actually work - auto getConversionRequests = [&]()->auto&{return std::get>(retval.m_conversionRequests);}; + // if constexpr (std::is_same_v) - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; + retval.m_bufferConversions.emplace_back({core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); if constexpr (std::is_same_v) { const uint16_t recomputeMips = created.patch.recomputeMips; - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); + retval.m_imageConversions.emplace_back({core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); } // TODO: BLAS and TLAS requests } @@ -3337,7 +3336,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } const auto& found = dfsCache.nodes[metadata[i].patchIndex.value]; // write it out to the results - if (const auto& gpuObj=found.gpuObj; gpuObj) // found from the `input.readCache` + if 
(const auto& gpuObj=found.gpuObj; gpuObj) { results[i] = gpuObj; // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` @@ -3372,6 +3371,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto device = m_params.device; const auto reqQueueFlags = reservations.getRequiredQueueFlags(); + // compacted TLASes need to be substituted in cache and Descriptor Sets + core::unordered_map> compactedTLASMap; // Anything to do? if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { @@ -3536,7 +3537,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // upload Buffers - auto& buffersToUpload = std::get>(reservations.m_conversionRequests); + auto& buffersToUpload = reservations.m_bufferConversions; { core::vector> ownershipTransfers; ownershipTransfers.reserve(buffersToUpload.size()); @@ -3630,7 +3631,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul origXferStallCallback(tillScratchResettable); }; - auto& imagesToUpload = std::get>(reservations.m_conversionRequests); + auto& imagesToUpload = reservations.m_imageConversions; if (!imagesToUpload.empty()) { // @@ -4088,7 +4089,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } // BLAS builds - auto& blasToBuild = std::get>(reservations.m_conversionRequests); + core::unordered_map> compactedBLASMap; + auto& blasToBuild = reservations.m_blasConversions[0]; if (const auto blasCount = blasToBuild.size(); blasCount) { constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; @@ -4111,6 +4113,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul triangles.reserve(totalTriGeoCount); triangles.reserve(totalAABBGeoCount); } +#if 0 for (auto& item : blasToBuild) { auto* as = item.gpuObj; @@ -4141,13 +4144,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); } } +#endif } // TLAS builds - auto& tlasToBuild = std::get>(reservations.m_conversionRequests); + auto& tlasToBuild = reservations.m_tlasConversions[0]; if (!tlasToBuild.empty()) { } + compactedBLASMap.clear(); const bool computeSubmitIsNeeded = submitsNeeded.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); // first submit transfer @@ -4183,6 +4188,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted + core::vector tlasRewrites; tlasRewrites.reserve(compactedTLASMap.size()); + core::vector tlasInfos; tlasInfos.reserve(compactedTLASMap.size()); // want to check if deps successfully exist auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool { @@ -4200,13 +4208,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& cache = std::get>(m_caches); cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + constexpr bool IsTLAS = std::is_same_v; for (auto& item : stagingCache) if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped { // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set bool depsMissing = false; // only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) - if constexpr (std::is_same_v) + if constexpr (IsTLAS) { // there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job } @@ -4225,6 +4234,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (samplers[i]) depsMissing = missingDependent.operator()(samplers[i].get()); } + const auto tlasRewriteOldSize = tlasRewrites.size(); for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) { const auto type = static_cast(i); @@ -4251,8 +4261,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - depsMissing = missingDependent.operator()(static_cast(untypedDesc)); + { + const auto* tlas = static_cast(untypedDesc); + depsMissing = missingDependent.operator()(tlas); + if (!depsMissing) + { + auto found = compactedTLASMap.find(tlas); + if (found==compactedTLASMap.end()) + break; + // written TLAS got compacted, so queue the descriptor for update + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; + tlasRewrites.push_back({ + .set = item.first, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = i-firstElementOffset, + .count = 1, // write them one by one, no point optimizing + .info = nullptr // for now + }); + tlasInfos.emplace_back().desc = smart_refctd_ptr(found->second); + } break; + } default: assert(false); depsMissing = true; @@ -4260,21 +4292,34 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } } + // don't bother overwriting a Descriptor Set that won't be marked as successfully converted (inserted into write cache) + if (depsMissing) + { + tlasRewrites.resize(tlasRewriteOldSize); + tlasInfos.resize(tlasRewriteOldSize); + } } + auto* pGpuObj = item.first; if (depsMissing) { const auto* hashAsU64 = reinterpret_cast(item.second.value.data); - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, item.first->getObjectDebugName()); + logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, pGpuObj->getObjectDebugName()); // wipe self, to let users know item.second.value = {}; continue; } - if (!params.writeCache(item.second)) + if (!params.writeCache(item.second)) // TODO: let the user know the pointer too? 
continue; + if constexpr (IsTLAS) + { + auto found = compactedTLASMap.find(pGpuObj); + if (found!=compactedTLASMap.end()) + pGpuObj = found->second.get(); + } asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(item.first); + cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); + cache.m_reverseMap.emplace(pGpuObj,item.second); cache.m_forwardMap.emplace(item.second,std::move(cached)); - cache.m_reverseMap.emplace(item.first,item.second); } }; // again, need to go bottom up so we can check dependencies being successes @@ -4293,6 +4338,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.operator()(); mergeCache.operator()(); mergeCache.operator()(); + // deal with rewriting the TLASes with compacted ones + { + compactedTLASMap.clear(); + auto* infoIt = tlasInfos.data(); + for (auto& write : tlasRewrites) + write.info = infoIt++; + if (!tlasRewrites.empty()) + device->updateDescriptorSets(tlasRewrites,{}); + } // mergeCache.operator()(); // no submit was necessary, so should signal the extra semaphores from the host From a4307f4c6034bb31289ed92f10b2aa5bc7bef860 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 25 Nov 2024 17:13:48 +0100 Subject: [PATCH 11/11] fix a nasty possible threading bug with IDeferredOperation and a bunch of typos --- include/nbl/asset/ICPUAccelerationStructure.h | 4 ++-- include/nbl/asset/IDescriptorSetLayout.h | 4 ++-- include/nbl/video/IDeferredOperation.h | 4 ++++ include/nbl/video/ILogicalDevice.h | 2 ++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index dcf2975b78..1c83afc783 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -90,13 +90,13 @@ class ICPUBottomLevelAccelerationStructure final : public IBottomLevelAccelerati inline std::span> getAABBGeometries() { if (!isMutable() || !m_AABBGeoms) - return {nullptr,nullptr}; + return {}; return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline std::span> getAABBGeometries() const { if (!m_AABBGeoms) - return {nullptr,nullptr}; + return {}; return {m_AABBGeoms->data(),m_AABBGeoms->size()}; } inline bool setGeometries(core::smart_refctd_dynamic_array>&& geometries, core::smart_refctd_dynamic_array&& ranges) diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 5437a09330..803253e8b3 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -150,8 +150,8 @@ class IDescriptorSetLayoutBase : public virtual core::IReferenceCounted // TODO: // Weird functions for exceptional situations inline storage_range_index_t findBindingStorageIndex(const storage_offset_t offset) const { - const auto found = std::upper_bound(m_storageOffsets, m_storageOffsets+m_count, offset, [](storage_range_index_t a, storage_range_index_t b) -> bool {return a.data < b.data; }); - const auto ix = m_storageOffsets - found; + const auto found = std::upper_bound(m_storageOffsets,m_storageOffsets+m_count,offset,[](storage_offset_t a, storage_offset_t b)->bool{return a.data=m_count) return {}; return storage_range_index_t(ix); diff --git a/include/nbl/video/IDeferredOperation.h b/include/nbl/video/IDeferredOperation.h index 0f85048f75..44f4d4221d 100644 --- a/include/nbl/video/IDeferredOperation.h +++ b/include/nbl/video/IDeferredOperation.h @@ -30,7 +30,10 @@ class IDeferredOperation : public IBackendObject { const auto retval = 
execute_impl();
 		if (retval==STATUS::COMPLETED || retval==STATUS::_ERROR)
+		{
+			std::lock_guard lock(vectorLock);
 			m_resourceTracking.clear();
+		}
 		return retval;
 	}
@@ -66,6 +69,7 @@ class IDeferredOperation : public IBackendObject
 	private:
 		friend class ILogicalDevice;
 		// when we improve allocators, etc. we'll stop using STL containers here
+		std::mutex vectorLock;
 		core::vector> m_resourceTracking;
 };

diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h
index 39325bd6dd..ee7d31bdba 100644
--- a/include/nbl/video/ILogicalDevice.h
+++ b/include/nbl/video/ILogicalDevice.h
@@ -46,6 +46,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
 
 		//! Basic getters
+		inline system::ILogger* getLogger() const {return m_logger.get();}
+
 		inline const IPhysicalDevice* getPhysicalDevice() const { return m_physicalDevice; }
 
 		inline const SPhysicalDeviceFeatures& getEnabledFeatures() const { return m_enabledFeatures; }
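
For context on how the split Device/Host scratch queries introduced in PATCH 08 are meant to be consumed, here is a minimal caller-side sketch. It assumes `reservations` is the SReserveResult returned by CAssetConverter::reserve; the budget constant and the buffer-creation step are illustrative application code, not part of the converter.

    // sizing the Device build scratch; pass `true` instead for Host build scratch
    const bool forHostOps = false;
    const uint64_t minScratch = reservations.getMinASBuildScratchSize(forHostOps);
    if (minScratch)
    {
        const uint64_t budget = 64ull<<20; // example VRAM budget for scratch
        const uint64_t maxScratch = reservations.getMaxASBuildScratchSize(forHostOps);
        // never below the minimum for a one-by-one build, no point above the build-everything-at-once worst case
        const uint64_t scratchSize = std::min(std::max(budget,minScratch),maxScratch);
        // the buffer backing SConvertParams::scratchForDeviceASBuild must be created with these usages
        const auto usages = reservations.getASBuildScratchUsages();
        // ... create the buffer of `scratchSize` bytes and the suballocator wrapping it ...
    }
    // these tell the caller which optional SConvertParams members must be valid
    const bool needHostScratch = reservations.willHostASBuild();   // -> scratchForHostASBuild
    const bool needCompactedAlloc = reservations.willCompactAS();  // -> compactedASAllocator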
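
The usage-flag checks added to convert_impl in PATCH 09 imply that the device scratch doubles as upload storage for geometry, so it needs the union of the scratch and build-input usages. A sketch of creating such a buffer, assuming the usual ILogicalDevice::createBuffer(SCreationParams&&) entry point; the exact field names should be double-checked against the headers.

    using usage_f = IGPUBuffer::E_USAGE_FLAGS;
    // exactly what the new checks in convert_impl test for
    const core::bitflag<usage_f> asBuildInputFlags =
        usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|usage_f::EUF_TRANSFER_DST_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT;
    const core::bitflag<usage_f> asBuildScratchFlags = usage_f::EUF_STORAGE_BUFFER_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT;

    IGPUBuffer::SCreationParams creationParams = {};
    creationParams.size = scratchSize;
    // the scratch is also used to stage build inputs, hence the union of both flag sets
    creationParams.usage = asBuildScratchFlags|asBuildInputFlags;
    auto scratchBuffer = device->createBuffer(std::move(creationParams)); // signature assumed, verify against ILogicalDevice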
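
The BLAS loop sketched in PATCH 09 records what amounts to one vkCmdBuildAccelerationStructuresKHR call per batch. For reference, this is the raw Vulkan shape of a single triangle-geometry BLAS build; `blas`, `cmdbuf`, the device addresses and counts are placeholders, and this is the Vulkan-level equivalent rather than the Nbl wrapper API.

    VkAccelerationStructureGeometryKHR geometry = {
        .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,
        .geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR,
        .geometry = {.triangles = {
            .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR,
            .vertexFormat = VK_FORMAT_R32G32B32_SFLOAT,
            .vertexData = {.deviceAddress = vertexBufferAddress}, // placeholder
            .vertexStride = sizeof(float)*3,
            .maxVertex = vertexCount-1,
            .indexType = VK_INDEX_TYPE_UINT32,
            .indexData = {.deviceAddress = indexBufferAddress}, // placeholder
        }},
    };
    VkAccelerationStructureBuildGeometryInfoKHR buildInfo = {
        .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR,
        .type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR,
        .flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR,
        .mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR,
        .srcAccelerationStructure = VK_NULL_HANDLE, // not an update
        .dstAccelerationStructure = blas, // placeholder
        .geometryCount = 1,
        .pGeometries = &geometry,
        .scratchData = {.deviceAddress = scratchAddress}, // placeholder
    };
    const VkAccelerationStructureBuildRangeInfoKHR rangeInfo = {.primitiveCount = triangleCount};
    const VkAccelerationStructureBuildRangeInfoKHR* pRangeInfos = &rangeInfo;
    vkCmdBuildAccelerationStructuresKHR(cmdbuf, 1, &buildInfo, &pRangeInfos);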
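
The compaction bookkeeping introduced in PATCH 10 (compactedASWriteOffset, compactedBLASMap/compactedTLASMap and the cache swap) prepares for the standard Vulkan compaction sequence: query the compacted size after the build, create a smaller acceleration structure, then copy with the COMPACT mode. A bare-bones sketch at the Vulkan level, with query-pool reset, barriers and submission omitted; all handles are placeholders.

    // 1) after building `builtAS`, with a barrier making the build visible to the query:
    vkCmdWriteAccelerationStructuresPropertiesKHR(cmdbuf, 1, &builtAS,
        VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR, queryPool, 0);
    // 2) submit, wait, then read the result on the host:
    VkDeviceSize compactedSize = 0;
    vkGetQueryPoolResults(device, queryPool, 0, 1, sizeof(compactedSize), &compactedSize,
        sizeof(compactedSize), VK_QUERY_RESULT_WAIT_BIT|VK_QUERY_RESULT_64_BIT);
    // 3) create a new acceleration structure of `compactedSize` bytes, then copy-compact into it:
    const VkCopyAccelerationStructureInfoKHR copyInfo = {
        .sType = VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,
        .src = builtAS,
        .dst = compactedAS, // placeholder for the smaller AS
        .mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR,
    };
    vkCmdCopyAccelerationStructureKHR(cmdbuf, &copyInfo);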
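
findBindingStorageIndex maps a flattened descriptor storage offset back to the binding whose range contains it; it is an upper_bound over the cumulative per-binding offsets, and the subtraction order fixed in PATCH 11 is what recovers the binding index. The same lookup on plain arrays, as a standalone illustration with made-up counts:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // cumulative end-offsets of 3 bindings with 4, 1 and 6 descriptors respectively
        const uint32_t storageOffsets[] = {4,5,11};
        const uint32_t offset = 7; // flat index of the descriptor we want to locate

        // first binding whose cumulative end-offset is greater than `offset`
        const auto* found = std::upper_bound(std::begin(storageOffsets),std::end(storageOffsets),offset);
        const auto bindingIx = found-std::begin(storageOffsets); // note the subtraction order
        const uint32_t firstElementOffset = bindingIx ? storageOffsets[bindingIx-1]:0u;
        std::printf("binding %lld, array element %u\n",static_cast<long long>(bindingIx),offset-firstElementOffset);
        return 0;
    }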
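
The IDeferredOperation race fixed in PATCH 11 comes from several threads polling the same deferred operation and racing on m_resourceTracking.clear(); the fix serializes the clear with a mutex. The same pattern in isolation, using only the standard library; class and member names here are illustrative, not the engine's.

    #include <memory>
    #include <mutex>
    #include <vector>

    class DeferredOperation
    {
        public:
            // multiple threads may poll this concurrently
            bool poll()
            {
                const bool complete = pollImpl();
                if (complete)
                {
                    // without the lock, two threads finishing at the same time would race on the vector
                    std::lock_guard<std::mutex> lock(m_trackingLock);
                    m_resourceTracking.clear();
                }
                return complete;
            }

        private:
            bool pollImpl() {return true;} // stand-in for the driver-side join/poll call
            std::mutex m_trackingLock;
            std::vector<std::shared_ptr<void>> m_resourceTracking;
    };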