From 9ddf496c1691e27a3ed8cb8687253493cd715ee5 Mon Sep 17 00:00:00 2001
From: m-schuetz <markus_schuetz@gmx.at>
Date: Sun, 7 Apr 2024 16:12:36 +0200
Subject: [PATCH] update to CUDA 12.4

---
 .gitignore                                    |   6 +-
 CMakeLists.txt                                |   3 +
 README.md                                     |  10 +-
 include/CudaModularProgram.h                  | 192 +++++++++---------
 .../progressive_octree/HostDeviceInterface.h  |   5 +
 .../main_progressive_octree.cpp               | 139 ++++++++++---
 .../progressive_octree_voxels.cu              |  10 +-
 modules/progressive_octree/utils.h.cu         |  10 +-
 8 files changed, 240 insertions(+), 135 deletions(-)
diff --git a/.gitignore b/.gitignore
index 580a47b..17dd4c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,4 +20,8 @@ build/laszip/laszip.dir/
 build/laszip/Release/
 build/laszip/Debug/
 build/Release/laszip.dll
-build/
\ No newline at end of file
+build/
+.generated_cuda_programs/progressive_octree_voxels.cu__utils.cu/cubin
+.generated_cuda_programs/render.cu__utils.cu/cubin
+.generated_cuda_programs/reset.cu__utils.cu/cubin
+build_old/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 725fe10..0d42f4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,9 @@ target_link_libraries(${PROJECT_NAME} laszip)
 target_include_directories(${PROJECT_NAME} PRIVATE
 	libs/laszip)
 
+## TODO: disable copying of cuda source files
+## - developer builds should use the project root dir as the working directory and directly load from ./modules
+## - public release builds should be shipped with the precompiled cuda programs (cached in temp/cuda_programs)
 # Post-Build
 if (SimLOD_CreateCUDASymlinks)
 	execute_process(
diff --git a/README.md b/README.md
index 3ca6555..5147c1c 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ During rendering, a CUDA kernel first computes a list of visible octree nodes. E
 
 ### Install Dependencies
 
-* CUDA Toolkit 11.8
+* CUDA Toolkit 12.4
 
 ### Build & Run
 
@@ -144,9 +144,11 @@ The <a href="https://doi.org/10.5069/G9CN71V5">full San Simeon data set (18 bill
 @article{SimLOD,
     title =      "SimLOD: Simultaneous LOD Generation and Rendering",
     author =     "Markus Schütz and Lukas Herzberger and Michael Wimmer",
-    year =       "2023",
-    month =      oct,
-    journal =    "Arxiv",
+    year =       "2024",
+    month =      may,
+    journal =    "Proceedings of the ACM in Computer Graphics and Interactive Techniques",
+	volume =     "7",
+	note =       "Source Code: https://github.com/m-schuetz/SimLOD",
     keywords =   "point-based rendering",
 }
 </pre>
diff --git a/include/CudaModularProgram.h b/include/CudaModularProgram.h
index 2217db9..d19aa0b 100644
--- a/include/CudaModularProgram.h
+++ b/include/CudaModularProgram.h
@@ -2,9 +2,9 @@
 
 #include <string>
 #include <unordered_map>
-
 #include "unsuck.hpp"
 #include "nvrtc.h"
+#include <nvJitLink.h>
 #include <cmath>
 #include "cuda.h"
 
@@ -12,6 +12,26 @@ using std::string;
 
 using namespace std;
 
+#define NVJITLINK_SAFE_CALL(h,x)                                  \
+do {                                                              \
+   nvJitLinkResult result = x;                                    \
+   if (result != NVJITLINK_SUCCESS) {                             \
+      std::cerr << "\nerror: " #x " failed with error "           \
+                << result << '\n';                                \
+      size_t lsize;                                               \
+      result = nvJitLinkGetErrorLogSize(h, &lsize);               \
+      if (result == NVJITLINK_SUCCESS && lsize > 0) {             \
+         char *log = (char*)malloc(lsize);                        \
+         result = nvJitLinkGetErrorLog(h, log);                   \
+         if (result == NVJITLINK_SUCCESS) {                       \
+            std::cerr << "error: " << log << '\n';                \
+            free(log);                                            \
+         }                                                        \
+      }                                                           \
+      exit(1);                                                    \
+   }                                                              \
+} while(0)
+
 struct CudaModule{
 
 	void cu_checked(CUresult result){
@@ -24,11 +44,15 @@ struct CudaModule{
 	string name = "";
 	bool compiled = false;
 	bool success = false;
+	
 	size_t ptxSize = 0;
 	char* ptx = nullptr;
 
-	size_t nvvmSize;
-	char *nvvm = nullptr;
+	size_t ltoirSize = 0;
+	char* ltoir = nullptr;
+
+	//size_t nvvmSize;
+	//char *nvvm = nullptr;
 
 	CudaModule(string path, string name){
 		this->path = path;
@@ -38,52 +62,48 @@ struct CudaModule{
 	void compile(){
 		auto tStart = now();
 
-		// cout << "================================================================================" << endl;
-		// cout << "=== COMPILING: " << fs::path(path).filename().string() << endl;
-		// cout << "================================================================================" << endl;
-		printfmt("compiling {} ", fs::path(path).filename().string());
-
-		const char* cuda_path_cstr = std::getenv("CUDA_PATH");
-		std::string cuda_path;
-		if (cuda_path_cstr) {
-			cuda_path = std::string(cuda_path_cstr);
-			if (!cuda_path.empty()) {
-				std::cout << "\nCUDA_PATH is set to: " << cuda_path << std::endl;
-			} else {
-				std::cout << "\nCUDA_PATH is empty. Please set it." << std::endl;
-				exit(-1);
-			}
-		} else {
-			std::cout << "\nCUDA_PATH is not set. Please set it." << std::endl;
-			exit(-1);
-		}
+		cout << "================================================================================" << endl;
+		cout << "=== COMPILING: " << fs::path(path).filename().string() << endl;
+		cout << "================================================================================" << endl;
+
+		success = false;
+
+		string dir = fs::path(path).parent_path().string();
+		// string optInclude = "-I " + dir;
 
-		const string cuda_include = "-I " + cuda_path + "/include";
-		const string dir = fs::path(path).parent_path().string();
-		const string optInclude = "-I " + dir;
+		string cuda_path = std::getenv("CUDA_PATH");
+		// string cuda_include = "-I " + cuda_path + "/include";
 
+		string optInclude = std::format("-I {}", dir).c_str();
+		string cuda_include = std::format("-I {}/include", cuda_path);
+		string cudastd_include = std::format("-I {}/include/cuda/std", cuda_path);
+		
 		nvrtcProgram prog;
 		string source = readFile(path);
 		nvrtcCreateProgram(&prog, source.c_str(), name.c_str(), 0, NULL, NULL);
-
-		success = false;
-
-		std::vector<const char*> opts = {
-			// "--gpu-architecture=compute_75",
-			"--gpu-architecture=compute_86",
+		std::vector<const char*> opts = { 
+			"--gpu-architecture=compute_89",
+			// "--gpu-architecture=compute_86",
 			"--use_fast_math",
 			"--extra-device-vectorization",
 			"-lineinfo",
-			optInclude.c_str(),
 			cuda_include.c_str(),
+			optInclude.c_str(),
+			"-I ./",
 			"--relocatable-device-code=true",
 			"-default-device",
 			"-dlto", 
-			// "--dopt=on",
-			"--std=c++17"
+			"--std=c++20",
+			"--disable-warnings",
 		};
 
-		nvrtcResult res = nvrtcCompileProgram(prog, static_cast<int>(opts.size()), opts.data());
+
+		for(auto opt : opts){
+			cout << opt << endl;
+		}
+		cout << "====" << endl;
+
+		nvrtcResult res = nvrtcCompileProgram(prog, opts.size(), opts.data());
 		
 		if (res != NVRTC_SUCCESS)
 		{
@@ -99,23 +119,19 @@ struct CudaModule{
 			}
 		}
 
-		if(nvvmSize > 0){
-			delete[] nvvm;
-			nvvmSize = 0;
-		}
+		nvrtcGetLTOIRSize(prog, &ltoirSize);
+		ltoir = new char[ltoirSize];
+		nvrtcGetLTOIR(prog, ltoir);
+
+		printfmt("compiled ltoir. size: {} byte \n", ltoirSize);
 
-		nvrtcGetNVVMSize(prog, &nvvmSize);
-		nvvm = new char[nvvmSize];
-		nvrtcGetNVVM(prog, nvvm);
-		// Destroy the program.
 		nvrtcDestroyProgram(&prog);
 
 		compiled = true;
 		success = true;
 
-		// printElapsedTime("compiled " + name, tStart);
-		auto duration = now() - tStart;
-		printfmt("- compiled in {:.3}s \n", duration);
+		printElapsedTime("compile " + name, tStart);
+
 	}
 
 };
@@ -138,6 +154,8 @@ struct CudaModularProgram{
 
 	CUmodule mod;
 	// CUfunction kernel = nullptr;
+	void* cubin;
+	size_t cubinSize;
 
 	vector<std::function<void(void)>> compileCallbacks;
 
@@ -153,9 +171,6 @@ struct CudaModularProgram{
 
 		this->kernelNames = kernelNames;
 
-		printfmt("================================================================================\n");
-		printfmt("building CUDA program \n");
-
 		for(auto modulePath : modulePaths){
 
 			string moduleName = fs::path(modulePath).filename().string();
@@ -164,9 +179,6 @@ struct CudaModularProgram{
 			module->compile();
 
 			monitorFile(modulePath, [&, module]() {
-				printfmt("================================================================================\n");
-				printfmt("building CUDA program \n");
-
 				module->compile();
 				link();
 			});
@@ -179,9 +191,9 @@ struct CudaModularProgram{
 
 	void link(){
 
-		// cout << "================================================================================" << endl;
-		// cout << "=== LINKING" << endl;
-		// cout << "================================================================================" << endl;
+		cout << "================================================================================" << endl;
+		cout << "=== LINKING" << endl;
+		cout << "================================================================================" << endl;
 		
 		auto tStart = now();
 
@@ -193,58 +205,44 @@ struct CudaModularProgram{
 
 		float walltime;
 		constexpr uint32_t v_optimization_level = 1;
-		constexpr size_t logSize = 8192;
+		constexpr uint32_t logSize = 8192;
 		char info_log[logSize];
 		char error_log[logSize];
-
-		vector<CUjit_option> options = {
-			CU_JIT_LTO,
-			CU_JIT_WALL_TIME,
-			CU_JIT_OPTIMIZATION_LEVEL,
-			CU_JIT_INFO_LOG_BUFFER,
-			CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
-			CU_JIT_ERROR_LOG_BUFFER,
-			CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-			CU_JIT_LOG_VERBOSE,
-			// CU_JIT_FAST_COMPILE // CUDA internal only (?)
-		};
-
-		vector<void*> optionVals = {
-			(void*) 1,
-			(void*) &walltime,
-			(void*) 4,
-			(void*) info_log,
-			(void*) logSize,
-			(void*) error_log,
-			(void*) logSize,
-			(void*) 1,
-			// (void*) 1
-		};
 		
 		CUlinkState linkState;
 
-		int numOptions = static_cast<int>(options.size());
-		cu_checked(cuLinkCreate(numOptions, options.data(), optionVals.data(), &linkState));
+		CUdevice cuDevice;
+		cuDeviceGet(&cuDevice, 0);
+
+		int major = 0;
+		int minor = 0;
+		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice);
+		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice);
+
+		int arch = major * 10 + minor;
+		string strArch = std::format("-arch=sm_{}", arch);
+
+		const char *lopts[] = {"-dlto", strArch.c_str()};
+
+		nvJitLinkHandle handle;
+		nvJitLinkCreate(&handle, 2, lopts);
 
 		for(auto module : modules){
-			cu_checked(cuLinkAddData(linkState, CU_JIT_INPUT_NVVM,
-				module->nvvm, module->nvvmSize, module->name.c_str(),
-				0, 0, 0));
+			NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)module->ltoir, module->ltoirSize, "module label"));
 		}
 
-		size_t cubinSize;
-		void *cubin;
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
 
-		cu_checked(cuLinkComplete(linkState, &cubin, &cubinSize));
+		cubin = malloc(cubinSize);
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
 
-		// {
-		// 	printf("link duration: %f ms \n", walltime);
-			printf("link error message: %s \n", error_log);
-			printf("link info message: %s \n", info_log);
-		// }
+		static int cubinID = 0;
+		// writeBinaryFile(format("./program_{}.cubin", cubinID), (uint8_t*)cubin, cubinSize);
+		cubinID++;
 
 		cu_checked(cuModuleLoadData(&mod, cubin));
-		//cu_checked(cuModuleGetFunction(&kernel, mod, "kernel"));
 
 		for(string kernelName : kernelNames){
 			CUfunction kernel;
@@ -257,10 +255,8 @@ struct CudaModularProgram{
 			callback();
 		}
 
-		// printElapsedTime("link duration: ", tStart);
-		auto duration = now() - tStart;
-		printfmt("link duration: {:.3} \n", duration);
-		printfmt("================================================================================\n");
+		// printElapsedTime("cuda link duration: ", tStart);
+
 	}
 
 	void onCompile(std::function<void(void)> callback){
diff --git a/modules/progressive_octree/HostDeviceInterface.h b/modules/progressive_octree/HostDeviceInterface.h
index 73239a2..9fd217c 100644
--- a/modules/progressive_octree/HostDeviceInterface.h
+++ b/modules/progressive_octree/HostDeviceInterface.h
@@ -31,6 +31,7 @@ struct Uniforms{
 	bool showPoints;
 	bool colorByNode;
 	bool colorByLOD;
+	bool colorWhite;
 	bool doUpdateVisibility;
 	bool doProgressive;
 	float LOD;
@@ -38,6 +39,8 @@ struct Uniforms{
 	float minNodeSize;
 	int pointSize;
 	bool updateStats;
+	bool enableEDL;
+	float edlStrength;
 };
 
 struct Stats{
@@ -55,6 +58,8 @@ struct Stats{
 	uint32_t numVisibleLeaves           = 0;
 	uint32_t numVisiblePoints           = 0;
 	uint32_t numVisibleVoxels           = 0;
+	uint32_t numChunksPoints            = 0;
+	uint32_t numChunksVoxels            = 0;
 
 	uint32_t batchletIndex              = 0;
 	uint64_t numPointsProcessed         = 0;
diff --git a/modules/progressive_octree/main_progressive_octree.cpp b/modules/progressive_octree/main_progressive_octree.cpp
index 6c0cd1b..6e719fa 100644
--- a/modules/progressive_octree/main_progressive_octree.cpp
+++ b/modules/progressive_octree/main_progressive_octree.cpp
@@ -118,19 +118,24 @@ glm::mat4 transform_updatebound;
 Stats stats;
 void* h_stats_pinned = nullptr;
 
+double t_drop_start = 0.0;
+
 struct {
-	bool useHighQualityShading       = false;
+	bool useHighQualityShading       = true;
 	bool showBoundingBox             = false;
 	bool doUpdateVisibility          = true;
 	bool showPoints                  = true;
 	bool colorByNode                 = false;
 	bool colorByLOD                  = false;
+	bool colorWhite                  = false;
 	bool autoFocusOnLoad             = true;
 	bool benchmarkRendering          = false;
 	float LOD                        = 0.2f;
 	float minNodeSize                = 64.0f;
 	int pointSize                    = 1;
 	float fovy                       = 60.0f;
+	bool enableEDL                   = true;
+	float edlStrength                = 0.8f;
 } settings;
 
 struct PinnedMemPool{
@@ -225,12 +230,21 @@ bool requestColorFiltering         = false;
 float renderingDuration            = 0.0f;
 uint32_t numPointsUploaded         = 0;
 float loadStart                    = 0.0f;
+
 float kernelUpdateDuration         = 0.0f;
 float totalUpdateDuration          = 0.0f;
 double minKernelUpdateDuration     = Infinity;
 double maxKernelUpdateDuration     = 0.0;
 double avgKernelUpdateDuration     = 0.0;
 double cntKernelUpdateDuration     = 0.0;
+
+float kernelRenderDuration         = 0.0f;
+float totalRenderDuration          = 0.0f;
+double minKernelRenderDuration     = Infinity;
+double maxKernelRenderDuration     = 0.0;
+double avgKernelRenderDuration     = 0.0;
+double cntKernelRenderDuration     = 0.0;
+
 atomic_uint64_t numPointsTotal     = 0;
 atomic_uint64_t numPointsLoaded    = 0;
 atomic_uint64_t numBytesTotal      = 0;
@@ -291,24 +305,27 @@ Uniforms getUniforms(shared_ptr<GLRenderer> renderer){
 	memcpy(&uniforms.transform_updateBound, &transform_updatebound, sizeof(transform_updatebound));
 	memcpy(&uniforms.transformInv_updateBound, &transform_inv_updatebound, sizeof(transform_inv_updatebound));
 	
-	uniforms.width                  = static_cast<float>(renderer->width);
-	uniforms.height                 = static_cast<float>(renderer->height);
-	uniforms.fovy_rad               = 3.1415f * renderer->camera->fovy / 180.0;
-	uniforms.time                   = static_cast<float>(now());
-	uniforms.boxMin                 = float3{0.0f, 0.0f, 0.0f};
-	uniforms.boxMax                 = boxSize;
-	uniforms.frameCounter           = frameCounter;
-	uniforms.showBoundingBox        = settings.showBoundingBox;
-	uniforms.doUpdateVisibility     = settings.doUpdateVisibility;
-	uniforms.showPoints             = settings.showPoints;
-	uniforms.colorByNode            = settings.colorByNode;
-	uniforms.colorByLOD             = settings.colorByLOD;
-	uniforms.LOD                    = settings.LOD;
-	uniforms.minNodeSize            = settings.minNodeSize;
-	uniforms.pointSize              = settings.pointSize;
-	uniforms.useHighQualityShading  = settings.useHighQualityShading;
+	uniforms.width                    = static_cast<float>(renderer->width);
+	uniforms.height                   = static_cast<float>(renderer->height);
+	uniforms.fovy_rad                 = 3.1415f * renderer->camera->fovy / 180.0;
+	uniforms.time                     = static_cast<float>(now());
+	uniforms.boxMin                   = float3{0.0f, 0.0f, 0.0f};
+	uniforms.boxMax                   = boxSize;
+	uniforms.frameCounter             = frameCounter;
+	uniforms.showBoundingBox          = settings.showBoundingBox;
+	uniforms.doUpdateVisibility       = settings.doUpdateVisibility;
+	uniforms.showPoints               = settings.showPoints;
+	uniforms.colorByNode              = settings.colorByNode;
+	uniforms.colorByLOD               = settings.colorByLOD;
+	uniforms.colorWhite               = settings.colorWhite;
+	uniforms.LOD                      = settings.LOD;
+	uniforms.minNodeSize              = settings.minNodeSize;
+	uniforms.pointSize                = settings.pointSize;
+	uniforms.useHighQualityShading    = settings.useHighQualityShading;
 	uniforms.persistentBufferCapacity = persistentBufferCapacity;
 	uniforms.momentaryBufferCapacity  = momentaryBufferCapacity;
+	uniforms.enableEDL                = settings.enableEDL;
+	uniforms.edlStrength              = settings.edlStrength;
 
 	return uniforms;
 }
@@ -489,6 +506,7 @@ void renderCUDA(shared_ptr<GLRenderer> renderer){
 		& cudaprint.cptr
 	};
 
+	
 	auto res_launch = cuLaunchCooperativeKernel(cuda_program_render->kernels["kernel_render"],
 		numGroups, 1, 1,
 		workgroupSize, 1, 1,
@@ -502,6 +520,20 @@ void renderCUDA(shared_ptr<GLRenderer> renderer){
 
 	cuEventRecord(ce_render_end, 0);
 
+	// benchmark kernel- slows down overall loading!
+	if(requestBenchmark){ 
+		cuCtxSynchronize();
+
+		float duration;
+		cuEventElapsedTime(&duration, ce_render_start, ce_render_end);
+
+		kernelRenderDuration    += duration;
+		minKernelRenderDuration = std::min(minKernelRenderDuration, double(duration));
+		maxKernelRenderDuration = std::max(maxKernelRenderDuration, double(duration));
+		avgKernelRenderDuration = (cntKernelRenderDuration * avgKernelRenderDuration + duration) / (cntKernelRenderDuration + 1.0);
+		cntKernelRenderDuration += 1.0;
+	}
+
 	if(settings.benchmarkRendering){
 		cuCtxSynchronize();
 		cuEventElapsedTime(&renderingDuration, ce_render_start, ce_render_end);
@@ -549,7 +581,7 @@ void initCudaProgram(shared_ptr<GLRenderer> renderer){
 	size_t totalMem = 0;
 	cuMemGetInfo(&availableMem, &totalMem);
 
-	size_t cptr_buffer_persistent_bytes = static_cast<size_t>(static_cast<double>(availableMem) * 0.96);
+	size_t cptr_buffer_persistent_bytes = static_cast<size_t>(static_cast<double>(availableMem) * 0.80);
 	persistentBufferCapacity = cptr_buffer_persistent_bytes;
 	cuMemAlloc(&cptr_buffer_persistent, cptr_buffer_persistent_bytes);
 
@@ -621,6 +653,13 @@ void reload(){
 	avgKernelUpdateDuration = 0.0;
 	cntKernelUpdateDuration = 0.0;
 
+	totalRenderDuration     = 0.0f;
+	kernelRenderDuration    = 0.0f;
+	minKernelRenderDuration = Infinity;
+	maxKernelRenderDuration = 0.0;
+	avgKernelRenderDuration = 0.0;
+	cntKernelRenderDuration = 0.0;
+
 	lock_guard<mutex> lock_batchesToProcess(mtx_batchesToProcess);
 	lock_guard<mutex> lock_batchesInPinnedMemory(mtx_batchesInPinnedMemory);
 	lock_guard<mutex> lock_batchesInPageableMemory(mtx_batchesInPageableMemory);
@@ -780,6 +819,10 @@ void spawnLoader(size_t i) {
 			bool everythingIsDone = batchStreamUploadIndex == numBatchesTotal;
 			bool processingLagsBehind = numPointsLoaded > stats.numPointsProcessed + BATCH_STREAM_SIZE * MAX_BATCH_SIZE;
 
+			// if (processingLagsBehind) {
+			// 	printfmt("processing lags behind\n");
+			// }
+
 			if (everythingIsDone || processingLagsBehind || resetInProgress.load()) {
 				std::this_thread::sleep_for(1ms);
 				continue;
@@ -814,6 +857,10 @@ void spawnLoader(size_t i) {
 			if (batch.count > 0) {
 				// load points in batch
 
+				int batchID = batch.first / MAX_BATCH_SIZE;
+				double t_start = now();
+				// printfmt("start loading batch {} at {:.3f} \n", batchID, t_start);
+
 				numThreadsLoading++;
 				if(iEndsWith(batch.file, "las")){
 					void* target = (void*)pinnedPoints;
@@ -859,6 +906,12 @@ void spawnLoader(size_t i) {
 						point.rgba[0] = rgb[0] > 255 ? rgb[0] / 256 : rgb[0];
 						point.rgba[1] = rgb[1] > 255 ? rgb[1] / 256 : rgb[1];
 						point.rgba[2] = rgb[2] > 255 ? rgb[2] / 256 : rgb[2];
+						
+						//int intensity = laz_point->intensity;
+						//point.rgba[0] = intensity / 200;
+						//point.rgba[1] = intensity / 200;
+						//point.rgba[2] = intensity / 200;
+
 
 						pinnedPoints[i] = point;
 					}
@@ -887,6 +940,10 @@ void spawnLoader(size_t i) {
 					batchesInPinnedMemory.push_back(batch);
 				}
 
+				// double t_end = now();
+				// double millies = (t_end - t_start) * 1000.0;
+				// printfmt("finished loading batch {} at {:.3f}. duration: {:.3f} ms \n", batchID, t_end, millies);
+
 				numThreadsLoading--;
 			}else {
 				// give back pinned memory slot if we didn't use it
@@ -1036,6 +1093,13 @@ int main(){
 	// renderer->controls->radius = 929.239;
 	// renderer->controls->target = { 606.560, 385.040, 13.848, };
 
+	// position: 448.8209204653559, 768.7683535080489, 23.676426584479366 
+	// renderer->controls->yaw    = -4.660;
+	// renderer->controls->pitch  = -0.293;
+	// renderer->controls->radius = 94.341;
+	// renderer->controls->target = { 354.609, 764.038, 25.101, };
+
+
 	initCuda();
 	initCudaProgram(renderer);
 
@@ -1056,6 +1120,9 @@ int main(){
 	renderer->onFileDrop([&](vector<string> files){
 		vector<string> pointCloudFiles;
 
+		t_drop_start = now();
+		printfmt("drop at {:.3f} \n", now());
+
 		for(auto file : files){
 			printfmt("dropped: {} \n", file);
 
@@ -1136,6 +1203,16 @@ int main(){
 
 			statsAge = static_cast<int>(renderer->frameCount) - stats.frameID;
 
+			static uint64_t previousNumPointsProcessed = 0;
+			uint64_t numPointsProcessed = stats.numPointsProcessed;
+
+			// if(numPointsProcessed != previousNumPointsProcessed){
+			// 	printfmt("processed {} at {:.3f}. since drop: {:.3f} \n", numPointsProcessed, now(), now() - t_drop_start);
+
+			// 	previousNumPointsProcessed = numPointsProcessed;
+			// }
+			
+
 			bool newLastBatchFinishedDevice = stats.numPointsProcessed == uint64_t(numPointsTotal);
 			if(stats.memCapacityReached){
 				newLastBatchFinishedDevice = true;
@@ -1157,14 +1234,16 @@ int main(){
 
 			ImGui::Begin("Settings");
 			// ImGui::Text("Test abc");
-			ImGui::Checkbox("Show Bounding Box",     &settings.showBoundingBox);
-			ImGui::Checkbox("Update Visibility",     &settings.doUpdateVisibility);
-			ImGui::Checkbox("Show Points",           &settings.showPoints);
-			ImGui::Checkbox("Color by Node",         &settings.colorByNode);
-			ImGui::Checkbox("Color by LOD",          &settings.colorByLOD);
-			ImGui::Checkbox("High-Quality-Shading",  &settings.useHighQualityShading);
-			ImGui::Checkbox("Auto-focus on load",    &settings.autoFocusOnLoad);
-			ImGui::Checkbox("Benchmark Rendering",   &settings.benchmarkRendering);
+			ImGui::Checkbox("Show Bounding Box",        &settings.showBoundingBox);
+			ImGui::Checkbox("Update Visibility",        &settings.doUpdateVisibility);
+			ImGui::Checkbox("Show Points",              &settings.showPoints);
+			ImGui::Checkbox("Color by Node",            &settings.colorByNode);
+			ImGui::Checkbox("Color by LOD",             &settings.colorByLOD);
+			// ImGui::Checkbox("Color white",              &settings.colorWhite);
+			ImGui::Checkbox("enable Eye Dome Lighting", &settings.enableEDL);
+			ImGui::Checkbox("High-Quality-Shading",     &settings.useHighQualityShading);
+			ImGui::Checkbox("Auto-focus on load",       &settings.autoFocusOnLoad);
+			ImGui::Checkbox("Benchmark Rendering",      &settings.benchmarkRendering);
 
 			if(ImGui::Button("Reset")){
 				requestReset = true;
@@ -1285,8 +1364,9 @@ int main(){
 			}
 			
 			ImGui::SliderFloat("minNodeSize", &settings.minNodeSize, 32.0f, 1024.0f);
-			ImGui::SliderInt("Point Size", &settings.pointSize, 1, 5);
+			ImGui::SliderInt("Point Size", &settings.pointSize, 1, 10);
 			ImGui::SliderFloat("FovY", &settings.fovy, 20.0f, 100.0f);
+			ImGui::SliderFloat("EDL Strength", &settings.edlStrength, 0.0f, 3.0f);
 
 			if(ImGui::Button("Copy Camera")){
 				auto controls = renderer->controls;
@@ -1432,6 +1512,8 @@ int main(){
 				{"GB/s (disk I/O)          ", toGB(gbs_file)                                        , format("{:.1f}", gbs_file / GB)},
 				{"GB/s (gpu)               ", toGB(gbs_gpu)                                         , format("{:.1f}", gbs_gpu / GB)},
 				{"=========================", " "                                                   , " "},
+				{"#render kernel duration  ", toMS(kernelRenderDuration)                            , format("{:.1f}", kernelRenderDuration)},
+				{"=========================", " "                                                   , " "},
 				{"rendering duration       ", toMS(renderingDuration)                               , format("{:.1f}", renderingDuration)},
 				{"    points / sec         ", toB(millionPointsSecRendered)                        , format("{:.1f}", millionPointsSecRendered / B)},
 				{"    voxels / sec         ", toB(millionVoxelsSecRendered)                        , format("{:.1f}", millionVoxelsSecRendered / B)},
@@ -1442,6 +1524,9 @@ int main(){
 				{"    #inner               ", toIntString(stats.numInner)                           , format("{}", stats.numInner)},
 				{"    #leaves (nonempty)   ", toIntString(stats.numNonemptyLeaves)                  , format("{}", stats.numNonemptyLeaves)},
 				{"    #leaves (empty)      ", toIntString(numEmptyLeaves)                           , format("{}", numEmptyLeaves)},
+				{"#chunks                  ", toIntString(stats.numNodes)                           , format("{}", stats.numNodes)},
+				{"    #voxels              ", toIntString(stats.numChunksVoxels)                    , format("{}", stats.numChunksVoxels)},
+				{"    #points              ", toIntString(stats.numChunksPoints)                    , format("{}", stats.numChunksPoints)},
 				{"#samples                 ", toM(stats.numPoints + stats.numVoxels)                , format("{:.1f}", (stats.numPoints + stats.numVoxels) / M)},
 				{"    #points              ", toM(stats.numPoints)                                  , format("{:.1f}", stats.numPoints / M)},
 				{"    #voxels              ", toM(stats.numVoxels)                                  , format("{:.1f}", stats.numVoxels / M)},
diff --git a/modules/progressive_octree/progressive_octree_voxels.cu b/modules/progressive_octree/progressive_octree_voxels.cu
index fa6be42..6dafe08 100644
--- a/modules/progressive_octree/progressive_octree_voxels.cu
+++ b/modules/progressive_octree/progressive_octree_voxels.cu
@@ -905,7 +905,7 @@ void kernel_construct(
 			);
 
 			stats->memCapacityReached = true;
-		}else{
+		}else if(!memCapacityReached){
 			stats->memCapacityReached = false;
 		}
 
@@ -960,6 +960,8 @@ void kernel_construct(
 	uint32_t* counter_nonempty_leaves = allocator->alloc<uint32_t*>(4);
 	uint32_t* counter_points          = allocator->alloc<uint32_t*>(4);
 	uint32_t* counter_voxels          = allocator->alloc<uint32_t*>(4);
+	uint32_t* counter_chunks_points   = allocator->alloc<uint32_t*>(4);
+	uint32_t* counter_chunks_voxels   = allocator->alloc<uint32_t*>(4);
 
 	if(grid.thread_rank() == 0){
 		*counter_inner           = 0;
@@ -967,6 +969,8 @@ void kernel_construct(
 		*counter_nonempty_leaves = 0;
 		*counter_points          = 0;
 		*counter_voxels          = 0;
+		*counter_chunks_points   = 0;
+		*counter_chunks_voxels   = 0;
 	}
 	grid.sync();
 
@@ -976,6 +980,7 @@ void kernel_construct(
 		if(node->isLeafFn()){
 			atomicAdd(counter_leaves, 1);
 			atomicAdd(counter_points, node->numPoints);
+			atomicAdd(counter_chunks_points, (node->numPoints + POINTS_PER_CHUNK - 1) / POINTS_PER_CHUNK);
 
 			if(node->numPoints > 0){
 				atomicAdd(counter_nonempty_leaves, 1);
@@ -983,6 +988,7 @@ void kernel_construct(
 		}else{
 			atomicAdd(counter_inner, 1);
 			atomicAdd(counter_voxels, node->numVoxels);
+			atomicAdd(counter_chunks_voxels, (node->numVoxels + POINTS_PER_CHUNK - 1) / POINTS_PER_CHUNK);
 		}
 
 	});
@@ -995,6 +1001,8 @@ void kernel_construct(
 		stats->numNonemptyLeaves         = *counter_nonempty_leaves;
 		stats->numPoints                 = *counter_points;
 		stats->numVoxels                 = *counter_voxels;
+		stats->numChunksPoints           = *counter_chunks_points;
+		stats->numChunksVoxels           = *counter_chunks_voxels;
 		stats->allocatedBytes_momentary  = allocator->offset;
 		stats->allocatedBytes_persistent = allocator_persistent->offset;
 		stats->frameID                   = uniforms.frameCounter;
diff --git a/modules/progressive_octree/utils.h.cu b/modules/progressive_octree/utils.h.cu
index 88f5e11..c90d239 100644
--- a/modules/progressive_octree/utils.h.cu
+++ b/modules/progressive_octree/utils.h.cu
@@ -8,12 +8,14 @@ namespace cg = cooperative_groups;
 #define FALSE 0
 #define TRUE 1
 
-typedef unsigned int uint32_t;
-typedef int int32_t;
-typedef char int8_t;
+typedef signed char int8_t;
 typedef unsigned char uint8_t;
+typedef signed short int16_t;
+typedef unsigned short uint16_t;
+typedef signed int int32_t;
+typedef unsigned int uint32_t;
+typedef signed long long int64_t;
 typedef unsigned long long uint64_t;
-typedef long long int64_t;
 
 // #define Infinity 0x7f800000u
 #define Infinity 1.0f / 0.0f