From 9ddf496c1691e27a3ed8cb8687253493cd715ee5 Mon Sep 17 00:00:00 2001 From: m-schuetz Date: Sun, 7 Apr 2024 16:12:36 +0200 Subject: [PATCH] update to CUDA 12.4 --- .gitignore | 6 +- CMakeLists.txt | 3 + README.md | 10 +- include/CudaModularProgram.h | 192 +++++++++--------- .../progressive_octree/HostDeviceInterface.h | 5 + .../main_progressive_octree.cpp | 139 ++++++++++--- .../progressive_octree_voxels.cu | 10 +- modules/progressive_octree/utils.h.cu | 10 +- 8 files changed, 240 insertions(+), 135 deletions(-) diff --git a/.gitignore b/.gitignore index 580a47b..17dd4c9 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,8 @@ build/laszip/laszip.dir/ build/laszip/Release/ build/laszip/Debug/ build/Release/laszip.dll -build/ \ No newline at end of file +build/ +.generated_cuda_programs/progressive_octree_voxels.cu__utils.cu/cubin +.generated_cuda_programs/render.cu__utils.cu/cubin +.generated_cuda_programs/reset.cu__utils.cu/cubin +build_old/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 725fe10..0d42f4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,9 @@ target_link_libraries(${PROJECT_NAME} laszip) target_include_directories(${PROJECT_NAME} PRIVATE libs/laszip) +## TODO: disable copying of cuda source files +## - developer builds should use the project root dir as the working directory and directly load from ./modules +## - public release builds should be shipped with the precompiled cuda programs (cached in temp/cuda_programs) # Post-Build if (SimLOD_CreateCUDASymlinks) execute_process( diff --git a/README.md b/README.md index 3ca6555..5147c1c 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ During rendering, a CUDA kernel first computes a list of visible octree nodes. E ### Install Dependencies -* CUDA Toolkit 11.8 +* CUDA Toolkit 12.4 ### Build & Run @@ -144,9 +144,11 @@ The full San Simeon data set (18 bill @article{SimLOD, title = "SimLOD: Simultaneous LOD Generation and Rendering", author = "Markus Schütz and Lukas Herzberger and Michael Wimmer", - year = "2023", - month = oct, - journal = "Arxiv", + year = "2024", + month = may, + journal = "Proceedings of the ACM in Computer Graphics and Interactive Techniques", + volume = "7", + note = "Source Code: https://github.com/m-schuetz/SimLOD", keywords = "point-based rendering", } diff --git a/include/CudaModularProgram.h b/include/CudaModularProgram.h index 2217db9..d19aa0b 100644 --- a/include/CudaModularProgram.h +++ b/include/CudaModularProgram.h @@ -2,9 +2,9 @@ #include #include - #include "unsuck.hpp" #include "nvrtc.h" +#include #include #include "cuda.h" @@ -12,6 +12,26 @@ using std::string; using namespace std; +#define NVJITLINK_SAFE_CALL(h,x) \ +do { \ + nvJitLinkResult result = x; \ + if (result != NVJITLINK_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " \ + << result << '\n'; \ + size_t lsize; \ + result = nvJitLinkGetErrorLogSize(h, &lsize); \ + if (result == NVJITLINK_SUCCESS && lsize > 0) { \ + char *log = (char*)malloc(lsize); \ + result = nvJitLinkGetErrorLog(h, log); \ + if (result == NVJITLINK_SUCCESS) { \ + std::cerr << "error: " << log << '\n'; \ + free(log); \ + } \ + } \ + exit(1); \ + } \ +} while(0) + struct CudaModule{ void cu_checked(CUresult result){ @@ -24,11 +44,15 @@ struct CudaModule{ string name = ""; bool compiled = false; bool success = false; + size_t ptxSize = 0; char* ptx = nullptr; - size_t nvvmSize; - char *nvvm = nullptr; + size_t ltoirSize = 0; + char* ltoir = nullptr; + + //size_t nvvmSize; + //char *nvvm = nullptr; CudaModule(string path, string name){ this->path = path; @@ -38,52 +62,48 @@ struct CudaModule{ void compile(){ auto tStart = now(); - // cout << "================================================================================" << endl; - // cout << "=== COMPILING: " << fs::path(path).filename().string() << endl; - // cout << "================================================================================" << endl; - printfmt("compiling {} ", fs::path(path).filename().string()); - - const char* cuda_path_cstr = std::getenv("CUDA_PATH"); - std::string cuda_path; - if (cuda_path_cstr) { - cuda_path = std::string(cuda_path_cstr); - if (!cuda_path.empty()) { - std::cout << "\nCUDA_PATH is set to: " << cuda_path << std::endl; - } else { - std::cout << "\nCUDA_PATH is empty. Please set it." << std::endl; - exit(-1); - } - } else { - std::cout << "\nCUDA_PATH is not set. Please set it." << std::endl; - exit(-1); - } + cout << "================================================================================" << endl; + cout << "=== COMPILING: " << fs::path(path).filename().string() << endl; + cout << "================================================================================" << endl; + + success = false; + + string dir = fs::path(path).parent_path().string(); + // string optInclude = "-I " + dir; - const string cuda_include = "-I " + cuda_path + "/include"; - const string dir = fs::path(path).parent_path().string(); - const string optInclude = "-I " + dir; + string cuda_path = std::getenv("CUDA_PATH"); + // string cuda_include = "-I " + cuda_path + "/include"; + string optInclude = std::format("-I {}", dir).c_str(); + string cuda_include = std::format("-I {}/include", cuda_path); + string cudastd_include = std::format("-I {}/include/cuda/std", cuda_path); + nvrtcProgram prog; string source = readFile(path); nvrtcCreateProgram(&prog, source.c_str(), name.c_str(), 0, NULL, NULL); - - success = false; - - std::vector opts = { - // "--gpu-architecture=compute_75", - "--gpu-architecture=compute_86", + std::vector opts = { + "--gpu-architecture=compute_89", + // "--gpu-architecture=compute_86", "--use_fast_math", "--extra-device-vectorization", "-lineinfo", - optInclude.c_str(), cuda_include.c_str(), + optInclude.c_str(), + "-I ./", "--relocatable-device-code=true", "-default-device", "-dlto", - // "--dopt=on", - "--std=c++17" + "--std=c++20", + "--disable-warnings", }; - nvrtcResult res = nvrtcCompileProgram(prog, static_cast(opts.size()), opts.data()); + + for(auto opt : opts){ + cout << opt << endl; + } + cout << "====" << endl; + + nvrtcResult res = nvrtcCompileProgram(prog, opts.size(), opts.data()); if (res != NVRTC_SUCCESS) { @@ -99,23 +119,19 @@ struct CudaModule{ } } - if(nvvmSize > 0){ - delete[] nvvm; - nvvmSize = 0; - } + nvrtcGetLTOIRSize(prog, <oirSize); + ltoir = new char[ltoirSize]; + nvrtcGetLTOIR(prog, ltoir); + + printfmt("compiled ltoir. size: {} byte \n", ltoirSize); - nvrtcGetNVVMSize(prog, &nvvmSize); - nvvm = new char[nvvmSize]; - nvrtcGetNVVM(prog, nvvm); - // Destroy the program. nvrtcDestroyProgram(&prog); compiled = true; success = true; - // printElapsedTime("compiled " + name, tStart); - auto duration = now() - tStart; - printfmt("- compiled in {:.3}s \n", duration); + printElapsedTime("compile " + name, tStart); + } }; @@ -138,6 +154,8 @@ struct CudaModularProgram{ CUmodule mod; // CUfunction kernel = nullptr; + void* cubin; + size_t cubinSize; vector> compileCallbacks; @@ -153,9 +171,6 @@ struct CudaModularProgram{ this->kernelNames = kernelNames; - printfmt("================================================================================\n"); - printfmt("building CUDA program \n"); - for(auto modulePath : modulePaths){ string moduleName = fs::path(modulePath).filename().string(); @@ -164,9 +179,6 @@ struct CudaModularProgram{ module->compile(); monitorFile(modulePath, [&, module]() { - printfmt("================================================================================\n"); - printfmt("building CUDA program \n"); - module->compile(); link(); }); @@ -179,9 +191,9 @@ struct CudaModularProgram{ void link(){ - // cout << "================================================================================" << endl; - // cout << "=== LINKING" << endl; - // cout << "================================================================================" << endl; + cout << "================================================================================" << endl; + cout << "=== LINKING" << endl; + cout << "================================================================================" << endl; auto tStart = now(); @@ -193,58 +205,44 @@ struct CudaModularProgram{ float walltime; constexpr uint32_t v_optimization_level = 1; - constexpr size_t logSize = 8192; + constexpr uint32_t logSize = 8192; char info_log[logSize]; char error_log[logSize]; - - vector options = { - CU_JIT_LTO, - CU_JIT_WALL_TIME, - CU_JIT_OPTIMIZATION_LEVEL, - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - CU_JIT_LOG_VERBOSE, - // CU_JIT_FAST_COMPILE // CUDA internal only (?) - }; - - vector optionVals = { - (void*) 1, - (void*) &walltime, - (void*) 4, - (void*) info_log, - (void*) logSize, - (void*) error_log, - (void*) logSize, - (void*) 1, - // (void*) 1 - }; CUlinkState linkState; - int numOptions = static_cast(options.size()); - cu_checked(cuLinkCreate(numOptions, options.data(), optionVals.data(), &linkState)); + CUdevice cuDevice; + cuDeviceGet(&cuDevice, 0); + + int major = 0; + int minor = 0; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice); + + int arch = major * 10 + minor; + string strArch = std::format("-arch=sm_{}", arch); + + const char *lopts[] = {"-dlto", strArch.c_str()}; + + nvJitLinkHandle handle; + nvJitLinkCreate(&handle, 2, lopts); for(auto module : modules){ - cu_checked(cuLinkAddData(linkState, CU_JIT_INPUT_NVVM, - module->nvvm, module->nvvmSize, module->name.c_str(), - 0, 0, 0)); + NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)module->ltoir, module->ltoirSize, "module label")); } - size_t cubinSize; - void *cubin; + NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle)); + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); - cu_checked(cuLinkComplete(linkState, &cubin, &cubinSize)); + cubin = malloc(cubinSize); + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); + NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); - // { - // printf("link duration: %f ms \n", walltime); - printf("link error message: %s \n", error_log); - printf("link info message: %s \n", info_log); - // } + static int cubinID = 0; + // writeBinaryFile(format("./program_{}.cubin", cubinID), (uint8_t*)cubin, cubinSize); + cubinID++; cu_checked(cuModuleLoadData(&mod, cubin)); - //cu_checked(cuModuleGetFunction(&kernel, mod, "kernel")); for(string kernelName : kernelNames){ CUfunction kernel; @@ -257,10 +255,8 @@ struct CudaModularProgram{ callback(); } - // printElapsedTime("link duration: ", tStart); - auto duration = now() - tStart; - printfmt("link duration: {:.3} \n", duration); - printfmt("================================================================================\n"); + // printElapsedTime("cuda link duration: ", tStart); + } void onCompile(std::function callback){ diff --git a/modules/progressive_octree/HostDeviceInterface.h b/modules/progressive_octree/HostDeviceInterface.h index 73239a2..9fd217c 100644 --- a/modules/progressive_octree/HostDeviceInterface.h +++ b/modules/progressive_octree/HostDeviceInterface.h @@ -31,6 +31,7 @@ struct Uniforms{ bool showPoints; bool colorByNode; bool colorByLOD; + bool colorWhite; bool doUpdateVisibility; bool doProgressive; float LOD; @@ -38,6 +39,8 @@ struct Uniforms{ float minNodeSize; int pointSize; bool updateStats; + bool enableEDL; + float edlStrength; }; struct Stats{ @@ -55,6 +58,8 @@ struct Stats{ uint32_t numVisibleLeaves = 0; uint32_t numVisiblePoints = 0; uint32_t numVisibleVoxels = 0; + uint32_t numChunksPoints = 0; + uint32_t numChunksVoxels = 0; uint32_t batchletIndex = 0; uint64_t numPointsProcessed = 0; diff --git a/modules/progressive_octree/main_progressive_octree.cpp b/modules/progressive_octree/main_progressive_octree.cpp index 6c0cd1b..6e719fa 100644 --- a/modules/progressive_octree/main_progressive_octree.cpp +++ b/modules/progressive_octree/main_progressive_octree.cpp @@ -118,19 +118,24 @@ glm::mat4 transform_updatebound; Stats stats; void* h_stats_pinned = nullptr; +double t_drop_start = 0.0; + struct { - bool useHighQualityShading = false; + bool useHighQualityShading = true; bool showBoundingBox = false; bool doUpdateVisibility = true; bool showPoints = true; bool colorByNode = false; bool colorByLOD = false; + bool colorWhite = false; bool autoFocusOnLoad = true; bool benchmarkRendering = false; float LOD = 0.2f; float minNodeSize = 64.0f; int pointSize = 1; float fovy = 60.0f; + bool enableEDL = true; + float edlStrength = 0.8f; } settings; struct PinnedMemPool{ @@ -225,12 +230,21 @@ bool requestColorFiltering = false; float renderingDuration = 0.0f; uint32_t numPointsUploaded = 0; float loadStart = 0.0f; + float kernelUpdateDuration = 0.0f; float totalUpdateDuration = 0.0f; double minKernelUpdateDuration = Infinity; double maxKernelUpdateDuration = 0.0; double avgKernelUpdateDuration = 0.0; double cntKernelUpdateDuration = 0.0; + +float kernelRenderDuration = 0.0f; +float totalRenderDuration = 0.0f; +double minKernelRenderDuration = Infinity; +double maxKernelRenderDuration = 0.0; +double avgKernelRenderDuration = 0.0; +double cntKernelRenderDuration = 0.0; + atomic_uint64_t numPointsTotal = 0; atomic_uint64_t numPointsLoaded = 0; atomic_uint64_t numBytesTotal = 0; @@ -291,24 +305,27 @@ Uniforms getUniforms(shared_ptr renderer){ memcpy(&uniforms.transform_updateBound, &transform_updatebound, sizeof(transform_updatebound)); memcpy(&uniforms.transformInv_updateBound, &transform_inv_updatebound, sizeof(transform_inv_updatebound)); - uniforms.width = static_cast(renderer->width); - uniforms.height = static_cast(renderer->height); - uniforms.fovy_rad = 3.1415f * renderer->camera->fovy / 180.0; - uniforms.time = static_cast(now()); - uniforms.boxMin = float3{0.0f, 0.0f, 0.0f}; - uniforms.boxMax = boxSize; - uniforms.frameCounter = frameCounter; - uniforms.showBoundingBox = settings.showBoundingBox; - uniforms.doUpdateVisibility = settings.doUpdateVisibility; - uniforms.showPoints = settings.showPoints; - uniforms.colorByNode = settings.colorByNode; - uniforms.colorByLOD = settings.colorByLOD; - uniforms.LOD = settings.LOD; - uniforms.minNodeSize = settings.minNodeSize; - uniforms.pointSize = settings.pointSize; - uniforms.useHighQualityShading = settings.useHighQualityShading; + uniforms.width = static_cast(renderer->width); + uniforms.height = static_cast(renderer->height); + uniforms.fovy_rad = 3.1415f * renderer->camera->fovy / 180.0; + uniforms.time = static_cast(now()); + uniforms.boxMin = float3{0.0f, 0.0f, 0.0f}; + uniforms.boxMax = boxSize; + uniforms.frameCounter = frameCounter; + uniforms.showBoundingBox = settings.showBoundingBox; + uniforms.doUpdateVisibility = settings.doUpdateVisibility; + uniforms.showPoints = settings.showPoints; + uniforms.colorByNode = settings.colorByNode; + uniforms.colorByLOD = settings.colorByLOD; + uniforms.colorWhite = settings.colorWhite; + uniforms.LOD = settings.LOD; + uniforms.minNodeSize = settings.minNodeSize; + uniforms.pointSize = settings.pointSize; + uniforms.useHighQualityShading = settings.useHighQualityShading; uniforms.persistentBufferCapacity = persistentBufferCapacity; uniforms.momentaryBufferCapacity = momentaryBufferCapacity; + uniforms.enableEDL = settings.enableEDL; + uniforms.edlStrength = settings.edlStrength; return uniforms; } @@ -489,6 +506,7 @@ void renderCUDA(shared_ptr renderer){ & cudaprint.cptr }; + auto res_launch = cuLaunchCooperativeKernel(cuda_program_render->kernels["kernel_render"], numGroups, 1, 1, workgroupSize, 1, 1, @@ -502,6 +520,20 @@ void renderCUDA(shared_ptr renderer){ cuEventRecord(ce_render_end, 0); + // benchmark kernel- slows down overall loading! + if(requestBenchmark){ + cuCtxSynchronize(); + + float duration; + cuEventElapsedTime(&duration, ce_render_start, ce_render_end); + + kernelRenderDuration += duration; + minKernelRenderDuration = std::min(minKernelRenderDuration, double(duration)); + maxKernelRenderDuration = std::max(maxKernelRenderDuration, double(duration)); + avgKernelRenderDuration = (cntKernelRenderDuration * avgKernelRenderDuration + duration) / (cntKernelRenderDuration + 1.0); + cntKernelRenderDuration += 1.0; + } + if(settings.benchmarkRendering){ cuCtxSynchronize(); cuEventElapsedTime(&renderingDuration, ce_render_start, ce_render_end); @@ -549,7 +581,7 @@ void initCudaProgram(shared_ptr renderer){ size_t totalMem = 0; cuMemGetInfo(&availableMem, &totalMem); - size_t cptr_buffer_persistent_bytes = static_cast(static_cast(availableMem) * 0.96); + size_t cptr_buffer_persistent_bytes = static_cast(static_cast(availableMem) * 0.80); persistentBufferCapacity = cptr_buffer_persistent_bytes; cuMemAlloc(&cptr_buffer_persistent, cptr_buffer_persistent_bytes); @@ -621,6 +653,13 @@ void reload(){ avgKernelUpdateDuration = 0.0; cntKernelUpdateDuration = 0.0; + totalRenderDuration = 0.0f; + kernelRenderDuration = 0.0f; + minKernelRenderDuration = Infinity; + maxKernelRenderDuration = 0.0; + avgKernelRenderDuration = 0.0; + cntKernelRenderDuration = 0.0; + lock_guard lock_batchesToProcess(mtx_batchesToProcess); lock_guard lock_batchesInPinnedMemory(mtx_batchesInPinnedMemory); lock_guard lock_batchesInPageableMemory(mtx_batchesInPageableMemory); @@ -780,6 +819,10 @@ void spawnLoader(size_t i) { bool everythingIsDone = batchStreamUploadIndex == numBatchesTotal; bool processingLagsBehind = numPointsLoaded > stats.numPointsProcessed + BATCH_STREAM_SIZE * MAX_BATCH_SIZE; + // if (processingLagsBehind) { + // printfmt("processing lags behind\n"); + // } + if (everythingIsDone || processingLagsBehind || resetInProgress.load()) { std::this_thread::sleep_for(1ms); continue; @@ -814,6 +857,10 @@ void spawnLoader(size_t i) { if (batch.count > 0) { // load points in batch + int batchID = batch.first / MAX_BATCH_SIZE; + double t_start = now(); + // printfmt("start loading batch {} at {:.3f} \n", batchID, t_start); + numThreadsLoading++; if(iEndsWith(batch.file, "las")){ void* target = (void*)pinnedPoints; @@ -859,6 +906,12 @@ void spawnLoader(size_t i) { point.rgba[0] = rgb[0] > 255 ? rgb[0] / 256 : rgb[0]; point.rgba[1] = rgb[1] > 255 ? rgb[1] / 256 : rgb[1]; point.rgba[2] = rgb[2] > 255 ? rgb[2] / 256 : rgb[2]; + + //int intensity = laz_point->intensity; + //point.rgba[0] = intensity / 200; + //point.rgba[1] = intensity / 200; + //point.rgba[2] = intensity / 200; + pinnedPoints[i] = point; } @@ -887,6 +940,10 @@ void spawnLoader(size_t i) { batchesInPinnedMemory.push_back(batch); } + // double t_end = now(); + // double millies = (t_end - t_start) * 1000.0; + // printfmt("finished loading batch {} at {:.3f}. duration: {:.3f} ms \n", batchID, t_end, millies); + numThreadsLoading--; }else { // give back pinned memory slot if we didn't use it @@ -1036,6 +1093,13 @@ int main(){ // renderer->controls->radius = 929.239; // renderer->controls->target = { 606.560, 385.040, 13.848, }; + // position: 448.8209204653559, 768.7683535080489, 23.676426584479366 + // renderer->controls->yaw = -4.660; + // renderer->controls->pitch = -0.293; + // renderer->controls->radius = 94.341; + // renderer->controls->target = { 354.609, 764.038, 25.101, }; + + initCuda(); initCudaProgram(renderer); @@ -1056,6 +1120,9 @@ int main(){ renderer->onFileDrop([&](vector files){ vector pointCloudFiles; + t_drop_start = now(); + printfmt("drop at {:.3f} \n", now()); + for(auto file : files){ printfmt("dropped: {} \n", file); @@ -1136,6 +1203,16 @@ int main(){ statsAge = static_cast(renderer->frameCount) - stats.frameID; + static uint64_t previousNumPointsProcessed = 0; + uint64_t numPointsProcessed = stats.numPointsProcessed; + + // if(numPointsProcessed != previousNumPointsProcessed){ + // printfmt("processed {} at {:.3f}. since drop: {:.3f} \n", numPointsProcessed, now(), now() - t_drop_start); + + // previousNumPointsProcessed = numPointsProcessed; + // } + + bool newLastBatchFinishedDevice = stats.numPointsProcessed == uint64_t(numPointsTotal); if(stats.memCapacityReached){ newLastBatchFinishedDevice = true; @@ -1157,14 +1234,16 @@ int main(){ ImGui::Begin("Settings"); // ImGui::Text("Test abc"); - ImGui::Checkbox("Show Bounding Box", &settings.showBoundingBox); - ImGui::Checkbox("Update Visibility", &settings.doUpdateVisibility); - ImGui::Checkbox("Show Points", &settings.showPoints); - ImGui::Checkbox("Color by Node", &settings.colorByNode); - ImGui::Checkbox("Color by LOD", &settings.colorByLOD); - ImGui::Checkbox("High-Quality-Shading", &settings.useHighQualityShading); - ImGui::Checkbox("Auto-focus on load", &settings.autoFocusOnLoad); - ImGui::Checkbox("Benchmark Rendering", &settings.benchmarkRendering); + ImGui::Checkbox("Show Bounding Box", &settings.showBoundingBox); + ImGui::Checkbox("Update Visibility", &settings.doUpdateVisibility); + ImGui::Checkbox("Show Points", &settings.showPoints); + ImGui::Checkbox("Color by Node", &settings.colorByNode); + ImGui::Checkbox("Color by LOD", &settings.colorByLOD); + // ImGui::Checkbox("Color white", &settings.colorWhite); + ImGui::Checkbox("enable Eye Dome Lighting", &settings.enableEDL); + ImGui::Checkbox("High-Quality-Shading", &settings.useHighQualityShading); + ImGui::Checkbox("Auto-focus on load", &settings.autoFocusOnLoad); + ImGui::Checkbox("Benchmark Rendering", &settings.benchmarkRendering); if(ImGui::Button("Reset")){ requestReset = true; @@ -1285,8 +1364,9 @@ int main(){ } ImGui::SliderFloat("minNodeSize", &settings.minNodeSize, 32.0f, 1024.0f); - ImGui::SliderInt("Point Size", &settings.pointSize, 1, 5); + ImGui::SliderInt("Point Size", &settings.pointSize, 1, 10); ImGui::SliderFloat("FovY", &settings.fovy, 20.0f, 100.0f); + ImGui::SliderFloat("EDL Strength", &settings.edlStrength, 0.0f, 3.0f); if(ImGui::Button("Copy Camera")){ auto controls = renderer->controls; @@ -1432,6 +1512,8 @@ int main(){ {"GB/s (disk I/O) ", toGB(gbs_file) , format("{:.1f}", gbs_file / GB)}, {"GB/s (gpu) ", toGB(gbs_gpu) , format("{:.1f}", gbs_gpu / GB)}, {"=========================", " " , " "}, + {"#render kernel duration ", toMS(kernelRenderDuration) , format("{:.1f}", kernelRenderDuration)}, + {"=========================", " " , " "}, {"rendering duration ", toMS(renderingDuration) , format("{:.1f}", renderingDuration)}, {" points / sec ", toB(millionPointsSecRendered) , format("{:.1f}", millionPointsSecRendered / B)}, {" voxels / sec ", toB(millionVoxelsSecRendered) , format("{:.1f}", millionVoxelsSecRendered / B)}, @@ -1442,6 +1524,9 @@ int main(){ {" #inner ", toIntString(stats.numInner) , format("{}", stats.numInner)}, {" #leaves (nonempty) ", toIntString(stats.numNonemptyLeaves) , format("{}", stats.numNonemptyLeaves)}, {" #leaves (empty) ", toIntString(numEmptyLeaves) , format("{}", numEmptyLeaves)}, + {"#chunks ", toIntString(stats.numNodes) , format("{}", stats.numNodes)}, + {" #voxels ", toIntString(stats.numChunksVoxels) , format("{}", stats.numChunksVoxels)}, + {" #points ", toIntString(stats.numChunksPoints) , format("{}", stats.numChunksPoints)}, {"#samples ", toM(stats.numPoints + stats.numVoxels) , format("{:.1f}", (stats.numPoints + stats.numVoxels) / M)}, {" #points ", toM(stats.numPoints) , format("{:.1f}", stats.numPoints / M)}, {" #voxels ", toM(stats.numVoxels) , format("{:.1f}", stats.numVoxels / M)}, diff --git a/modules/progressive_octree/progressive_octree_voxels.cu b/modules/progressive_octree/progressive_octree_voxels.cu index fa6be42..6dafe08 100644 --- a/modules/progressive_octree/progressive_octree_voxels.cu +++ b/modules/progressive_octree/progressive_octree_voxels.cu @@ -905,7 +905,7 @@ void kernel_construct( ); stats->memCapacityReached = true; - }else{ + }else if(!memCapacityReached){ stats->memCapacityReached = false; } @@ -960,6 +960,8 @@ void kernel_construct( uint32_t* counter_nonempty_leaves = allocator->alloc(4); uint32_t* counter_points = allocator->alloc(4); uint32_t* counter_voxels = allocator->alloc(4); + uint32_t* counter_chunks_points = allocator->alloc(4); + uint32_t* counter_chunks_voxels = allocator->alloc(4); if(grid.thread_rank() == 0){ *counter_inner = 0; @@ -967,6 +969,8 @@ void kernel_construct( *counter_nonempty_leaves = 0; *counter_points = 0; *counter_voxels = 0; + *counter_chunks_points = 0; + *counter_chunks_voxels = 0; } grid.sync(); @@ -976,6 +980,7 @@ void kernel_construct( if(node->isLeafFn()){ atomicAdd(counter_leaves, 1); atomicAdd(counter_points, node->numPoints); + atomicAdd(counter_chunks_points, (node->numPoints + POINTS_PER_CHUNK - 1) / POINTS_PER_CHUNK); if(node->numPoints > 0){ atomicAdd(counter_nonempty_leaves, 1); @@ -983,6 +988,7 @@ void kernel_construct( }else{ atomicAdd(counter_inner, 1); atomicAdd(counter_voxels, node->numVoxels); + atomicAdd(counter_chunks_voxels, (node->numVoxels + POINTS_PER_CHUNK - 1) / POINTS_PER_CHUNK); } }); @@ -995,6 +1001,8 @@ void kernel_construct( stats->numNonemptyLeaves = *counter_nonempty_leaves; stats->numPoints = *counter_points; stats->numVoxels = *counter_voxels; + stats->numChunksPoints = *counter_chunks_points; + stats->numChunksVoxels = *counter_chunks_voxels; stats->allocatedBytes_momentary = allocator->offset; stats->allocatedBytes_persistent = allocator_persistent->offset; stats->frameID = uniforms.frameCounter; diff --git a/modules/progressive_octree/utils.h.cu b/modules/progressive_octree/utils.h.cu index 88f5e11..c90d239 100644 --- a/modules/progressive_octree/utils.h.cu +++ b/modules/progressive_octree/utils.h.cu @@ -8,12 +8,14 @@ namespace cg = cooperative_groups; #define FALSE 0 #define TRUE 1 -typedef unsigned int uint32_t; -typedef int int32_t; -typedef char int8_t; +typedef signed char int8_t; typedef unsigned char uint8_t; +typedef signed short int16_t; +typedef unsigned short uint16_t; +typedef signed int int32_t; +typedef unsigned int uint32_t; +typedef signed long long int64_t; typedef unsigned long long uint64_t; -typedef long long int64_t; // #define Infinity 0x7f800000u #define Infinity 1.0f / 0.0f