update to CUDA 12.4

m-schuetz · Apr 7, 2024 · 9ddf496 · 9ddf496
1 parent de23f1c
commit 9ddf496
Show file tree

Hide file tree

Showing 8 changed files with 240 additions and 135 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,4 +20,8 @@ build/laszip/laszip.dir/
 build/laszip/Release/
 build/laszip/Debug/
 build/Release/laszip.dll
-build/
+build/
+.generated_cuda_programs/progressive_octree_voxels.cu__utils.cu/cubin
+.generated_cuda_programs/render.cu__utils.cu/cubin
+.generated_cuda_programs/reset.cu__utils.cu/cubin
+build_old/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -99,6 +99,9 @@ target_link_libraries(${PROJECT_NAME} laszip)
 target_include_directories(${PROJECT_NAME} PRIVATE
 	libs/laszip)
 
+## TODO: disable copying of cuda source files
+## - developer builds should use the project root dir as the working directory and directly load from ./modules
+## - public release builds should be shipped with the precompiled cuda programs (cached in temp/cuda_programs)
 # Post-Build
 if (SimLOD_CreateCUDASymlinks)
 	execute_process(

diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ During rendering, a CUDA kernel first computes a list of visible octree nodes. E
 
 ### Install Dependencies
 
-* CUDA Toolkit 11.8
+* CUDA Toolkit 12.4
 
 ### Build & Run
 
@@ -144,9 +144,11 @@ The <a href="https://doi.org/10.5069/G9CN71V5">full San Simeon data set (18 bill
 @article{SimLOD,
     title =      "SimLOD: Simultaneous LOD Generation and Rendering",
     author =     "Markus Schütz and Lukas Herzberger and Michael Wimmer",
-    year =       "2023",
-    month =      oct,
-    journal =    "Arxiv",
+    year =       "2024",
+    month =      may,
+    journal =    "Proceedings of the ACM on Computer Graphics and Interactive Techniques",
+    volume =     "7",
+    note =       "Source Code: https://github.com/m-schuetz/SimLOD",
     keywords =   "point-based rendering",
 }
 </pre>

diff --git a/include/CudaModularProgram.h b/include/CudaModularProgram.h
@@ -2,16 +2,36 @@
 
 #include <string>
 #include <unordered_map>
-
 #include "unsuck.hpp"
 #include "nvrtc.h"
+#include <nvJitLink.h>
 #include <cmath>
 #include "cuda.h"
 
 using std::string;
 
 using namespace std;
 
+// Evaluates the nvJitLink call `x` exactly once. If it does not return
+// NVJITLINK_SUCCESS, prints the failing expression and its result code to
+// std::cerr, prints the error log of handle `h` when it can be retrieved,
+// and terminates the process via exit(1).
+//   h: nvJitLinkHandle whose error log is queried on failure
+//   x: expression that yields an nvJitLinkResult
+// The log buffer is released whether or not reading the log succeeded, and
+// the macro-local names carry an nvjl_ prefix/suffix so they do not shadow
+// identifiers used inside `x`.
+#define NVJITLINK_SAFE_CALL(h,x)                                  \
+do {                                                              \
+   nvJitLinkResult nvjl_result_ = (x);                            \
+   if (nvjl_result_ != NVJITLINK_SUCCESS) {                       \
+      std::cerr << "\nerror: " #x " failed with error "           \
+                << nvjl_result_ << '\n';                          \
+      size_t nvjl_lsize_ = 0;                                     \
+      nvjl_result_ = nvJitLinkGetErrorLogSize((h), &nvjl_lsize_); \
+      if (nvjl_result_ == NVJITLINK_SUCCESS && nvjl_lsize_ > 0) { \
+         char *nvjl_log_ = (char*)malloc(nvjl_lsize_);            \
+         if (nvjl_log_ != nullptr) {                              \
+            nvjl_result_ = nvJitLinkGetErrorLog((h), nvjl_log_);  \
+            if (nvjl_result_ == NVJITLINK_SUCCESS) {              \
+               std::cerr << "error: " << nvjl_log_ << '\n';       \
+            }                                                     \
+            free(nvjl_log_);                                      \
+         }                                                        \
+      }                                                           \
+      exit(1);                                                    \
+   }                                                              \
+} while(0)
+
 struct CudaModule{
 
 	void cu_checked(CUresult result){
@@ -24,11 +44,15 @@ struct CudaModule{
 	string name = "";
 	bool compiled = false;
 	bool success = false;
+
 	size_t ptxSize = 0;
 	char* ptx = nullptr;
 
-	size_t nvvmSize;
-	char *nvvm = nullptr;
+	size_t ltoirSize = 0;
+	char* ltoir = nullptr;
+
+	//size_t nvvmSize;
+	//char *nvvm = nullptr;
 
 	CudaModule(string path, string name){
 		this->path = path;
@@ -38,52 +62,48 @@ struct CudaModule{
 	void compile(){
 		auto tStart = now();
 
-		// cout << "================================================================================" << endl;
-		// cout << "=== COMPILING: " << fs::path(path).filename().string() << endl;
-		// cout << "================================================================================" << endl;
-		printfmt("compiling {} ", fs::path(path).filename().string());
-
-		const char* cuda_path_cstr = std::getenv("CUDA_PATH");
-		std::string cuda_path;
-		if (cuda_path_cstr) {
-			cuda_path = std::string(cuda_path_cstr);
-			if (!cuda_path.empty()) {
-				std::cout << "\nCUDA_PATH is set to: " << cuda_path << std::endl;
-			} else {
-				std::cout << "\nCUDA_PATH is empty. Please set it." << std::endl;
-				exit(-1);
-			}
-		} else {
-			std::cout << "\nCUDA_PATH is not set. Please set it." << std::endl;
-			exit(-1);
-		}
+		cout << "================================================================================" << endl;
+		cout << "=== COMPILING: " << fs::path(path).filename().string() << endl;
+		cout << "================================================================================" << endl;
+
+		success = false;
+
+		string dir = fs::path(path).parent_path().string();
+		// string optInclude = "-I " + dir;
 
-		const string cuda_include = "-I " + cuda_path + "/include";
-		const string dir = fs::path(path).parent_path().string();
-		const string optInclude = "-I " + dir;
+		string cuda_path = std::getenv("CUDA_PATH");
+		// string cuda_include = "-I " + cuda_path + "/include";
 
+		string optInclude = std::format("-I {}", dir).c_str();
+		string cuda_include = std::format("-I {}/include", cuda_path);
+		string cudastd_include = std::format("-I {}/include/cuda/std", cuda_path);
+
 		nvrtcProgram prog;
 		string source = readFile(path);
 		nvrtcCreateProgram(&prog, source.c_str(), name.c_str(), 0, NULL, NULL);
-
-		success = false;
-
-		std::vector<const char*> opts = {
-			// "--gpu-architecture=compute_75",
-			"--gpu-architecture=compute_86",
+		std::vector<const char*> opts = { 
+			"--gpu-architecture=compute_89",
+			// "--gpu-architecture=compute_86",
 			"--use_fast_math",
 			"--extra-device-vectorization",
 			"-lineinfo",
-			optInclude.c_str(),
 			cuda_include.c_str(),
+			optInclude.c_str(),
+			"-I ./",
 			"--relocatable-device-code=true",
 			"-default-device",
 			"-dlto", 
-			// "--dopt=on",
-			"--std=c++17"
+			"--std=c++20",
+			"--disable-warnings",
 		};
 
-		nvrtcResult res = nvrtcCompileProgram(prog, static_cast<int>(opts.size()), opts.data());
+
+		for(auto opt : opts){
+			cout << opt << endl;
+		}
+		cout << "====" << endl;
+
+		nvrtcResult res = nvrtcCompileProgram(prog, opts.size(), opts.data());
 
 		if (res != NVRTC_SUCCESS)
 		{
@@ -99,23 +119,19 @@ struct CudaModule{
 			}
 		}
 
-		if(nvvmSize > 0){
-			delete[] nvvm;
-			nvvmSize = 0;
-		}
+		nvrtcGetLTOIRSize(prog, &ltoirSize);
+		ltoir = new char[ltoirSize];
+		nvrtcGetLTOIR(prog, ltoir);
+
+		printfmt("compiled ltoir. size: {} byte \n", ltoirSize);
 
-		nvrtcGetNVVMSize(prog, &nvvmSize);
-		nvvm = new char[nvvmSize];
-		nvrtcGetNVVM(prog, nvvm);
-		// Destroy the program.
 		nvrtcDestroyProgram(&prog);
 
 		compiled = true;
 		success = true;
 
-		// printElapsedTime("compiled " + name, tStart);
-		auto duration = now() - tStart;
-		printfmt("- compiled in {:.3}s \n", duration);
+		printElapsedTime("compile " + name, tStart);
+
 	}
 
 };
@@ -138,6 +154,8 @@ struct CudaModularProgram{
 
 	CUmodule mod;
 	// CUfunction kernel = nullptr;
+	void* cubin;
+	size_t cubinSize;
 
 	vector<std::function<void(void)>> compileCallbacks;
 
@@ -153,9 +171,6 @@ struct CudaModularProgram{
 
 		this->kernelNames = kernelNames;
 
-		printfmt("================================================================================\n");
-		printfmt("building CUDA program \n");
-
 		for(auto modulePath : modulePaths){
 
 			string moduleName = fs::path(modulePath).filename().string();
@@ -164,9 +179,6 @@ struct CudaModularProgram{
 			module->compile();
 
 			monitorFile(modulePath, [&, module]() {
-				printfmt("================================================================================\n");
-				printfmt("building CUDA program \n");
-
 				module->compile();
 				link();
 			});
@@ -179,9 +191,9 @@ struct CudaModularProgram{
 
 	void link(){
 
-		// cout << "================================================================================" << endl;
-		// cout << "=== LINKING" << endl;
-		// cout << "================================================================================" << endl;
+		cout << "================================================================================" << endl;
+		cout << "=== LINKING" << endl;
+		cout << "================================================================================" << endl;
 
 		auto tStart = now();
 
@@ -193,58 +205,44 @@ struct CudaModularProgram{
 
 		float walltime;
 		constexpr uint32_t v_optimization_level = 1;
-		constexpr size_t logSize = 8192;
+		constexpr uint32_t logSize = 8192;
 		char info_log[logSize];
 		char error_log[logSize];
-
-		vector<CUjit_option> options = {
-			CU_JIT_LTO,
-			CU_JIT_WALL_TIME,
-			CU_JIT_OPTIMIZATION_LEVEL,
-			CU_JIT_INFO_LOG_BUFFER,
-			CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
-			CU_JIT_ERROR_LOG_BUFFER,
-			CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-			CU_JIT_LOG_VERBOSE,
-			// CU_JIT_FAST_COMPILE // CUDA internal only (?)
-		};
-
-		vector<void*> optionVals = {
-			(void*) 1,
-			(void*) &walltime,
-			(void*) 4,
-			(void*) info_log,
-			(void*) logSize,
-			(void*) error_log,
-			(void*) logSize,
-			(void*) 1,
-			// (void*) 1
-		};
 
 		CUlinkState linkState;
 
-		int numOptions = static_cast<int>(options.size());
-		cu_checked(cuLinkCreate(numOptions, options.data(), optionVals.data(), &linkState));
+		CUdevice cuDevice;
+		cuDeviceGet(&cuDevice, 0);
+
+		int major = 0;
+		int minor = 0;
+		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice);
+		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice);
+
+		int arch = major * 10 + minor;
+		string strArch = std::format("-arch=sm_{}", arch);
+
+		const char *lopts[] = {"-dlto", strArch.c_str()};
+
+		nvJitLinkHandle handle;
+		nvJitLinkCreate(&handle, 2, lopts);
 
 		for(auto module : modules){
-			cu_checked(cuLinkAddData(linkState, CU_JIT_INPUT_NVVM,
-				module->nvvm, module->nvvmSize, module->name.c_str(),
-				0, 0, 0));
+			NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)module->ltoir, module->ltoirSize, "module label"));
 		}
 
-		size_t cubinSize;
-		void *cubin;
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
 
-		cu_checked(cuLinkComplete(linkState, &cubin, &cubinSize));
+		cubin = malloc(cubinSize);
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
+		NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
 
-		// {
-		// 	printf("link duration: %f ms \n", walltime);
-			printf("link error message: %s \n", error_log);
-			printf("link info message: %s \n", info_log);
-		// }
+		static int cubinID = 0;
+		// writeBinaryFile(format("./program_{}.cubin", cubinID), (uint8_t*)cubin, cubinSize);
+		cubinID++;
 
 		cu_checked(cuModuleLoadData(&mod, cubin));
-		//cu_checked(cuModuleGetFunction(&kernel, mod, "kernel"));
 
 		for(string kernelName : kernelNames){
 			CUfunction kernel;
@@ -257,10 +255,8 @@ struct CudaModularProgram{
 			callback();
 		}
 
-		// printElapsedTime("link duration: ", tStart);
-		auto duration = now() - tStart;
-		printfmt("link duration: {:.3} \n", duration);
-		printfmt("================================================================================\n");
+		// printElapsedTime("cuda link duration: ", tStart);
+
 	}
 
 	void onCompile(std::function<void(void)> callback){

diff --git a/modules/progressive_octree/HostDeviceInterface.h b/modules/progressive_octree/HostDeviceInterface.h
@@ -31,13 +31,16 @@ struct Uniforms{
 	bool showPoints;
 	bool colorByNode;
 	bool colorByLOD;
+	bool colorWhite;
 	bool doUpdateVisibility;
 	bool doProgressive;
 	float LOD;
 	bool useHighQualityShading;
 	float minNodeSize;
 	int pointSize;
 	bool updateStats;
+	bool enableEDL;
+	float edlStrength;
 };
 
 struct Stats{
@@ -55,6 +58,8 @@ struct Stats{
 	uint32_t numVisibleLeaves           = 0;
 	uint32_t numVisiblePoints           = 0;
 	uint32_t numVisibleVoxels           = 0;
+	uint32_t numChunksPoints            = 0;
+	uint32_t numChunksVoxels            = 0;
 
 	uint32_t batchletIndex              = 0;
 	uint64_t numPointsProcessed         = 0;