SciNim · lilkeet · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/c2nim/cuda12_5/cuda.h b/c2nim/cuda12_5/cuda.h
@@ -0,0 +1,32 @@
+
+#ifdef C2NIM
+  #assumendef __cuda_cuda_h__
+
+  #mangle cuuint32_t cint
+  #mangle cuuclonglong culonglong
+  #mangle default_ defaultUnderScore
+  #mangle _oversize underScoreOversize
+  #mangle _internal_padding underScoreInternal_padding
+
+  #def __device_builtin__
+
+  #def __CUDA_DEPRECATED
+
+  #skipinclude
+
+  #dynlib libName
+  #private libName
+  #cdecl
+  #if defined(windows)
+  #  stdcall
+  #  define libName "cuda.dll" // dont know that this is right
+  #elif defined(macosx)
+  #  define libName "libcuda.dylib"
+  #else
+  #  define libName "libcuda.so"
+  #endif
+
+
+
+#endif
+
diff --git a/c2nim/cuda12_5/nvrtc.h b/c2nim/cuda12_5/nvrtc.h
@@ -0,0 +1,28 @@
+
+#ifdef C2NIM
+  #assumendef __NVRTC_H__
+
+  #def __device_builtin__
+
+  #mangle _nvrtcProgram nvrtcProgramObj
+
+  #skipinclude
+
+  #dynlib libName
+  #private libName
+  #cdecl
+  #if defined(windows)
+  #  stdcall
+  #  define libName "nvrtc64.dll" // dont know that this is right
+  #elif defined(macosx)
+  #  define libName "libnvrtc.dylib"
+  #else
+  #  define libName "libnvrtc.so"
+  #endif
+
+#@
+type nvrtcProgramObj {.noDecl, incompleteStruct.} = object
+@#
+
+#endif
+
diff --git a/examples/cuda12_5/runtimeCompilation.nim b/examples/cuda12_5/runtimeCompilation.nim
@@ -0,0 +1,120 @@
+
+# this example is from https://docs.nvidia.com/cuda/nvrtc/index.html#basic-usage
+
+import
+  std / [strformat],
+  ../../src/nimcuda/cuda12_5/[nvrtc, check, cuda]
+
+const
+  NumThreads = 128
+  NumBlocks = 32
+
+const Saxpy = cstring"""
+extern "C" __global__
+void saxpy(float a, float *x, float *y, float *out, size_t n)
+{
+   size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+   if (tid < n) {
+      out[tid] = a * x[tid] + y[tid];
+   }
+}
+"""
+
+proc main =
+  # Create an instance of nvrtcProgram with the SAXPY code string.
+  var prog: nvrtcProgram
+  check nvrtcCreateProgram(addr(prog), Saxpy, "saxpy.cu", 0, nil, nil)
+
+  # Compile the program with fmad disabled.
+  # Note: Can specify GPU target architecture explicitly with '-arch' flag.
+  const
+    Options = [cstring "--fmad=false"]
+    NumberOfOptions = cint Options.len
+  let compileResult =  nvrtcCompileProgram(prog, NumberOfOptions,
+                                           cast[cstringArray](addr Options[0]))
+
+  block obtainLog: # Obtain compilation log from the program.
+    var logSize: csize_t
+    check nvrtcGetProgramLogSize(prog, addr logSize)
+
+    var log = cstring newString(Natural logSize)
+
+    check nvrtcGetProgramLog(prog, log)
+
+    echo fmt"log = '{log}'" # usually empty if no issues found by the compiler
+
+  check compileResult
+
+
+  var ptx: cstring
+  block obtainPtx: # Obtain PTX from the program.
+    var ptxSize: csize_t
+    check nvrtcGetPTXSize(prog, addr ptxSize)
+
+    ptx = cstring newString(Natural ptxSize)
+    check nvrtcGetPTX(prog, ptx)
+
+    check nvrtcDestroyProgram(addr prog) # Destroy the program.
+
+  block execution:
+    # Load the generated PTX and get a handle to the SAXPY kernel.
+    var
+      cuDevice: CUdevice
+      context: CUcontext
+      module: CUmodule
+      kernel: CUfunction
+
+    check cuInit(0)
+    check cuDeviceGet(addr cuDevice, 0)
+    check cuCtxCreate(addr context, 0, cuDevice)
+    check cuModuleLoadDataEx(addr module, ptx, 0, nil, nil)
+    check cuModuleGetFunction(addr kernel, module, "saxpy")
+
+    let
+      n = csize_t(NumThreads * NumBlocks)
+      bufferSize = n * csize_t(sizeOf cfloat)
+    let a = cfloat 5.1
+    var
+      hX = newSeqUninit[cfloat](n)
+      hY = newSeqUninit[cfloat](n)
+      hOut = newSeqUninit[cfloat](n)
+    for i in 0 ..< n: # Initialize host data (fill hX and hY with ur data)
+      hX[i] = cfloat(i)
+      hY[i] = cfloat(i * 2)
+
+    var
+      dX: CUdeviceptr
+      dY: CUdeviceptr
+      dOut: CUdeviceptr
+    check cuMemAlloc(addr dX, bufferSize)
+    check cuMemAlloc(addr dY, bufferSize)
+    check cuMemAlloc(addr dOut, bufferSize)
+
+    check cuMemcpyHtoD(dX, addr hX[0], bufferSize)
+    check cuMemcpyHtoD(dY, addr hY[0], bufferSize)
+
+    # Execute SAXPY.
+    let args: array[5, pointer] = [pointer(addr a), addr dX, addr dY,
+      addr dOut, addr n]
+    check cuLaunchKernel(kernel,
+                         NumBlocks, 1, 1,  # grid dim
+                         NumThreads, 1, 1, # block dim
+                         0, nil,           # shared mem and stream
+                         addr args[0],      # arguments,
+                         nil)
+
+    check cuCtxSynchronize()
+    check cuMemcpyDtoH(addr hOut[0], dOut, bufferSize)
+
+    for i in 0 ..< n:
+      echo fmt"{a} * {hX[i]} + {hY[i]} = {hOut[i]}"
+
+    check cuMemFree(dX)
+    check cuMemFree(dY)
+    check cuMemFree(dOut)
+
+    check cuModuleUnload module
+    check cuCtxDestroy context
+
+
+main()
diff --git a/include/cuda12_5/cuda.h b/include/cuda12_5/cuda.h
@@ -54,14 +54,14 @@
 
 
 #include <stdlib.h>
-#ifdef _MSC_VER
-typedef unsigned __int32 cuuint32_t;
-typedef unsigned __int64 cuuclonglong;
-#else
-#include <stdint.h>
-typedef uint32_t cuuint32_t;
-typedef uclonglong cuuclonglong;
-#endif
+// #ifdef _MSC_VER
+// typedef unsigned __int32 cuuint32_t;
+// typedef unsigned __int64 cuuclonglong;
+// #else
+// #include <stdint.h>
+// typedef uint32_t cuuint32_t;
+// typedef uclonglong cuuclonglong;
+// #endif
 
 // #if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
 // #define __CUDA_DEPRECATED