First OMPT stuffs

tud-zih-energy · Feb 11, 2025 · 48030ec · 48030ec
1 parent 36ce911
commit 48030ec
Show file tree

Hide file tree

Showing 5 changed files with 169 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,7 +10,6 @@ include(CheckStructHasMember)
 include(CheckFunctionExists)
 include(FeatureSummary)
 include(GNUInstallDirs)
-
 SET(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
 
 include(cmake/DefaultBuildType.cmake)
@@ -119,6 +118,7 @@ find_package(Radare)
 find_package(Audit)
 find_package(LibElf REQUIRED)
 find_package(Debuginfod)
+find_package(OpenMP)
 
 
 # configurable options
@@ -140,6 +140,8 @@ CMAKE_DEPENDENT_OPTION(USE_CUPTI "Use CUPTI to record CUDA activity." ON "CUDATo
 add_feature_info("USE_CUPTI" USE_CUPTI "Use CUPTI to record CUDA activity.")
 CMAKE_DEPENDENT_OPTION(USE_DEBUGINFOD "Use Debuginfod to download debug information on-demand." ON "Debuginfod_FOUND" OFF)
 add_feature_info("USE_DEBUGINFOD" USE_DEBUGINFOD "Use Debuginfod to download debug information on-demand.")
+# OMPT support is offered only when an OpenMP C++ toolchain was found.
+CMAKE_DEPENDENT_OPTION(USE_OMPT "Use OMPT to record OpenMP activity." ON "OpenMP_CXX_FOUND" OFF)
+add_feature_info("USE_OMPT" USE_OMPT "Use OMPT to record OpenMP activity.")
 # system configuration checks
 CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H)
 CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID)
@@ -357,6 +359,7 @@ target_include_directories(rb_test PRIVATE include ${CMAKE_CURRENT_BINARY_DIR}/i
 	    otf2xx::Writer)
 
 set(LO2S_CUDA_INJECTIONLIB_PATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblo2s_injection.so")
+
 if(USE_CUPTI)
     if(CUDAToolkit_FOUND)
         add_library(lo2s_injection SHARED src/cupti/lib.cpp src/types.cpp)
@@ -386,6 +389,30 @@ if(USE_CUPTI)
         message(SEND_ERROR "Cupti not found but requested.")
     endif()
 endif()
+# Builds and installs the shared library "ompt_injection" from src/ompt/lib.cpp
+# and defines HAVE_OMPT for the lo2s target.
+if(USE_OMPT)
+    if(NOT OpenMP_CXX_FOUND)
+        message(SEND_ERROR "OMPT not found but requested.")
+    else()
+        add_library(ompt_injection SHARED src/ompt/lib.cpp)
+        target_include_directories(ompt_injection
+            PRIVATE
+                include
+                ${CMAKE_CURRENT_BINARY_DIR}/include)
+
+        target_link_libraries(ompt_injection
+            PRIVATE
+                fmt::fmt
+                Nitro::log
+                Nitro::env
+                Nitro::dl
+                Nitro::options
+                otf2xx::Writer
+                OpenMP::OpenMP_CXX)
+
+        # librt is linked only when SHM_OPEN_FOUND_WITH_RT is set.
+        if(SHM_OPEN_FOUND_WITH_RT)
+            target_link_libraries(ompt_injection PRIVATE rt)
+        endif()
+
+        target_compile_definitions(lo2s PUBLIC HAVE_OMPT)
+        install(TARGETS ompt_injection LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    endif()
+endif()
 
 
 

diff --git a/include/lo2s/config.hpp b/include/lo2s/config.hpp
@@ -112,6 +112,7 @@ struct Config
     bool use_nvidia = false;
     std::string cuda_injectionlib_path;
     uint64_t nvidia_ringbuf_size;
+    bool use_ompt = false;
     DwarfUsage dwarf;
 
     std::string socket_path;

diff --git a/src/config.cpp b/src/config.cpp
@@ -381,6 +381,9 @@ void parse_program_options(int argc, const char** argv)
 #ifdef HAVE_VEOSINFO
     accelerators.push_back("nec");
 #endif
+#ifdef HAVE_OMPT
+    accelerators.push_back("ompt");
+#endif
 
     accel_options
         .multi_option(
@@ -613,6 +616,15 @@ void parse_program_options(int argc, const char** argv)
 #else
             std::cerr << "lo2s was built without support for CUDA kernel recording\n";
             std::exit(EXIT_FAILURE);
+#endif
+        }
+        else if (accel == "ompt")
+        {
+#ifdef HAVE_OMPT
+            config.use_ompt = true;
+#else
+            // "ompt" was requested, but this binary was compiled without HAVE_OMPT.
+            std::cerr << "lo2s was built without support for OpenMP (OMPT) recording\n";
+            std::exit(EXIT_FAILURE);
 #endif
         }
         else

diff --git a/src/monitor/process_monitor_main.cpp b/src/monitor/process_monitor_main.cpp
@@ -168,6 +168,17 @@ std::vector<char*> to_vector_of_c_str(const std::vector<std::string>& vec)
         }
     }
 #endif
+#ifdef HAVE_OMPT
+    // When OMPT recording was requested, extend the environment of the launched
+    // command so that its OpenMP runtime loads libompt_injection.so as a tool.
+    if (config().use_ompt)
+    {
+        // Status message only, so it must not be reported at error severity.
+        Log::debug() << "Using ompt!";
+        env.push_back("OMP_TOOL=enabled");
+        env.push_back("OMP_TOOL_LIBRARIES=libompt_injection.so");
+        // FIXME: hard-coded developer build directory. This only works on the
+        // original author's machine; it should be derived from the install
+        // location of libompt_injection.so (compare config().cuda_injectionlib_path).
+        // NOTE(review): if `env` already holds an LD_LIBRARY_PATH entry, this adds
+        // a second one instead of extending it - confirm how `env` is populated.
+        env.push_back("LD_LIBRARY_PATH=/home/cvonelm/dev/lo2s/build");
+    }
+#endif
+
     std::vector<char*> c_env = to_vector_of_c_str(env);
     std::vector<char*> c_args = to_vector_of_c_str(command_and_args);
 

diff --git a/src/ompt/lib.cpp b/src/ompt/lib.cpp
@@ -0,0 +1,117 @@
+#include <omp-tools.h>
+#include <omp.h>
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <sys/resource.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <thread>
+
+#include <lo2s/ringbuf.hpp>
+
+// Event identifiers; they are the keys of cctx_map below.
+constexpr uint64_t PARALLEL_BEGIN = 1;
+constexpr uint64_t PARALLEL_END = 2;
+
+// This library is loaded into foreign processes, so its mutable state gets
+// internal linkage to avoid exporting generically named symbols such as
+// `clockid` that could clash with symbols of the host application.
+namespace
+{
+// Ring buffer writer; assigned in ompt_initialize().
+std::unique_ptr<lo2s::RingbufWriter> rb_writer = nullptr;
+
+// Clock read by timestampfunc(); ompt_initialize() overwrites it with the
+// clock id stored in the ring buffer header.
+clockid_t clockid = CLOCK_MONOTONIC_RAW;
+
+// Name associated with each event identifier.
+std::map<uint64_t, std::string> cctx_map = { { PARALLEL_BEGIN, "parallel_begin" },
+                                             { PARALLEL_END, "parallel_end" } };
+} // namespace
+
+/**
+ * Reads the clock selected by the global `clockid` and converts the reading
+ * to nanoseconds.
+ *
+ * @return tv_sec * 10^9 + tv_nsec of the clock reading; 0 if clock_gettime()
+ *         left the zero-initialised timespec untouched.
+ */
+uint64_t timestampfunc()
+{
+    // Zero-initialise so a failing clock_gettime() does not yield stack garbage.
+    struct timespec ts = {};
+    clock_gettime(clockid, &ts);
+    // Widen before multiplying: the product must be computed in 64 bits even
+    // on targets where time_t is only 32 bits wide.
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000ULL + static_cast<uint64_t>(ts.tv_nsec);
+}
+
+/**
+ * Handler registered for `ompt_callback_implicit_task`.
+ *
+ * Placeholder: does nothing with its arguments.
+ */
+static void on_ompt_callback_implicit_task(ompt_scope_endpoint_t endpoint,
+                                           ompt_data_t* parallel_data, ompt_data_t* task_data,
+                                           unsigned int actual_parallelism, unsigned int index,
+                                           int flags)
+{
+    // Mark every argument as deliberately unused.
+    (void)endpoint;
+    (void)parallel_data;
+    (void)task_data;
+    (void)actual_parallelism;
+    (void)index;
+    (void)flags;
+}
+
+/**
+ * Handler registered for `ompt_callback_parallel_begin`.
+ *
+ * Writes the marker text "PARALLEL_BEGIN!" (no trailing newline) to stdout;
+ * none of the arguments are inspected.
+ */
+static void on_ompt_callback_parallel_begin(ompt_data_t* encountering_task_data,
+                                            const ompt_frame_t* encountering_task_frame,
+                                            ompt_data_t* parallel_data,
+                                            uint32_t requested_parallelism, int flags,
+                                            const void* codeptr_ra)
+{
+    (void)encountering_task_data;
+    (void)encountering_task_frame;
+    (void)parallel_data;
+    (void)requested_parallelism;
+    (void)flags;
+    (void)codeptr_ra;
+
+    fputs("PARALLEL_BEGIN!", stdout);
+}
+
+/**
+ * Handler registered for `ompt_callback_task_create`.
+ *
+ * Placeholder: does nothing with its arguments.
+ *
+ * @param encountering_task_data  id of parent task
+ * @param encountering_task_frame frame data for parent task
+ * @param new_task_data           id of created task
+ * @param flags                   not used
+ * @param has_dependences         not used
+ * @param codeptr_ra              described by the previous in-line comment as
+ *                                "pointer to outlined function"
+ */
+static void on_ompt_callback_task_create(ompt_data_t* encountering_task_data,
+                                         const ompt_frame_t* encountering_task_frame,
+                                         ompt_data_t* new_task_data, int flags,
+                                         int has_dependences, const void* codeptr_ra)
+{
+    (void)encountering_task_data;
+    (void)encountering_task_frame;
+    (void)new_task_data;
+    (void)flags;
+    (void)has_dependences;
+    (void)codeptr_ra;
+}
+
+/**
+ * Handler registered for `ompt_callback_task_schedule`.
+ *
+ * Placeholder: does nothing with its arguments.
+ */
+static void on_ompt_callback_task_schedule(ompt_data_t* prior_task_data,
+                                           ompt_task_status_t prior_task_status,
+                                           ompt_data_t* next_task_data)
+{
+    (void)prior_task_data;
+    (void)prior_task_status;
+    (void)next_task_data;
+}
+
+/**
+ * Handler registered for `ompt_callback_thread_begin`.
+ *
+ * Placeholder: does nothing with its arguments.
+ */
+static void on_ompt_callback_thread_begin(ompt_thread_t thread_type, ompt_data_t* thread_data)
+{
+    (void)thread_type;
+    (void)thread_data;
+}
+
+/**
+ * Handler registered for `ompt_callback_parallel_end`.
+ *
+ * Writes the marker text "PARALLEL_END!" (no trailing newline) to stdout;
+ * none of the arguments are inspected.
+ */
+static void on_ompt_callback_parallel_end(ompt_data_t* parallel_data,
+                                          ompt_data_t* encountering_task_data, int flags,
+                                          const void* codeptr_ra)
+{
+    (void)parallel_data;
+    (void)encountering_task_data;
+    (void)flags;
+    (void)codeptr_ra;
+
+    fputs("PARALLEL_END!", stdout);
+}
+
+/**
+ * Handler registered for `ompt_callback_thread_end`.
+ *
+ * Placeholder: does nothing with its argument.
+ */
+static void on_ompt_callback_thread_end(ompt_data_t* thread_data)
+{
+    (void)thread_data;
+}
+
+// register_callback_t(name, type): installs the function on_<name> as handler
+// for the OMPT event <name>.
+//  - Assigning &on_<name> to a variable of `type` makes the compiler check the
+//    handler's signature before it is cast to the generic callback type.
+//  - Expects something callable named `ompt_set_callback` in the scope where
+//    the macro is expanded.
+//  - Prints a message to stdout when registration returns ompt_set_never.
+#define register_callback_t(name, type)                                                            \
+    do                                                                                             \
+    {                                                                                              \
+        type checked_##name = &on_##name;                                                          \
+        if (ompt_set_callback(name, reinterpret_cast<ompt_callback_t>(checked_##name)) ==          \
+            ompt_set_never)                                                                        \
+        {                                                                                          \
+            printf("0: Could not register callback '" #name "'\n");                                \
+        }                                                                                          \
+    } while (0)
+
+// register_callback(name): same as above with the signature type <name>_t.
+#define register_callback(name) register_callback_t(name, name##_t)
+
+/**
+ * OMPT initializer; its address is handed to the OpenMP runtime by
+ * ompt_start_tool().
+ *
+ * Looks up the runtime's `ompt_set_callback` entry point, creates the ring
+ * buffer writer for the current process, waits until the writer reports ready,
+ * adopts the clock id stored in the ring buffer header and registers the
+ * on_ompt_callback_* handlers.
+ *
+ * @param lookup             runtime function lookup, used to obtain
+ *                           `ompt_set_callback`
+ * @param initial_device_num not used
+ * @param tool_data          not used
+ * @return 1 on success; 0 when `ompt_set_callback` cannot be looked up
+ */
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t* tool_data)
+{
+    (void)initial_device_num;
+    (void)tool_data;
+
+    std::cout << "Initializing injection!" << std::endl;
+
+    // Resolve the registration entry point before allocating anything: without
+    // it no callback can be installed, and calling through a null pointer
+    // would crash the host application.
+    ompt_set_callback_t ompt_set_callback =
+        reinterpret_cast<ompt_set_callback_t>(lookup("ompt_set_callback"));
+    if (ompt_set_callback == nullptr)
+    {
+        fprintf(stderr, "Could not look up 'ompt_set_callback'\n");
+        return 0;
+    }
+
+    pid_t pid = getpid();
+    // NOTE(review): the last constructor argument is the literal "cuda" although
+    // this is the OMPT library - looks copied from the CUPTI injection library;
+    // confirm what the reading side expects before changing it.
+    rb_writer =
+        std::make_unique<lo2s::RingbufWriter>(16, lo2s::ExecutionScope(lo2s::Process(pid)), "cuda");
+
+    // Wait until the writer reports ready; yield instead of spinning flat out.
+    while (!rb_writer->ready())
+    {
+        std::this_thread::yield();
+    }
+    clockid = rb_writer->header()->clockid;
+
+    register_callback(ompt_callback_implicit_task);
+    register_callback(ompt_callback_parallel_begin);
+    register_callback(ompt_callback_parallel_end);
+    register_callback(ompt_callback_task_create);
+    register_callback(ompt_callback_task_schedule);
+    register_callback(ompt_callback_thread_begin);
+    register_callback(ompt_callback_thread_end);
+
+    return 1; // success
+}
+
+/**
+ * OMPT finalizer; its address is handed to the OpenMP runtime by
+ * ompt_start_tool().
+ *
+ * Placeholder: performs no cleanup.
+ */
+void ompt_finalize(ompt_data_t* tool_data)
+{
+    (void)tool_data;
+}
+
+/**
+ * Tool entry point: returns the initializer/finalizer pair of this tool.
+ *
+ * Declared with C linkage so the symbol is exported under its unmangled name
+ * `ompt_start_tool`, the name an OpenMP runtime searches for in the libraries
+ * listed in OMP_TOOL_LIBRARIES.
+ *
+ * @param omp_version     not used
+ * @param runtime_version not used
+ * @return pointer to a function-local static result structure holding
+ *         &ompt_initialize, &ompt_finalize and a zeroed tool data word
+ */
+extern "C" ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version,
+                                                     const char* runtime_version)
+{
+    (void)omp_version;
+    (void)runtime_version;
+
+    static ompt_start_tool_result_t ompt_start_tool_result = { &ompt_initialize,
+                                                               &ompt_finalize,
+                                                               { .value = 0 } };
+    return &ompt_start_tool_result;
+}