[DEBUG] Add support for print-after-all etc. for LLVM passes in optimize_module (triton-lang#2995)

This is guarded by LLVM_IR_ENABLE_DUMP. translateLLVMIRToASM uses the
legacy pass manager, so it doesn't need this hook.

I haven't figured out how to set PrintAfterAll with this env variable.
Another related issue is that triton-llvm-opt doesn't have the same
pipeline setup. See
https://github.com/openai/triton/blob/e6e5d5468e92ed3af3e40babdd55c3da506ab01f/bin/triton-llvm-opt.cpp#L61.
It may also be worthwhile to add the O3 pipeline, as in
https://github.com/openai/triton/blob/e6e5d5468e92ed3af3e40babdd55c3da506ab01f/python/src/llvm.cc#L192.

Ideally, we want triton-llvm-opt to run on the un-optimized LLIR (i.e.,
with DISABLE_LLVM_OPT) and to apply the same optimizations as
optimize_module, so that issues can be reproduced.

---------

Co-authored-by: Manman Ren <[email protected]>
Co-authored-by: Keren Zhou <[email protected]>
3 people authored Feb 2, 2024
1 parent 075701a commit 5f60092
Showing 3 changed files with 18 additions and 3 deletions.
2 changes: 1 addition & 1 deletion include/triton/Tools/Sys/GetEnv.hpp
@@ -32,7 +32,7 @@ namespace triton {
const std::set<std::string> ENV_VARS = {
"DISABLE_MMA_V3", "TRITON_DISABLE_LINE_INFO", "DISABLE_FAST_REDUCTION",
"ENABLE_TMA", "MLIR_ENABLE_DUMP", "LLVM_IR_ENABLE_DUMP",
"AMDGCN_ENABLE_DUMP", "DISABLE_LLVM_OPT"};
"AMDGCN_ENABLE_DUMP", "DISABLE_LLVM_OPT", "DISABLE_PTXAS_OPT"};

namespace tools {

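The getBoolEnv helper that reads this registry can be approximated in Python as follows; the exact set of truthy spellings it accepts is an assumption here, not taken from the source:

```python
import os

def get_bool_env(name: str) -> bool:
    # Rough Python analogue of triton::tools::getBoolEnv.
    # Assumption: an env var counts as enabled when its value is
    # "1", "true", or "on" (case-insensitive).
    value = os.environ.get(name, "").strip().lower()
    return value in ("1", "true", "on")
```

Under this reading, `LLVM_IR_ENABLE_DUMP=1` and `LLVM_IR_ENABLE_DUMP=true` would both enable the new dump path.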
14 changes: 13 additions & 1 deletion python/src/llvm.cc
@@ -14,6 +14,7 @@
#include "llvm/Pass.h"
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
@@ -160,6 +161,16 @@ void init_triton_llvm(py::module &&m) {
FunctionAnalysisManager fam;
CGSCCAnalysisManager cgam;
ModuleAnalysisManager mam;

PassInstrumentationCallbacks *instrCbPtr = nullptr;
PassInstrumentationCallbacks passInstrCb;
StandardInstrumentations standardInstr(mod->getContext(),
/*DebugLogging*/ true);
if (triton::tools::getBoolEnv("LLVM_IR_ENABLE_DUMP")) {
standardInstr.registerCallbacks(passInstrCb, &mam);
instrCbPtr = &passInstrCb;
}

PipelineTuningOptions tuningOptions;
tuningOptions.LoopUnrolling = true;
tuningOptions.LoopInterleaving = true;
@@ -172,7 +183,8 @@
// some scheduling solution.
tuningOptions.SLPVectorization = true;

PassBuilder pb(nullptr /*targetMachine*/, tuningOptions);
PassBuilder pb(nullptr /*targetMachine*/, tuningOptions, std::nullopt,
instrCbPtr);

pb.registerModuleAnalyses(mam);
pb.registerCGSCCAnalyses(cgam);
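The shape of the hunk above — build the instrumentation object only when LLVM_IR_ENABLE_DUMP is set, and otherwise hand PassBuilder a null callbacks pointer — can be sketched in Python. The class and function names below are illustrative, not Triton or LLVM APIs:

```python
import os

class PrintAfterEachPass:
    """Illustrative stand-in for LLVM's StandardInstrumentations."""

    def after_pass(self, pass_name: str, module_text: str) -> None:
        print(f"; IR dump after {pass_name}\n{module_text}")

def run_pipeline(module_text, passes):
    # Attach dump callbacks only when the env flag is set,
    # mirroring the instrCbPtr-is-null-by-default pattern above.
    callbacks = None
    if os.environ.get("LLVM_IR_ENABLE_DUMP", "0") == "1":
        callbacks = PrintAfterEachPass()
    for name, transform in passes:
        module_text = transform(module_text)
        if callbacks is not None:
            callbacks.after_pass(name, module_text)
    return module_text
```

The pipeline behaves identically with or without the flag; only the dumping side effect changes, which is why the null-pointer default is safe.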
5 changes: 4 additions & 1 deletion third_party/nvidia/backend/compiler.py
@@ -554,7 +554,10 @@ def make_cubin(src, metadata, opt, capability):
line_info = '' if os.environ.get('TRITON_DISABLE_LINE_INFO') else ' -lineinfo'
fmad = '' if opt.enable_fp_fusion else ' --fmad=false'
suffix = 'a ' if capability == 90 else ' '
cmd = f'{ptxas}{line_info}{fmad} -v --gpu-name=sm_{capability}{suffix}{fsrc.name} -o {fbin} 2> {flog.name}'
if os.environ.get("DISABLE_PTXAS_OPT", "0") == "1":
    cmd = f'{ptxas}{line_info}{fmad} -v --opt-level 0 --gpu-name=sm_{capability}{suffix}{fsrc.name} -o {fbin} 2> {flog.name}'
else:
    cmd = f'{ptxas}{line_info}{fmad} -v --gpu-name=sm_{capability}{suffix}{fsrc.name} -o {fbin} 2> {flog.name}'

try:
subprocess.run(cmd, shell=True, check=True)
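The two command strings in the hunk above differ only in the `--opt-level 0` flag. A sketch of an equivalent, de-duplicated construction (the helper name and parameters are illustrative, not part of the commit):

```python
import os

def make_ptxas_cmd(ptxas, line_info, fmad, capability, suffix,
                   src_path, bin_path, log_path):
    # Insert --opt-level 0 only when DISABLE_PTXAS_OPT=1 is set,
    # keeping the rest of the command identical in both cases.
    opt = ' --opt-level 0' if os.environ.get("DISABLE_PTXAS_OPT", "0") == "1" else ''
    return (f'{ptxas}{line_info}{fmad} -v{opt} --gpu-name=sm_{capability}'
            f'{suffix}{src_path} -o {bin_path} 2> {log_path}')
```

Building the optional flag once avoids the risk of the two branches drifting apart in future edits.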
