Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tracer annotations to HIP code in ReSolve #134

Merged
merged 6 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ornl_ascent_mirror.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: ORNL Ascent Mirror

# triggers a github action everytime there is a push or mr
on:
push:
#push:

jobs:
# To test on HPC resources we must first mirror the repo and then trigger a pipeline
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ornl_crusher_mirror.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ name: ORNL Crusher Mirror

# triggers a github action everytime there is a push or mr
on:
pull_request:
#pull_request:
push:
branches:
- develop
- main
- never #develop
- ever #main

jobs:
# To test on HPC resources we must first mirror the repo and then trigger a pipeline
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when run
option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON)
option(RESOLVE_USE_CUDA "Use CUDA language and SDK" OFF)
option(RESOLVE_USE_HIP "Use HIP language and ROCm library" OFF)
option(RESOLVE_USE_PROFILING "Set profiling tracers in the code" OFF)

option(RESOLVE_USE_GPU "Use GPU device for computations" OFF)
mark_as_advanced(FORCE RESOLVE_USE_GPU)
Expand Down
3 changes: 3 additions & 0 deletions cmake/ReSolveFindHipLibraries.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ target_link_libraries(resolve_hip INTERFACE
roc::rocsolver
)

# HIP/ROCm targets still don't have include directories set correctly
# We need this little hack for now :/
get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "HIP include directories found at: ${hip_includes}")

target_include_directories(resolve_hip INTERFACE
$<BUILD_INTERFACE:${hip_includes}>)
Expand Down
11 changes: 4 additions & 7 deletions docs/sphinx/developer_guide/profiling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,10 @@ requires code to be instrumented using `ROC Tracer <https://rocm.docs.amd.com/pr
library. Both, ROCProfiler and ROC Tracer are part of the ROCm library, so
no additional software needs to be installed once you obtain ROCm.

To build your instrumented code, you need to link your Re::Solve build to
ROC Tracer library:

.. code:: cmake

target_include_directories(ReSolve SYSTEM PUBLIC ${HIP_PATH}/roctracer/include ${HIP_PATH}/include )
target_link_libraries(ReSolve PUBLIC "-L${HIP_PATH}/roctracer/lib -lroctracer64" "-L${HIP_PATH}/roctracer/lib -lroctx64" )
First, you need to make sure your Re::Solve library was built with ROC Tracer
support, i.e. the build was configured with CMake boolean flag
``RESOLVE_USE_PROFILING`` set to ``On``. Note that ROC Tracer annotation will
be enabled only if Re::Solve is built with HIP support.

Next, you need to annotate events you want to trace in your code execution.
This could be done in a straightforward manner using ROC Tracer push and pop
Expand Down
35 changes: 26 additions & 9 deletions examples/r_KLU_rocSolverRf_FGMRES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <resolve/LinSolverDirectRocSolverRf.hpp>
#include <resolve/LinSolverIterativeFGMRES.hpp>
#include <resolve/workspace/LinAlgWorkspace.hpp>
#include <resolve/Profiling.hpp>

using namespace ReSolve::constants;

Expand All @@ -28,8 +29,8 @@ int main(int argc, char *argv[])
std::string rhsFileName = argv[2];

index_type numSystems = atoi(argv[3]);
std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<<numSystems<<std::endl;
std::cout<<"Family rhs file name: "<< rhsFileName << ", total number of RHSes: " << numSystems<<std::endl;
std::cout << "Family mtx file name: " << matrixFileName << ", total number of matrices: " << numSystems << std::endl;
std::cout << "Family rhs file name: " << rhsFileName << ", total number of RHSes: " << numSystems << std::endl;

std::string fileId;
std::string rhsId;
Expand All @@ -48,15 +49,19 @@ int main(int argc, char *argv[])
vector_type* vec_rhs;
vector_type* vec_x;
vector_type* vec_r;
real_type norm_A, norm_x, norm_r;//used for INF norm
real_type norm_A;
real_type norm_x;
real_type norm_r;

ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP);
ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS);

RESOLVE_RANGE_PUSH(__FUNCTION__);
for (int i = 0; i < numSystems; ++i)
{
RESOLVE_RANGE_PUSH("Matrix Read");
index_type j = 4 + i * 2;
fileId = argv[j];
rhsId = argv[j + 1];
Expand Down Expand Up @@ -104,11 +109,16 @@ int main(int argc, char *argv[])
ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
ReSolve::io::readAndUpdateRhs(rhs_file, &rhs);
}
std::cout<<"Finished reading the matrix and rhs, size: "<<A->getNumRows()<<" x "<<A->getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<<A->symmetric()<< ", Expanded? "<<A->expanded()<<std::endl;
std::cout << "Finished reading the matrix and rhs, size: " << A->getNumRows() << " x "<< A->getNumColumns()
<< ", nnz: " << A->getNnz()
<< ", symmetric? " << A->symmetric()
<< ", Expanded? " << A->expanded() << std::endl;
mat_file.close();
rhs_file.close();
RESOLVE_RANGE_POP("Matrix Read");

//Now convert to CSR.
RESOLVE_RANGE_PUSH("Convert to CSR");
if (i < 2) {
A->updateFromCoo(A_coo, ReSolve::memory::HOST);
vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
Expand All @@ -117,10 +127,12 @@ int main(int argc, char *argv[])
A->updateFromCoo(A_coo, ReSolve::memory::DEVICE);
vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
}
RESOLVE_RANGE_POP("Convert to CSR");
std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
int status;
real_type norm_b;
if (i < 2){
if (i < 2) {
RESOLVE_RANGE_PUSH("KLU");
KLU->setup(A);
matrix_handler->setValuesChanged(true, ReSolve::memory::DEVICE);
status = KLU->analyze();
Expand Down Expand Up @@ -153,7 +165,9 @@ int main(int argc, char *argv[])
GS->setup(A->getNumRows(), FGMRES->getRestart());
FGMRES->setup(A);
}
RESOLVE_RANGE_POP("KLU");
} else {
RESOLVE_RANGE_PUSH("RocSolver");
//status = KLU->refactorize();
std::cout<<"Using ROCSOLVER RF"<<std::endl;
status = Rf->refactorize();
Expand All @@ -177,10 +191,11 @@ int main(int argc, char *argv[])
matrix_handler->matrixInfNorm(A, &norm_A, ReSolve::memory::DEVICE);
norm_x = vector_handler->infNorm(vec_x, ReSolve::memory::DEVICE);
norm_r = vector_handler->infNorm(vec_r, ReSolve::memory::DEVICE);
std::cout << "\t Matrix inf norm: " << std::scientific << std::setprecision(16) << norm_A<<"\n"
<< "\t Residual inf norm: " << norm_r <<"\n"
<< "\t Solution inf norm: " << norm_x <<"\n"
<< "\t Norm of scaled residuals: "<< norm_r / (norm_A * norm_x) << "\n";
std::cout << std::scientific << std::setprecision(16)
<< "\t Matrix inf norm: " << norm_A << "\n"
<< "\t Residual inf norm: " << norm_r << "\n"
<< "\t Solution inf norm: " << norm_x << "\n"
<< "\t Norm of scaled residuals: " << norm_r / (norm_A * norm_x) << "\n";

vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
if(!std::isnan(rnrm) && !std::isinf(rnrm)) {
Expand All @@ -193,9 +208,11 @@ int main(int argc, char *argv[])
<< FGMRES->getFinalResidualNorm()/norm_b
<< " iter: " << FGMRES->getNumIter() << "\n";
}
RESOLVE_RANGE_POP("RocSolver");
}

} // for (int i = 0; i < numSystems; ++i)
RESOLVE_RANGE_POP(__FUNCTION__);

delete A;
delete A_coo;
Expand Down
18 changes: 17 additions & 1 deletion resolve/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,26 @@ target_include_directories(ReSolve INTERFACE
$<INSTALL_INTERFACE:include>
)

# TODO: Make this PRIVATE dependency (requires refactoring ReSolve code)
target_link_libraries(ReSolve PUBLIC ${ReSolve_Targets_List})
target_link_libraries(ReSolve PRIVATE resolve_version)

if(RESOLVE_USE_PROFILING)
if(RESOLVE_USE_HIP)
# Roctracer does not provide CMake target, so we use this hack here.
# The assumption is roctracer lib and headers are installed at the same
# place as the rest of ROCm.
target_link_libraries(ReSolve PUBLIC "-lroctracer64 -lroctx64")
elseif(RESOLVE_USE_CUDA)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest throwing a warning or error here to notify the user that profiling is currently only support with roctx.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe 918a794 addresses this comment.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc #63

# Nothing to do for CUDA profiling for now.
message(NOTICE "Profiling support enabled, but Re::Solve does not create tracer annotations for CUDA.")
message(NOTICE "This profiling support option will have no effect.")
else()
# Noting to do for profiling on the host for now.
message(NOTICE "Profiling support enabled, but Re::Solve does not create tracer annotations for host code.")
message(NOTICE "This profiling support option will have no effect.")
Comment on lines +153 to +154
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could support this with a different profiling tool like caliper/tau... At the same time, I don't see why we couldn't use ROCm/CUDA profilers to view host code performance as well, albeit that would require a hack + messy CMake

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, we can, it is just that I didn't do it in this PR. Probably the message should say "tracer annotation not implemented for ...".

endif()
endif(RESOLVE_USE_PROFILING)

# Install targets
install(TARGETS ReSolve
EXPORT ReSolveTargets
Expand Down
21 changes: 14 additions & 7 deletions resolve/LinSolverDirectRocSolverRf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <resolve/matrix/Csr.hpp>
#include "LinSolverDirectRocSolverRf.hpp"
#include <resolve/hip/hipKernels.h>
#include <resolve/Profiling.hpp>

namespace ReSolve
{
Expand Down Expand Up @@ -31,6 +32,7 @@ namespace ReSolve
index_type* Q,
vector_type* rhs)
{
RESOLVE_RANGE_PUSH(__FUNCTION__);
//remember - P and Q are generally CPU variables
int error_sum = 0;
this->A_ = (matrix::Csr*) A;
Expand Down Expand Up @@ -127,7 +129,7 @@ namespace ReSolve
n,
L_csr_->getNnz(),
descr_L_,
L_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
L_csr_->getValues(ReSolve::memory::DEVICE),
L_csr_->getRowData(ReSolve::memory::DEVICE),
L_csr_->getColData(ReSolve::memory::DEVICE),
info_L_,
Expand All @@ -140,7 +142,7 @@ namespace ReSolve
n,
U_csr_->getNnz(),
descr_U_,
U_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
U_csr_->getValues(ReSolve::memory::DEVICE),
U_csr_->getRowData(ReSolve::memory::DEVICE),
U_csr_->getColData(ReSolve::memory::DEVICE),
info_U_,
Expand All @@ -153,7 +155,7 @@ namespace ReSolve
n,
L_csr_->getNnz(),
descr_L_,
L_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
L_csr_->getValues(ReSolve::memory::DEVICE),
L_csr_->getRowData(ReSolve::memory::DEVICE),
L_csr_->getColData(ReSolve::memory::DEVICE),
info_L_,
Expand Down Expand Up @@ -185,11 +187,13 @@ namespace ReSolve
}

}
RESOLVE_RANGE_POP(__FUNCTION__);
return error_sum;
}

int LinSolverDirectRocSolverRf::refactorize()
{
RESOLVE_RANGE_PUSH(__FUNCTION__);
int error_sum = 0;
mem_.deviceSynchronize();
status_rocblas_ = rocsolver_dcsrrf_refactlu(workspace_->getRocblasHandle(),
Expand Down Expand Up @@ -228,13 +232,14 @@ namespace ReSolve
error_sum += status_rocblas_;

}

RESOLVE_RANGE_POP(__FUNCTION__);
return error_sum;
}

// solution is returned in RHS
int LinSolverDirectRocSolverRf::solve(vector_type* rhs)
{
RESOLVE_RANGE_PUSH(__FUNCTION__);
int error_sum = 0;
if (solve_mode_ == 0) {
mem_.deviceSynchronize();
Expand Down Expand Up @@ -290,11 +295,13 @@ namespace ReSolve
permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,rhs->getData(ReSolve::memory::DEVICE));
mem_.deviceSynchronize();
}
RESOLVE_RANGE_POP(__FUNCTION__);
return error_sum;
}

int LinSolverDirectRocSolverRf::solve(vector_type* rhs, vector_type* x)
{
RESOLVE_RANGE_PUSH(__FUNCTION__);
x->update(rhs->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::DEVICE);
x->setDataUpdated(ReSolve::memory::DEVICE);
int error_sum = 0;
Expand Down Expand Up @@ -355,6 +362,7 @@ namespace ReSolve
permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,x->getData(ReSolve::memory::DEVICE));
mem_.deviceSynchronize();
}
RESOLVE_RANGE_POP(__FUNCTION__);
return error_sum;
}

Expand Down Expand Up @@ -427,6 +435,5 @@ namespace ReSolve
Mshifts[static_cast<size_t>(row)]++;
}
}
//Mshifts.~vector();
}
}// namespace resolve
} // LinSolverDirectRocSolverRf::addFactors
} // namespace resolve
18 changes: 18 additions & 0 deletions resolve/Profiling.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#ifdef RESOLVE_USE_PROFILING

#ifdef RESOLVE_USE_HIP
#include <roctracer/roctx.h>
#define RESOLVE_RANGE_PUSH(x) roctxRangePush(x)
#define RESOLVE_RANGE_POP(x) roctxRangePop(); \
roctxMarkA(x)
#endif

#else

// Not using profiling
#define RESOLVE_RANGE_PUSH(x)
#define RESOLVE_RANGE_POP(x)

#endif // RESOLVE_USE_PROFILING
1 change: 1 addition & 0 deletions resolve/resolve_defs.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#cmakedefine RESOLVE_USE_RAJA
#cmakedefine RESOLVE_USE_EIGEN
#cmakedefine RESOLVE_USE_KLU
#cmakedefine RESOLVE_USE_PROFILING
#define RESOLVE_VERSION "@PROJECT_VERSION@"

#define RESOLVE_VERSION_MAJOR "@PROJECT_VERSION_MAJOR@"
Expand Down