Skip to content

Commit

Permalink
Merge Pull Request #12703 from jgfouca/Trilinos/jgfouca/fix_shylu_gpu_issues
Browse files Browse the repository at this point in the history

Automatically Merged using Trilinos Pull Request AutoTester
PR Title: Fix Shylu GPU/Serial issues
PR Author: jgfouca
  • Loading branch information
trilinos-autotester authored Jan 29, 2024
2 parents 4ac842a + 82f945e commit 944fb88
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 41 deletions.
4 changes: 2 additions & 2 deletions packages/ifpack2/test/unit_tests/Ifpack2_SolverFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,10 @@ namespace {
"RILUK",
"MDF",
"RBILUK",
// "FAST_IC", // Fails when run with SerialNode in a build with DefaultNode=Cuda
"FAST_IC",
"FAST_ILU",
"FAST_ILU_B",
// "FAST_ILDL", // Fails when run with SerialNode in a build with DefaultNode=Cuda
"FAST_ILDL",
"BLOCK RELAXATION",
// "DATABASE SCHWARZ", // Skipping because it fails
"SPARSE_BLOCK_RELAXATION"
Expand Down
45 changes: 23 additions & 22 deletions packages/shylu/shylu_node/fastilu/src/shylu_fastic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class FastICPrec
typedef typename OrdinalArray::host_mirror_type OrdinalArrayMirror;
typedef typename ScalarArray::host_mirror_type ScalarArrayMirror;

typedef Kokkos::RangePolicy<ExecSpace> RangePolicy;

using STS = Kokkos::ArithTraits<Scalar>;
using RTS = Kokkos::ArithTraits<Real>;

Expand Down Expand Up @@ -218,7 +220,7 @@ class FastICPrec

//Make sure all memory resides on the device
ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);

//Note that the following is a temporary measure
//to ensure that memory resides on the device.
Expand All @@ -229,9 +231,9 @@ class FastICPrec
MemoryPrimeFunctorNnzCsr<Ordinal, Scalar, ExecSpace> copyFunc3(lColIdx, lVal);

ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(nnzA, copyFunc2);
Kokkos::parallel_for(nnzL, copyFunc3);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nnzA), copyFunc2);
Kokkos::parallel_for(RangePolicy(0, nnzL), copyFunc3);
#endif
double t = timer.seconds();

Expand Down Expand Up @@ -301,7 +303,7 @@ class FastICPrec
ltVal_[rowPtrs[row]] = value;
ltColIdx_[rowPtrs[row]] = i;
rowPtrs[row]++;
assert(rowPtrs[row] <= ltRowMap[row + 1]);
assert(rowPtrs[row] <= ltRowMap_[row + 1]);
}
}
Kokkos::deep_copy(ltRowMap, ltRowMap_);
Expand Down Expand Up @@ -594,7 +596,7 @@ class FastICPrec
}
}
Kokkos::deep_copy(diagElems, diagElems_);
assert(lPtr == lRowMap[nRows]);
assert(lPtr == lRowMap_[nRows]);

if ((level > 0) && (guessFlag !=0))
{
Expand Down Expand Up @@ -703,22 +705,22 @@ class FastICPrec
{
ParScalFunctor<Ordinal, Scalar, Scalar, ExecSpace> parScal(x, y, diagElemsInv);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyD(ScalarArray &x, ScalarArray &y)
{
ParScalFunctor<Ordinal, Scalar, Real, ExecSpace> parScal(x, y, diagFact);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyLIC(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, lRowMap, lColIdx, lVal,
Expand All @@ -736,9 +738,9 @@ class FastICPrec
ExecSpace().fence();
for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}
return;
Expand All @@ -747,7 +749,7 @@ class FastICPrec
void applyLT(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, ltRowMap, ltColIdx, ltVal,
Expand All @@ -764,9 +766,9 @@ class FastICPrec
ExecSpace().fence();
for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}
return;
Expand All @@ -778,7 +780,7 @@ class FastICPrec
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(nRows, xOld);
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero1(nRows, y);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
//Transpose Jacobi implementation
JacobiIterFunctorT<Ordinal, Scalar, ExecSpace> jacIterT(nRows, lRowMap, lColIdx, lVal, x, y, xOld, onesVector);
Expand All @@ -788,11 +790,11 @@ class FastICPrec

for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(nRows, parInitZero1);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero1);
ExecSpace().fence();
Kokkos::parallel_for(nRows, jacIterT);
Kokkos::parallel_for(RangePolicy(0, nRows), jacIterT);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}

Expand Down Expand Up @@ -830,7 +832,7 @@ class FastICPrec
//Ordinal extent = aRowMap[nRows];
for (int i = 0; i < nFact; i++)
{
Kokkos::parallel_for(extent, icFunctor);
Kokkos::parallel_for(RangePolicy(0, extent), icFunctor);
}
ExecSpace().fence();

Expand Down Expand Up @@ -871,11 +873,10 @@ class FastICPrec

void apply(ScalarArray &x, ScalarArray &y)
{

Kokkos::Timer timer;
ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor(xTemp, x);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopyFunctor);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor);
ExecSpace().fence();

applyD(x, xTemp);
Expand All @@ -893,7 +894,7 @@ class FastICPrec

// ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor2(nRows, y, xTemp);
// ExecSpace().fence();
// Kokkos::parallel_for(nRows, parCopyFunctor2);
// Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor2);
// ExecSpace().fence();

double t = timer.seconds();
Expand Down
36 changes: 19 additions & 17 deletions packages/shylu/shylu_node/fastilu/src/shylu_fastildl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ class FastILDLPrec
typedef typename OrdinalArray::host_mirror_type OrdinalArrayMirror;
typedef typename ScalarArray::host_mirror_type ScalarArrayMirror;

typedef Kokkos::RangePolicy<ExecSpace> RangePolicy;

using STS = Kokkos::ArithTraits<Scalar>;
using RTS = Kokkos::ArithTraits<Real>;

Expand Down Expand Up @@ -221,7 +223,7 @@ class FastILDLPrec

//Make sure all memory resides on the device
ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);

//Note that the following is a temporary measure
//to ensure that memory resides on the device.
Expand All @@ -231,9 +233,9 @@ class FastILDLPrec
MemoryPrimeFunctorNnzCsr<Ordinal, Scalar, ExecSpace> copyFunc3(lColIdx, lVal);

ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(nnzA, copyFunc2);
Kokkos::parallel_for(nnzL, copyFunc3);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nnzA), copyFunc2);
Kokkos::parallel_for(RangePolicy(0, nnzL), copyFunc3);
#endif
double t = timer.seconds();
#ifdef FASTILDL_DEBUG_OUTPUT
Expand Down Expand Up @@ -304,7 +306,7 @@ class FastILDLPrec
ltVal_[rowPtrs[row]] = value;
ltColIdx_[rowPtrs[row]] = i;
rowPtrs[row]++;
assert(rowPtrs[row] <= ltRowMap[row + 1]);
assert(rowPtrs[row] <= ltRowMap_[row + 1]);
}
}
Kokkos::deep_copy(ltRowMap, ltRowMap_);
Expand Down Expand Up @@ -599,7 +601,7 @@ class FastILDLPrec
}
}
}
assert(lPtr == lRowMap[nRows]);
assert(lPtr == lRowMap_[nRows]);
Kokkos::deep_copy(diagElems, diagElems_);

if ((level > 0) && (guessFlag !=0))
Expand Down Expand Up @@ -709,22 +711,22 @@ class FastILDLPrec
{
ParScalFunctor<Ordinal, Scalar, Scalar, ExecSpace> parScal(x, y, diagElemsInv);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyD(ScalarArray &x, ScalarArray &y)
{
ParScalFunctor<Ordinal, Scalar, Real, ExecSpace> parScal(x, y, diagFact);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyLIC(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, lRowMap, lColIdx, lVal, x, y, xOld, onesVector);
Expand All @@ -740,9 +742,9 @@ class FastILDLPrec
ExecSpace().fence();
for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}
return;
Expand All @@ -751,7 +753,7 @@ class FastILDLPrec
void applyLT(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, ltRowMap, ltColIdx, ltVal, x, y, xOld, onesVector);
Expand All @@ -768,9 +770,9 @@ class FastILDLPrec

for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}

Expand Down Expand Up @@ -809,7 +811,7 @@ class FastILDLPrec
//Ordinal extent = aRowMap[nRows];
for (int i = 0; i < nFact; i++)
{
Kokkos::parallel_for(extent, ildltFunctor);
Kokkos::parallel_for(RangePolicy(0, extent), ildltFunctor);
}
ExecSpace().fence();

Expand Down Expand Up @@ -854,7 +856,7 @@ class FastILDLPrec
Kokkos::Timer timer;
ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor(xTemp, x);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopyFunctor);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor);
ExecSpace().fence();

applyD(x, xTemp);
Expand All @@ -873,7 +875,7 @@ class FastILDLPrec

ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor2(y, xTemp);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopyFunctor2);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor2);
ExecSpace().fence();

double t = timer.seconds();
Expand Down

0 comments on commit 944fb88

Please sign in to comment.