Skip to content

Commit

Permalink
Fix ShyLU GPU/Serial issues
Browse files Browse the repository at this point in the history
A few asserts were indexing the wrong views (e.g. ltRowMap instead of ltRowMap_).
Many parallel_for calls were also using the default execution space even when
the class's ExecSpace was set to Serial, which caused GPU kernels to try to access Serial views.
  • Loading branch information
jgfouca committed Jan 29, 2024
1 parent 7536a9e commit 5b797dc
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 39 deletions.
45 changes: 23 additions & 22 deletions packages/shylu/shylu_node/fastilu/src/shylu_fastic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class FastICPrec
typedef typename OrdinalArray::host_mirror_type OrdinalArrayMirror;
typedef typename ScalarArray::host_mirror_type ScalarArrayMirror;

typedef Kokkos::RangePolicy<ExecSpace> RangePolicy;

using STS = Kokkos::ArithTraits<Scalar>;
using RTS = Kokkos::ArithTraits<Real>;

Expand Down Expand Up @@ -218,7 +220,7 @@ class FastICPrec

//Make sure all memory resides on the device
ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);

//Note that the following is a temporary measure
//to ensure that memory resides on the device.
Expand All @@ -229,9 +231,9 @@ class FastICPrec
MemoryPrimeFunctorNnzCsr<Ordinal, Scalar, ExecSpace> copyFunc3(lColIdx, lVal);

ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(nnzA, copyFunc2);
Kokkos::parallel_for(nnzL, copyFunc3);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nnzA), copyFunc2);
Kokkos::parallel_for(RangePolicy(0, nnzL), copyFunc3);
#endif
double t = timer.seconds();

Expand Down Expand Up @@ -301,7 +303,7 @@ class FastICPrec
ltVal_[rowPtrs[row]] = value;
ltColIdx_[rowPtrs[row]] = i;
rowPtrs[row]++;
assert(rowPtrs[row] <= ltRowMap[row + 1]);
assert(rowPtrs[row] <= ltRowMap_[row + 1]);
}
}
Kokkos::deep_copy(ltRowMap, ltRowMap_);
Expand Down Expand Up @@ -594,7 +596,7 @@ class FastICPrec
}
}
Kokkos::deep_copy(diagElems, diagElems_);
assert(lPtr == lRowMap[nRows]);
assert(lPtr == lRowMap_[nRows]);

if ((level > 0) && (guessFlag !=0))
{
Expand Down Expand Up @@ -703,22 +705,22 @@ class FastICPrec
{
ParScalFunctor<Ordinal, Scalar, Scalar, ExecSpace> parScal(x, y, diagElemsInv);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyD(ScalarArray &x, ScalarArray &y)
{
ParScalFunctor<Ordinal, Scalar, Real, ExecSpace> parScal(x, y, diagFact);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyLIC(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, lRowMap, lColIdx, lVal,
Expand All @@ -736,9 +738,9 @@ class FastICPrec
ExecSpace().fence();
for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}
return;
Expand All @@ -747,7 +749,7 @@ class FastICPrec
void applyLT(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, ltRowMap, ltColIdx, ltVal,
Expand All @@ -764,9 +766,9 @@ class FastICPrec
ExecSpace().fence();
for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}
return;
Expand All @@ -778,7 +780,7 @@ class FastICPrec
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(nRows, xOld);
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero1(nRows, y);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
//Transpose Jacobi implementation
JacobiIterFunctorT<Ordinal, Scalar, ExecSpace> jacIterT(nRows, lRowMap, lColIdx, lVal, x, y, xOld, onesVector);
Expand All @@ -788,11 +790,11 @@ class FastICPrec

for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(nRows, parInitZero1);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero1);
ExecSpace().fence();
Kokkos::parallel_for(nRows, jacIterT);
Kokkos::parallel_for(RangePolicy(0, nRows), jacIterT);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}

Expand Down Expand Up @@ -830,7 +832,7 @@ class FastICPrec
//Ordinal extent = aRowMap[nRows];
for (int i = 0; i < nFact; i++)
{
Kokkos::parallel_for(extent, icFunctor);
Kokkos::parallel_for(RangePolicy(0, extent), icFunctor);
}
ExecSpace().fence();

Expand Down Expand Up @@ -871,11 +873,10 @@ class FastICPrec

void apply(ScalarArray &x, ScalarArray &y)
{

Kokkos::Timer timer;
ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor(xTemp, x);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopyFunctor);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor);
ExecSpace().fence();

applyD(x, xTemp);
Expand All @@ -893,7 +894,7 @@ class FastICPrec

// ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor2(nRows, y, xTemp);
// ExecSpace().fence();
// Kokkos::parallel_for(nRows, parCopyFunctor2);
// Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor2);
// ExecSpace().fence();

double t = timer.seconds();
Expand Down
36 changes: 19 additions & 17 deletions packages/shylu/shylu_node/fastilu/src/shylu_fastildl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ class FastILDLPrec
typedef typename OrdinalArray::host_mirror_type OrdinalArrayMirror;
typedef typename ScalarArray::host_mirror_type ScalarArrayMirror;

typedef Kokkos::RangePolicy<ExecSpace> RangePolicy;

using STS = Kokkos::ArithTraits<Scalar>;
using RTS = Kokkos::ArithTraits<Real>;

Expand Down Expand Up @@ -221,7 +223,7 @@ class FastILDLPrec

//Make sure all memory resides on the device
ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);

//Note that the following is a temporary measure
//to ensure that memory resides on the device.
Expand All @@ -231,9 +233,9 @@ class FastILDLPrec
MemoryPrimeFunctorNnzCsr<Ordinal, Scalar, ExecSpace> copyFunc3(lColIdx, lVal);

ExecSpace().fence();
Kokkos::parallel_for(nRows, copyFunc1);
Kokkos::parallel_for(nnzA, copyFunc2);
Kokkos::parallel_for(nnzL, copyFunc3);
Kokkos::parallel_for(RangePolicy(0, nRows), copyFunc1);
Kokkos::parallel_for(RangePolicy(0, nnzA), copyFunc2);
Kokkos::parallel_for(RangePolicy(0, nnzL), copyFunc3);
#endif
double t = timer.seconds();
#ifdef FASTILDL_DEBUG_OUTPUT
Expand Down Expand Up @@ -304,7 +306,7 @@ class FastILDLPrec
ltVal_[rowPtrs[row]] = value;
ltColIdx_[rowPtrs[row]] = i;
rowPtrs[row]++;
assert(rowPtrs[row] <= ltRowMap[row + 1]);
assert(rowPtrs[row] <= ltRowMap_[row + 1]);
}
}
Kokkos::deep_copy(ltRowMap, ltRowMap_);
Expand Down Expand Up @@ -599,7 +601,7 @@ class FastILDLPrec
}
}
}
assert(lPtr == lRowMap[nRows]);
assert(lPtr == lRowMap_[nRows]);
Kokkos::deep_copy(diagElems, diagElems_);

if ((level > 0) && (guessFlag !=0))
Expand Down Expand Up @@ -709,22 +711,22 @@ class FastILDLPrec
{
ParScalFunctor<Ordinal, Scalar, Scalar, ExecSpace> parScal(x, y, diagElemsInv);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyD(ScalarArray &x, ScalarArray &y)
{
ParScalFunctor<Ordinal, Scalar, Real, ExecSpace> parScal(x, y, diagFact);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parScal);
Kokkos::parallel_for(RangePolicy(0, nRows), parScal);
ExecSpace().fence();

}
void applyLIC(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, lRowMap, lColIdx, lVal, x, y, xOld, onesVector);
Expand All @@ -740,9 +742,9 @@ class FastILDLPrec
ExecSpace().fence();
for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}
return;
Expand All @@ -751,7 +753,7 @@ class FastILDLPrec
void applyLT(ScalarArray &x, ScalarArray &y)
{
ParInitZeroFunctor<Ordinal, Scalar, ExecSpace> parInitZero(xOld);
Kokkos::parallel_for(nRows, parInitZero);
Kokkos::parallel_for(RangePolicy(0, nRows), parInitZero);
ExecSpace().fence();
#if 0
JacobiIterFunctor<Ordinal, Scalar, ExecSpace> jacIter(nRows, ltRowMap, ltColIdx, ltVal, x, y, xOld, onesVector);
Expand All @@ -768,9 +770,9 @@ class FastILDLPrec

for (Ordinal i = 0; i < nTrisol; i++)
{
Kokkos::parallel_for(extent, jacIter);
Kokkos::parallel_for(RangePolicy(0, extent), jacIter);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopy);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopy);
ExecSpace().fence();
}

Expand Down Expand Up @@ -809,7 +811,7 @@ class FastILDLPrec
//Ordinal extent = aRowMap[nRows];
for (int i = 0; i < nFact; i++)
{
Kokkos::parallel_for(extent, ildltFunctor);
Kokkos::parallel_for(RangePolicy(0, extent), ildltFunctor);
}
ExecSpace().fence();

Expand Down Expand Up @@ -854,7 +856,7 @@ class FastILDLPrec
Kokkos::Timer timer;
ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor(xTemp, x);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopyFunctor);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor);
ExecSpace().fence();

applyD(x, xTemp);
Expand All @@ -873,7 +875,7 @@ class FastILDLPrec

ParCopyFunctor<Ordinal, Scalar, ExecSpace> parCopyFunctor2(y, xTemp);
ExecSpace().fence();
Kokkos::parallel_for(nRows, parCopyFunctor2);
Kokkos::parallel_for(RangePolicy(0, nRows), parCopyFunctor2);
ExecSpace().fence();

double t = timer.seconds();
Expand Down

0 comments on commit 5b797dc

Please sign in to comment.