From 3414c914b11e5fc116b501de8e78c98fcdfb023c Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpearson@users.noreply.github.com>
Date: Wed, 22 May 2024 10:03:40 -0600
Subject: [PATCH] SpMV: Test NaN, fix NaN handling when beta=0 (#2188)

* Test_Sparse_spmv_bsr.hpp: add NaNs to tests

* handle NaN in spmv_beta_transpose when beta=0

* handle nan in SpmvMergeHierarchical when beta=0

* Test NaNs in Y, don't reuse modifed Y, catch NaNs in results test

* remove unused <iostream> include

* explicit casting of zero

* Test_sparse_spmv.hpp: remove unused nans parameter

* KokkosSparse_spmv.hpp: CUDA11 can't detect this function always returns

* Test_Sparse_spmv.hpp: remove unused variable

* Run unit tests in correct execution space

* Test_Sparse_spmv.hpp: remove unused type aliases

* Kokkos::nan() -> KokkosKernels::Impl::quiet_NaN()
---
 common/impl/KokkosKernels_NaN.hpp            | 44 ++++++++++++
 sparse/impl/KokkosSparse_spmv_impl.hpp       | 18 +++--
 sparse/impl/KokkosSparse_spmv_impl_merge.hpp |  6 +-
 sparse/src/KokkosSparse_spmv.hpp             |  1 +
 sparse/unit_test/Test_Sparse_spmv.hpp        | 76 +++++++++++++++-----
 sparse/unit_test/Test_Sparse_spmv_bsr.hpp    | 62 +++++++++++++++-
 6 files changed, 178 insertions(+), 29 deletions(-)
 create mode 100644 common/impl/KokkosKernels_NaN.hpp
diff --git a/common/impl/KokkosKernels_NaN.hpp b/common/impl/KokkosKernels_NaN.hpp
new file mode 100644
index 0000000000..f319539a9f
--- /dev/null
+++ b/common/impl/KokkosKernels_NaN.hpp
@@ -0,0 +1,44 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSKERNELS_NAN_HPP
+#define KOKKOSKERNELS_NAN_HPP
+
+#include <Kokkos_ArithTraits.hpp>
+#include <Kokkos_NumericTraits.hpp>
+
+namespace KokkosKernels::Impl {
+
+// This could be constexpr if Kokkos::complex ctor was
+template <typename T>
+KOKKOS_INLINE_FUNCTION T quiet_NaN() {
+  if constexpr (std::is_same_v<double, T>) {
+    return double(Kokkos::Experimental::quiet_NaN_v<
+                  float>);  // Kokkos::Experimetnal::quiet_NaN_v<double>
+                            // is undefined in
+                            // device code
+  } else if constexpr (Kokkos::ArithTraits<T>::is_complex) {
+    using value_type = typename T::value_type;
+    return T(quiet_NaN<value_type>(),
+             quiet_NaN<value_type>());  // Kokkos::complex ctor is not constexpr
+  } else {
+    return Kokkos::Experimental::quiet_NaN_v<T>;
+  }
+}
+
+}  // namespace KokkosKernels::Impl
+
+#endif  // KOKKOSKERNELS_NAN_HPP
diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp
index 5f9cbea040..a2bb19a44c 100644
--- a/sparse/impl/KokkosSparse_spmv_impl.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl.hpp
@@ -450,8 +450,9 @@ static void spmv_beta_transpose(const execution_space& exec,
                                 const AMatrix& A, const XVector& x,
                                 typename YVector::const_value_type& beta,
                                 const YVector& y) {
-  using ordinal_type = typename AMatrix::non_const_ordinal_type;
-  using size_type    = typename AMatrix::non_const_size_type;
+  using ordinal_type  = typename AMatrix::non_const_ordinal_type;
+  using size_type     = typename AMatrix::non_const_size_type;
+  using y_scalar_type = typename YVector::non_const_value_type;
 
   if (A.numRows() <= static_cast<ordinal_type>(0)) {
     return;
@@ -459,7 +460,9 @@ static void spmv_beta_transpose(const execution_space& exec,
 
   // We need to scale y first ("scaling" by zero just means filling
   // with zeros), since the functor works by atomic-adding into y.
-  if (dobeta != 1) {
+  if (0 == dobeta || y_scalar_type(0) == beta) {
+    Kokkos::deep_copy(exec, y, y_scalar_type(0));
+  } else if (dobeta != 1) {
     KokkosBlas::scal(exec, y, beta, y);
   }
 
@@ -540,8 +543,9 @@ static void spmv_beta_transpose(const execution_space& exec,
                                 const AMatrix& A, const XVector& x,
                                 typename YVector::const_value_type& beta,
                                 const YVector& y) {
-  using ordinal_type = typename AMatrix::non_const_ordinal_type;
-  using size_type    = typename AMatrix::non_const_size_type;
+  using ordinal_type  = typename AMatrix::non_const_ordinal_type;
+  using size_type     = typename AMatrix::non_const_size_type;
+  using y_scalar_type = typename YVector::non_const_value_type;
 
   if (A.numRows() <= static_cast<ordinal_type>(0)) {
     return;
@@ -549,7 +553,9 @@ static void spmv_beta_transpose(const execution_space& exec,
 
   // We need to scale y first ("scaling" by zero just means filling
   // with zeros), since the functor works by atomic-adding into y.
-  if (dobeta != 1) {
+  if (0 == dobeta || y_scalar_type(0) == beta) {
+    Kokkos::deep_copy(exec, y, y_scalar_type(0));
+  } else if (dobeta != 1) {
     KokkosBlas::scal(exec, y, beta, y);
   }
 
diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
index c49519cc3a..1717d190be 100644
--- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
@@ -309,7 +309,11 @@ struct SpmvMergeHierarchical {
     static_assert(XVector::rank == 1, "");
     static_assert(YVector::rank == 1, "");
 
-    KokkosBlas::scal(y, beta, y);
+    if (y_value_type(0) == beta) {
+      Kokkos::deep_copy(space, y, y_value_type(0));
+    } else {
+      KokkosBlas::scal(space, y, beta, y);
+    }
 
     /* determine launch parameters for different architectures
        On architectures where there is a natural execution hierarchy with true
diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp
index f11b61f675..b59124df20 100644
--- a/sparse/src/KokkosSparse_spmv.hpp
+++ b/sparse/src/KokkosSparse_spmv.hpp
@@ -62,6 +62,7 @@ inline constexpr bool spmv_general_tpl_avail() {
       return spmv_mv_bsrmatrix_tpl_spec_avail<ExecutionSpace, Handle, AMatrix,
                                               XVector, YVector>::value;
   }
+  return false;
 }
 }  // namespace Impl
 
diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp
index 9afd941c93..2057a8ba14 100644
--- a/sparse/unit_test/Test_Sparse_spmv.hpp
+++ b/sparse/unit_test/Test_Sparse_spmv.hpp
@@ -23,6 +23,7 @@
 #include <KokkosKernels_IOUtils.hpp>
 #include <KokkosSparse_IOUtils.hpp>
 #include <KokkosKernels_Utils.hpp>
+#include <KokkosKernels_NaN.hpp>
 
 #include "KokkosKernels_default_types.hpp"
 
@@ -86,7 +87,10 @@ struct fSPMV {
   void operator()(const int i, value_type &err) const {
     const mag_type error = AT::abs(expected_y(i) - y(i));
 
-    if (error > eps * max_val) {
+    // only one is NaN or error is too large
+    if ((Kokkos::isnan(AT::abs(expected_y(i))) ^
+         Kokkos::isnan(AT::abs(y(i)))) ||
+        (error > eps * max_val)) {
       err++;
       Kokkos::printf("expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i,
                      AT::abs(expected_y(i)), i, AT::abs(y(i)), error,
@@ -116,6 +120,7 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
   using size_type_view_t = typename graph_t::row_map_type;
   using lno_view_t       = typename graph_t::entries_type;
   using scalar_view_t    = typename crsMat_t::values_type::non_const_type;
+  using y_scalar_t       = typename y_vector_type::non_const_value_type;
 
   using size_type = typename size_type_view_t::non_const_value_type;
   using lno_t     = typename lno_view_t::non_const_value_type;
@@ -145,7 +150,13 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
   lno_t nr = input_mat.numRows();
 
   // first, scale y by beta
-  for (size_t i = 0; i < h_y.extent(0); i++) h_y(i) *= beta;
+  for (size_t i = 0; i < h_y.extent(0); i++) {
+    if (beta == y_scalar_t(0)) {
+      h_y(i) = y_scalar_t(0);
+    } else {
+      h_y(i) *= beta;
+    }
+  }
 
   // then go through the matrix and accumulate the matrix-vector product
   for (lno_t row = 0; row < nr; ++row) {
@@ -184,17 +195,19 @@ void check_spmv(
 
   const y_value_mag_type eps =
       10 * Kokkos::ArithTraits<y_value_mag_type>::eps();
-  bool transposed = (mode == "T") || (mode == "H");
-  y_vector_type expected_y(
-      "expected", transposed ? input_mat.numCols() : input_mat.numRows());
+
+  y_vector_type actual_y("actual_y", y.extent(0));
+  y_vector_type expected_y("expected_y", y.extent(0));
   Kokkos::deep_copy(expected_y, y);
+  Kokkos::deep_copy(actual_y, y);
   Kokkos::fence();
 
   sequential_spmv(input_mat, x, expected_y, alpha, beta, mode);
   bool threw = false;
   std::string msg;
   try {
-    KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, y);
+    KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta,
+                       actual_y);
     Kokkos::fence();
   } catch (std::exception &e) {
     threw = true;
@@ -203,11 +216,11 @@ void check_spmv(
   ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 1D, mode " << mode
                       << ": threw exception:\n"
                       << msg << '\n';
+
   int num_errors = 0;
   Kokkos::parallel_reduce(
-      "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)),
-      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps, max_val),
-      num_errors);
+      "KokkosSparse::Test::spmv", my_exec_space(0, actual_y.extent(0)),
+      fSPMV(expected_y, actual_y, eps, max_val), num_errors);
   if (num_errors > 0)
     printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n",
            num_errors, y.extent_int(0), y_value_trait::abs(alpha),
@@ -266,10 +279,9 @@ void check_spmv_mv(
 
     auto y_spmv    = Kokkos::subview(y, Kokkos::ALL(), i);
     int num_errors = 0;
-    Kokkos::parallel_reduce(
-        "KokkosSparse::Test::spmv_mv", my_exec_space(0, y_i.extent(0)),
-        fSPMV<decltype(y_i), decltype(y_spmv)>(y_i, y_spmv, eps, max_val),
-        num_errors);
+    Kokkos::parallel_reduce("KokkosSparse::Test::spmv_mv",
+                            my_exec_space(0, y_i.extent(0)),
+                            fSPMV(y_i, y_spmv, eps, max_val), num_errors);
     if (num_errors > 0)
       std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors
                 << " errors of " << y_i.extent_int(0) << " for mv " << i
@@ -404,6 +416,7 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz,
   using mag_t         = typename Kokkos::ArithTraits<scalar_t>::mag_type;
   using handle_t =
       KokkosSparse::SPMVHandle<Device, crsMat_t, x_vector_type, y_vector_type>;
+  using y_policy = Kokkos::RangePolicy<typename y_vector_type::execution_space>;
 
   constexpr mag_t max_x   = static_cast<mag_t>(1);
   constexpr mag_t max_y   = static_cast<mag_t>(1);
@@ -419,18 +432,35 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz,
   const lno_t max_nnz_per_row =
       numRows ? (nnz / numRows + row_size_variance) : 0;
 
+  // Create vectors with and without nans
   x_vector_type input_x("x", nc);
-  y_vector_type output_y("y", nr);
   x_vector_type input_xt("x", nr);
-  y_vector_type output_yt("y", nc);
+  y_vector_type input_y("y", nr), input_y_nans("y_nans", nr);
+  y_vector_type input_yt("y", nc), input_yt_nans("y_nans", nc);
 
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
   Kokkos::fill_random(input_x, rand_pool, randomUpperBound<scalar_t>(max_x));
-  Kokkos::fill_random(output_y, rand_pool, randomUpperBound<scalar_t>(max_y));
+  Kokkos::fill_random(input_y, rand_pool, randomUpperBound<scalar_t>(max_y));
   Kokkos::fill_random(input_xt, rand_pool, randomUpperBound<scalar_t>(max_x));
-  Kokkos::fill_random(output_yt, rand_pool, randomUpperBound<scalar_t>(max_y));
+  Kokkos::fill_random(input_yt, rand_pool, randomUpperBound<scalar_t>(max_y));
+
+  // sprinkle in some nans
+  Kokkos::deep_copy(input_y_nans, input_y);
+  Kokkos::deep_copy(input_yt_nans, input_yt);
+  Kokkos::parallel_for(
+      y_policy(0, input_y_nans.extent(0)), KOKKOS_LAMBDA(const size_t i) {
+        if (0 == (i % 19)) {
+          input_y_nans(i) = KokkosKernels::Impl::quiet_NaN<scalar_t>();
+        }
+      });
+  Kokkos::parallel_for(
+      y_policy(0, input_yt_nans.extent(0)), KOKKOS_LAMBDA(const size_t i) {
+        if (0 == (i % 23)) {
+          input_yt_nans(i) = KokkosKernels::Impl::quiet_NaN<scalar_t>();
+        }
+      });
 
   // We also need to bound the values
   // in the matrix to bound the cancellations
@@ -457,8 +487,12 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz,
       for (double beta : testAlphaBeta) {
         mag_t max_error =
             beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
-        Test::check_spmv(&handle, input_mat, input_x, output_y, alpha, beta,
+        Test::check_spmv(&handle, input_mat, input_x, input_y, alpha, beta,
                          mode, max_error);
+        if (0 == beta) {
+          Test::check_spmv(&handle, input_mat, input_x, input_y_nans, alpha,
+                           beta, mode, max_error);
+        }
       }
     }
   }
@@ -468,8 +502,12 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz,
         // hoping the transpose won't have a long column...
         mag_t max_error =
             beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
-        Test::check_spmv(&handle, input_mat, input_xt, output_yt, alpha, beta,
+        Test::check_spmv(&handle, input_mat, input_xt, input_yt, alpha, beta,
                          mode, max_error);
+        if (0 == beta) {
+          Test::check_spmv(&handle, input_mat, input_x, input_yt_nans, alpha,
+                           beta, mode, max_error);
+        }
       }
     }
   }
diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp
index cb42f5c2e4..e9b23298f9 100644
--- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp
+++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp
@@ -41,6 +41,7 @@
 #include <KokkosKernels_IOUtils.hpp>
 #include <KokkosKernels_Utils.hpp>
 #include "KokkosKernels_default_types.hpp"
+#include <KokkosKernels_NaN.hpp>
 
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosSparse_BsrMatrix.hpp"
@@ -327,10 +328,15 @@ spmv_random(const char *mode, const int blockSize, const int blockRows,
 /*! \brief create random x and y multivectors for a given matrix and spmv mode
  */
 template <typename Bsr>
-auto random_vecs_for_spmv(const char *mode, const Bsr &a) {
+auto random_vecs_for_spmv(const char *mode, const Bsr &a,
+                          const bool nans = false)
+    -> std::tuple<typename VectorTypeFor<Bsr>::type,
+                  typename VectorTypeFor<Bsr>::type> {
   using scalar_type     = typename Bsr::non_const_value_type;
   using vector_type     = typename VectorTypeFor<Bsr>::type;
   using execution_space = typename Bsr::execution_space;
+  using policy_type =
+      Kokkos::RangePolicy<typename vector_type::execution_space>;
 
   size_t nx = a.numCols() * a.blockDim();
   size_t ny = a.numRows() * a.blockDim();
@@ -344,6 +350,21 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) {
   Kokkos::fill_random(x, random, max_x<scalar_type>());
   Kokkos::fill_random(y, random, max_y<scalar_type>());
 
+  if (nans) {
+    Kokkos::parallel_for(
+        policy_type(0, x.extent(0)), KOKKOS_LAMBDA(size_t i) {
+          if (0 == (i % 17)) {
+            x(i) = KokkosKernels::Impl::quiet_NaN<scalar_type>();
+          }
+        });
+    Kokkos::parallel_for(
+        policy_type(0, y.extent(0)), KOKKOS_LAMBDA(size_t i) {
+          if (0 == (i % 17)) {
+            y(i) = KokkosKernels::Impl::quiet_NaN<scalar_type>();
+          }
+        });
+  }
+
   return std::make_tuple(x, y);
 }
 
@@ -356,7 +377,8 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs,
   using scalar_type     = typename Bsr::non_const_value_type;
   using execution_space = typename Bsr::execution_space;
 
-  auto [x, y] = random_vecs_for_spmv(mode, a);
+  auto [x, y]                     = random_vecs_for_spmv(mode, a);
+  auto [x_with_nans, y_with_nans] = random_vecs_for_spmv(mode, a, true);
 
   using handle_t = SPMVHandle<execution_space, Bsr, decltype(x), decltype(y)>;
 
@@ -389,6 +411,10 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs,
       for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1),
                                scalar_type(-1.5)}) {
         test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y);
+        if (beta == scalar_type(0)) {
+          test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow,
+                    x_with_nans, y_with_nans);
+        }
       }
     }
   }
@@ -563,10 +589,14 @@ struct MultiVectorTypeFor {
  */
 template <typename Layout, typename Bsr>
 auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a,
-                                 const size_t numVecs) {
+                                 const size_t numVecs, const bool nans = false)
+    -> std::tuple<typename MultiVectorTypeFor<Layout, Bsr>::type,
+                  typename MultiVectorTypeFor<Layout, Bsr>::type> {
   using scalar_type     = typename Bsr::non_const_value_type;
   using vector_type     = typename MultiVectorTypeFor<Layout, Bsr>::type;
   using execution_space = typename Bsr::execution_space;
+  using policy_type =
+      Kokkos::RangePolicy<typename vector_type::execution_space>;
 
   size_t nx = a.numCols() * a.blockDim();
   size_t ny = a.numRows() * a.blockDim();
@@ -580,6 +610,26 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a,
   Kokkos::fill_random(x, random, max_x<scalar_type>());
   Kokkos::fill_random(y, random, max_y<scalar_type>());
 
+  // sprinkle some "random" NaNs in
+  if (nans) {
+    Kokkos::parallel_for(
+        policy_type(0, x.extent(0)), KOKKOS_LAMBDA(size_t i) {
+          for (size_t j = 0; j < x.extent(1); ++j) {
+            if (0 == ((i * x.extent(1) + j) % 13)) {
+              x(i, j) = KokkosKernels::Impl::quiet_NaN<scalar_type>();
+            }
+          }
+        });
+    Kokkos::parallel_for(
+        policy_type(0, y.extent(0)), KOKKOS_LAMBDA(size_t i) {
+          for (size_t j = 0; j < y.extent(1); ++j) {
+            if (0 == ((i * y.extent(1) + j) % 17)) {
+              y(i, j) = KokkosKernels::Impl::quiet_NaN<scalar_type>();
+            }
+          }
+        });
+  }
+
   return std::make_tuple(x, y);
 }
 
@@ -618,12 +668,18 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs,
 
   for (size_t numVecs : {1, 7}) {  // num multivecs
     auto [x, y] = random_multivecs_for_spm_mv<Layout>(mode, a, numVecs);
+    auto [x_with_nans, y_with_nans] =
+        random_multivecs_for_spm_mv<Layout>(mode, a, numVecs, true);
     for (handle_t *handle : handles) {
       for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1),
                                 scalar_type(3.7)}) {
         for (scalar_type beta : {scalar_type(0), scalar_type(1),
                                  scalar_type(-1), scalar_type(-1.5)}) {
           test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y);
+          if (beta == scalar_type(0)) {
+            test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow,
+                        x_with_nans, y_with_nans);
+          }
         }
       }
     }