From 21c0d6ba883e4654b0606f08d8e48d9638faea22 Mon Sep 17 00:00:00 2001
From: Daniel Richard G <skunk@iSKUNK.ORG>
Date: Tue, 5 Mar 2024 17:05:29 -0500
Subject: [PATCH] Fix several integer-signedness warnings

---
 include/cute/numeric/integral_constant.hpp           |  2 +-
 include/cute/numeric/math.hpp                        |  2 +-
 include/cutlass/arch/mma_sm60.h                      | 12 ++++++------
 include/cutlass/arch/mma_sm61.h                      |  4 ++--
 .../epilogue/warp/tile_iterator_tensor_op_mixed.h    |  8 ++++----
 include/cutlass/fast_math.h                          |  4 ++--
 include/cutlass/gemm/kernel/tile_scheduler_params.h  |  6 +++---
 .../cutlass/gemm/threadblock/threadblock_swizzle.h   | 10 ++++++++--
 .../gemm/threadblock/threadblock_swizzle_streamk.h   |  6 +++---
 include/cutlass/gemm/warp/mma_tensor_op.h            |  8 ++++----
 include/cutlass/half.h                               |  4 ++--
 include/cutlass/layout/permute.h                     |  8 ++++----
 include/cutlass/predicate_vector.h                   |  4 ++--
 test/unit/gemm/device/testbed.h                      |  2 +-
 test/unit/gemm/device/testbed_universal.h            |  2 +-
 15 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp
index 904a672638..be26aaaaed 100644
--- a/include/cute/numeric/integral_constant.hpp
+++ b/include/cute/numeric/integral_constant.hpp
@@ -451,7 +451,7 @@ template <class... Ts>
 constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits)
 {
   if constexpr (sizeof...(Ts) == 0) {
-    return 10 * result + digit;
+    return 10 * result + uint64_t(digit);
   } else {
     return parse_int_digits(10 * result + digit, digits...);
   }
diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp
index 5be503390a..29bacb3693 100644
--- a/include/cute/numeric/math.hpp
+++ b/include/cute/numeric/math.hpp
@@ -151,7 +151,7 @@ bit_width(T x) {
                     (numeric_limits<T>::digits ==  8 ? 3 : (assert(false),0)))));
   T r = 0;
   for (int i = N - 1; i >= 0; --i) {
-    T shift = (x > ((T(1) << (T(1) << i))-1)) << i;
+    T shift = T(x > ((T(1) << (T(1) << i))-1)) << i;
     x >>= shift;
     r  |= shift;
   }
diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h
index 3e3c71ef36..8c414ee891 100644
--- a/include/cutlass/arch/mma_sm60.h
+++ b/include/cutlass/arch/mma_sm60.h
@@ -84,7 +84,7 @@ struct Mma<
 
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
+    for (size_t i = 0; i < 2; ++i) {
       d[i] = a[i] * b[0] + c[i];
     }
 #endif
@@ -130,7 +130,7 @@ struct Mma<
 
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
+    for (size_t i = 0; i < 2; ++i) {
       d[i] = a[0] * b[i] + c[i];
     }
 #endif
@@ -182,9 +182,9 @@ struct Mma <
 
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < 2; ++j) {
+    for (size_t j = 0; j < 2; ++j) {
       CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < 2; ++i) {
+      for (size_t i = 0; i < 2; ++i) {
         d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j];
       }
     }
@@ -236,9 +236,9 @@ struct Mma<
     D[1] = reinterpret_cast<Array<half_t, 2> &>(Dhi);
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
+    for (size_t i = 0; i < 2; ++i) {
       CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < 2; ++j) {
+      for (size_t j = 0; j < 2; ++j) {
         d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j];
       }
     }
diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h
index 82a5aa7280..2373a1faad 100644
--- a/include/cutlass/arch/mma_sm61.h
+++ b/include/cutlass/arch/mma_sm61.h
@@ -82,7 +82,7 @@ struct Mma<
     d[0] = c[0];
 
     CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < 4; ++k) {
+    for (size_t k = 0; k < 4; ++k) {
       d[0] += a[k] * b[k];
     }
 
@@ -129,7 +129,7 @@ struct Mma<
     d[0] = c[0];
 
     CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < 2; ++k) {
+    for (size_t k = 0; k < 2; ++k) {
       d[0] += a[k] * b[k];
     }
 #endif
diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
index c512dd873b..40561715b2 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
@@ -384,7 +384,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputS
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) { 
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad); 
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
@@ -579,7 +579,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputS
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) { 
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad); 
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
@@ -774,7 +774,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 16, 8>
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) {
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad);
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
@@ -963,7 +963,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 8, 8>
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) {
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad);
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h
index 84fb06def2..49308b9ff0 100644
--- a/include/cutlass/fast_math.h
+++ b/include/cutlass/fast_math.h
@@ -242,7 +242,7 @@ CUTLASS_CONSTEXPR_IF_CXX17
 value_t find_log2(value_t x) {
   int a = int(31 - clz(x));
   a += (x & (x - 1)) != 0;  // Round up, add 1 if not a power of 2.
-  return a;
+  return value_t(a);
 }
 
 
@@ -364,7 +364,7 @@ struct FastDivmod {
   FastDivmod(int divisor): divisor(divisor) {
 
     if (divisor != 1) {
-      unsigned int p = 31 + find_log2(divisor);
+      auto p = static_cast<unsigned int>(31 + find_log2(divisor));
       unsigned m = unsigned(((1ull << p) + unsigned(divisor) - 1) / unsigned(divisor));
 
       multiplier = m;
diff --git a/include/cutlass/gemm/kernel/tile_scheduler_params.h b/include/cutlass/gemm/kernel/tile_scheduler_params.h
index 1630583f6c..f5d06638c4 100644
--- a/include/cutlass/gemm/kernel/tile_scheduler_params.h
+++ b/include/cutlass/gemm/kernel/tile_scheduler_params.h
@@ -958,9 +958,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
     uint32_t epilogue_subtile = 1,
     uint32_t num_accumulator_mtxs = 1) {
 
-    auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle);
-    problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    problem_blocks.y = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+    auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(int(problem_blocks.x), int(problem_blocks.y), max_swizzle);
+    problem_blocks.x = unsigned(round_up(int(problem_blocks.x), (1 << log_swizzle_size) * cluster_shape.m()));
+    problem_blocks.y = unsigned(round_up(int(problem_blocks.y), (1 << log_swizzle_size) * cluster_shape.n()));
 
     // Workspace is needed only for output tiles that will be split. Thus, we first determine the number
     // of output tiles that will be split, and then calculate the workspace needed to cover these.
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
index 1a4948d077..9a5cdd6179 100644
--- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h
+++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
@@ -109,7 +109,10 @@ struct GemmIdentityThreadblockSwizzle {
   CUTLASS_HOST_DEVICE
   static dim3 get_grid_shape(GemmCoord tiled_shape) {
     int tile = 1 << get_log_tile(tiled_shape);
-    return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k());
+    return dim3(
+      unsigned(tiled_shape.m() * tile),
+      unsigned((tiled_shape.n() + tile - 1) / tile),
+      unsigned(tiled_shape.k()));
   }
 
   /// Calculates optimal swizzle width
@@ -301,7 +304,10 @@ struct GemmSplitKIdentityThreadblockSwizzle {
   CUTLASS_HOST_DEVICE
   static dim3 get_grid_shape(GemmCoord tiled_shape) {
     int tile = 1 << get_log_tile(tiled_shape);
-    return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k());
+    return dim3(
+      unsigned(tiled_shape.m() * tile),
+      unsigned((tiled_shape.n() + tile - 1) / tile),
+      unsigned(tiled_shape.k()));
   }
 
   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
index b79e587d7c..3b43494e0a 100644
--- a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
+++ b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
@@ -437,9 +437,9 @@ struct ThreadblockSwizzleStreamK {
       batch_count);
 
     size_t problem_bytes =
-              (element_C_bytes_ * problem_size.m() * problem_size.n()) +
-              (element_A_bytes_ * problem_size.m() * problem_size.k()) +
-              (element_B_bytes_ * problem_size.k() * problem_size.n());
+              (element_C_bytes_ * size_t(problem_size.m()) * size_t(problem_size.n())) +
+              (element_A_bytes_ * size_t(problem_size.m()) * size_t(problem_size.k())) +
+              (element_B_bytes_ * size_t(problem_size.k()) * size_t(problem_size.n()));
 
     size_t problem_flops = size_t(problem_size.m()) * size_t(problem_size.n()) * size_t(problem_size.k()) * 2;
 
diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h
index b9212c43ab..9ac1522f75 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op.h
@@ -99,8 +99,8 @@ struct ConvertAndPack<bfloat16_t, float, N, Round> {
     Array<float, N> tmp;
 
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
+    for (uint32_t i = 0; i < N; ++i) {
+      uint32_t idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
       tmp[i] = source[idx];
     }
 
@@ -120,8 +120,8 @@ struct ConvertAndPack<half_t, float, N, Round> {
     Array<float, N> tmp;
 
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
+    for (uint32_t i = 0; i < N; ++i) {
+      uint32_t idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
       tmp[i] = source[idx];
     }
 
diff --git a/include/cutlass/half.h b/include/cutlass/half.h
index c203e6cb07..0835844c42 100644
--- a/include/cutlass/half.h
+++ b/include/cutlass/half.h
@@ -214,7 +214,7 @@ struct alignas(2) half_t {
     #endif
 
     uint16_t sign = uint16_t((s >> 16) & 0x8000);
-    int16_t exp = uint16_t(((s >> 23) & 0xff) - 127);
+    auto exp = int16_t(((s >> 23) & 0xff) - 127);
     int mantissa = s & 0x7fffff;
     uint16_t u = 0;
 
@@ -238,7 +238,7 @@ struct alignas(2) half_t {
 
     if (exp >= -14) {
       // normal fp32 to normal fp16
-      exp = uint16_t(exp + uint16_t(15));
+      exp = static_cast<decltype(exp)>(exp + 15);
       u = uint16_t(((exp & 0x1f) << 10));
       u = uint16_t(u | (mantissa >> 13));
     } else {
diff --git a/include/cutlass/layout/permute.h b/include/cutlass/layout/permute.h
index 912eb2c8cf..e66581cdac 100644
--- a/include/cutlass/layout/permute.h
+++ b/include/cutlass/layout/permute.h
@@ -318,7 +318,7 @@ class Tensor4DPermuteBMM0213RowMajor : public PermuteBase {
   LongIndex operator()(MatrixCoord coord) const {
 
     // The batch index for BMM
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // [i,j,k,l] -> [i,k,j,l]
     Index l = coord.column();
@@ -381,7 +381,7 @@ class Tensor4DPermuteBMM0213RowMajorInverse : public PermuteBase {
   LongIndex operator()(MatrixCoord coord) const {
 
     // The batch index for BMM
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // The following assumes grouping [(D0)->batch, (D2)->row, (D1,D3)->col]
     Index l = coord.column() % D3_;
@@ -453,7 +453,7 @@ class Tensor4DPermuteBMM0321ColumnMajor : public PermuteBase {
   CUTLASS_HOST_DEVICE
   LongIndex operator()(MatrixCoord coord) const {
 
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // [i,j,k,l] -> [i,k,j,l]
     Index l = coord.column();
@@ -514,7 +514,7 @@ class Tensor4DPermuteBMM0321ColumnMajorInverse : public PermuteBase {
   CUTLASS_HOST_DEVICE
   LongIndex operator()(MatrixCoord coord) const {
 
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // The following assumes grouping [(D0)->batch, (D1,D2)->row, (D3)->col]
     Index l = coord.column();
diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h
index aa4e3f1a12..b0f093d70a 100644
--- a/include/cutlass/predicate_vector.h
+++ b/include/cutlass/predicate_vector.h
@@ -159,8 +159,8 @@ struct PredicateVector {
     int byte = (idx / kPredicatesPerByte);
     int bit_offset = (idx % kPredicatesPerByte);
 
-    word = byte / sizeof(Storage);
-    int byte_offset = (byte % sizeof(Storage));
+    word = byte / int(sizeof(Storage));
+    int byte_offset = (byte % int(sizeof(Storage)));
 
     bit = byte_offset * 8 + bit_offset + kPredicateStart;
   }
diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h
index c1599561a5..9cffd255c9 100644
--- a/test/unit/gemm/device/testbed.h
+++ b/test/unit/gemm/device/testbed.h
@@ -156,7 +156,7 @@ struct Testbed {
     else if (dist_kind == cutlass::Distribution::Sequential) {
 
       cutlass::reference::host::BlockFillSequential(
-        view.data(), view.capacity());
+        view.data(), int64_t(view.capacity()));
     } 
     else {
       EXPECT_TRUE(false) << "Not implemented";
diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h
index 8dc92db0e5..b9115efe2c 100644
--- a/test/unit/gemm/device/testbed_universal.h
+++ b/test/unit/gemm/device/testbed_universal.h
@@ -133,7 +133,7 @@ struct TestbedUniversal {
     else if (dist_kind == cutlass::Distribution::Sequential) {
 
       cutlass::reference::host::BlockFillSequential(
-        view.data(), view.capacity());
+        view.data(), int64_t(view.capacity()));
     }
     else {
       EXPECT_TRUE(false) << "Not implemented";