From 21c0d6ba883e4654b0606f08d8e48d9638faea22 Mon Sep 17 00:00:00 2001 From: Daniel Richard G Date: Tue, 5 Mar 2024 17:05:29 -0500 Subject: [PATCH] Fix several integer-signedness warnings --- include/cute/numeric/integral_constant.hpp | 2 +- include/cute/numeric/math.hpp | 2 +- include/cutlass/arch/mma_sm60.h | 12 ++++++------ include/cutlass/arch/mma_sm61.h | 4 ++-- .../epilogue/warp/tile_iterator_tensor_op_mixed.h | 8 ++++---- include/cutlass/fast_math.h | 4 ++-- include/cutlass/gemm/kernel/tile_scheduler_params.h | 6 +++--- .../cutlass/gemm/threadblock/threadblock_swizzle.h | 10 ++++++++-- .../gemm/threadblock/threadblock_swizzle_streamk.h | 6 +++--- include/cutlass/gemm/warp/mma_tensor_op.h | 8 ++++---- include/cutlass/half.h | 4 ++-- include/cutlass/layout/permute.h | 8 ++++---- include/cutlass/predicate_vector.h | 4 ++-- test/unit/gemm/device/testbed.h | 2 +- test/unit/gemm/device/testbed_universal.h | 2 +- 15 files changed, 44 insertions(+), 38 deletions(-) diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp index 904a672638..be26aaaaed 100644 --- a/include/cute/numeric/integral_constant.hpp +++ b/include/cute/numeric/integral_constant.hpp @@ -451,7 +451,7 @@ template constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits) { if constexpr (sizeof...(Ts) == 0) { - return 10 * result + digit; + return 10 * result + uint64_t(digit); } else { return parse_int_digits(10 * result + digit, digits...); } diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp index 5be503390a..29bacb3693 100644 --- a/include/cute/numeric/math.hpp +++ b/include/cute/numeric/math.hpp @@ -151,7 +151,7 @@ bit_width(T x) { (numeric_limits::digits == 8 ? 3 : (assert(false),0))))); T r = 0; for (int i = N - 1; i >= 0; --i) { - T shift = (x > ((T(1) << (T(1) << i))-1)) << i; + T shift = T(x > ((T(1) << (T(1) << i))-1)) << i; x >>= shift; r |= shift; } diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h index 3e3c71ef36..8c414ee891 100644 --- a/include/cutlass/arch/mma_sm60.h +++ b/include/cutlass/arch/mma_sm60.h @@ -84,7 +84,7 @@ struct Mma< #else CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < 2; ++i) { + for (size_t i = 0; i < 2; ++i) { d[i] = a[i] * b[0] + c[i]; } #endif @@ -130,7 +130,7 @@ struct Mma< #else CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < 2; ++i) { + for (size_t i = 0; i < 2; ++i) { d[i] = a[0] * b[i] + c[i]; } #endif @@ -182,9 +182,9 @@ struct Mma < #else CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < 2; ++j) { + for (size_t j = 0; j < 2; ++j) { CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < 2; ++i) { + for (size_t i = 0; i < 2; ++i) { d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j]; } } @@ -236,9 +236,9 @@ struct Mma< D[1] = reinterpret_cast &>(Dhi); #else CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < 2; ++i) { + for (size_t i = 0; i < 2; ++i) { CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < 2; ++j) { + for (size_t j = 0; j < 2; ++j) { d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j]; } } diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h index 82a5aa7280..2373a1faad 100644 --- a/include/cutlass/arch/mma_sm61.h +++ b/include/cutlass/arch/mma_sm61.h @@ -82,7 +82,7 @@ struct Mma< d[0] = c[0]; CUTLASS_PRAGMA_UNROLL - for (int k = 0; k < 4; ++k) { + for (size_t k = 0; k < 4; ++k) { d[0] += a[k] * b[k]; } @@ -129,7 +129,7 @@ struct Mma< d[0] = c[0]; CUTLASS_PRAGMA_UNROLL - for (int k = 0; k < 2; ++k) { + for (size_t k = 0; k < 2; ++k) { d[0] += a[k] * b[k]; } #endif diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h index c512dd873b..40561715b2 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h @@ -384,7 +384,7 @@ class TileIteratorTensorOpMixed TensorRef const &ref, unsigned lane_id ): - stride_(ref.stride()[0] / AccessType::kElements) { + stride_(size_t(ref.stride()[0]) / AccessType::kElements) { int quad_id = (lane_id / Detail::kLanesInQuad); int lane_in_quad = (lane_id % Detail::kLanesInQuad); @@ -963,7 +963,7 @@ class TileIteratorTensorOpMixed TensorRef const &ref, unsigned lane_id ): - stride_(ref.stride()[0] / AccessType::kElements) { + stride_(size_t(ref.stride()[0]) / AccessType::kElements) { int quad_id = (lane_id / Detail::kLanesInQuad); int lane_in_quad = (lane_id % Detail::kLanesInQuad); diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index 84fb06def2..49308b9ff0 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -242,7 +242,7 @@ CUTLASS_CONSTEXPR_IF_CXX17 value_t find_log2(value_t x) { int a = int(31 - clz(x)); a += (x & (x - 1)) != 0; // Round up, add 1 if not a power of 2. - return a; + return value_t(a); } @@ -364,7 +364,7 @@ struct FastDivmod { FastDivmod(int divisor): divisor(divisor) { if (divisor != 1) { - unsigned int p = 31 + find_log2(divisor); + auto p = static_cast(31 + find_log2(divisor)); unsigned m = unsigned(((1ull << p) + unsigned(divisor) - 1) / unsigned(divisor)); multiplier = m; diff --git a/include/cutlass/gemm/kernel/tile_scheduler_params.h b/include/cutlass/gemm/kernel/tile_scheduler_params.h index 1630583f6c..f5d06638c4 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler_params.h +++ b/include/cutlass/gemm/kernel/tile_scheduler_params.h @@ -958,9 +958,9 @@ struct PersistentTileSchedulerSm90StreamKParams { uint32_t epilogue_subtile = 1, uint32_t num_accumulator_mtxs = 1) { - auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle); - problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m()); - problem_blocks.y = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n()); + auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(int(problem_blocks.x), int(problem_blocks.y), max_swizzle); + problem_blocks.x = unsigned(round_up(int(problem_blocks.x), (1 << log_swizzle_size) * cluster_shape.m())); + problem_blocks.y = unsigned(round_up(int(problem_blocks.y), (1 << log_swizzle_size) * cluster_shape.n())); // Workspace is needed only for output tiles that will be split. Thus, we first determine the number // of output tiles that will be split, and then calculate the workspace needed to cover these. diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h index 1a4948d077..9a5cdd6179 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h @@ -109,7 +109,10 @@ struct GemmIdentityThreadblockSwizzle { CUTLASS_HOST_DEVICE static dim3 get_grid_shape(GemmCoord tiled_shape) { int tile = 1 << get_log_tile(tiled_shape); - return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k()); + return dim3( + unsigned(tiled_shape.m() * tile), + unsigned((tiled_shape.n() + tile - 1) / tile), + unsigned(tiled_shape.k())); } /// Calculates optimal swizzle width @@ -301,7 +304,10 @@ struct GemmSplitKIdentityThreadblockSwizzle { CUTLASS_HOST_DEVICE static dim3 get_grid_shape(GemmCoord tiled_shape) { int tile = 1 << get_log_tile(tiled_shape); - return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k()); + return dim3( + unsigned(tiled_shape.m() * tile), + unsigned((tiled_shape.n() + tile - 1) / tile), + unsigned(tiled_shape.k())); } /// Obtains the threadblock offset (in units of threadblock-scoped tiles) diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h index b79e587d7c..3b43494e0a 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h @@ -437,9 +437,9 @@ struct ThreadblockSwizzleStreamK { batch_count); size_t problem_bytes = - (element_C_bytes_ * problem_size.m() * problem_size.n()) + - (element_A_bytes_ * problem_size.m() * problem_size.k()) + - (element_B_bytes_ * problem_size.k() * problem_size.n()); + (element_C_bytes_ * size_t(problem_size.m()) * size_t(problem_size.n())) + + (element_A_bytes_ * size_t(problem_size.m()) * size_t(problem_size.k())) + + (element_B_bytes_ * size_t(problem_size.k()) * size_t(problem_size.n())); size_t problem_flops = size_t(problem_size.m()) * size_t(problem_size.n()) * size_t(problem_size.k()) * 2; diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index b9212c43ab..9ac1522f75 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -99,8 +99,8 @@ struct ConvertAndPack { Array tmp; CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc)); + for (uint32_t i = 0; i < N; ++i) { + uint32_t idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc)); tmp[i] = source[idx]; } @@ -120,8 +120,8 @@ struct ConvertAndPack { Array tmp; CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc)); + for (uint32_t i = 0; i < N; ++i) { + uint32_t idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc)); tmp[i] = source[idx]; } diff --git a/include/cutlass/half.h b/include/cutlass/half.h index c203e6cb07..0835844c42 100644 --- a/include/cutlass/half.h +++ b/include/cutlass/half.h @@ -214,7 +214,7 @@ struct alignas(2) half_t { #endif uint16_t sign = uint16_t((s >> 16) & 0x8000); - int16_t exp = uint16_t(((s >> 23) & 0xff) - 127); + auto exp = int16_t(((s >> 23) & 0xff) - 127); int mantissa = s & 0x7fffff; uint16_t u = 0; @@ -238,7 +238,7 @@ struct alignas(2) half_t { if (exp >= -14) { // normal fp32 to normal fp16 - exp = uint16_t(exp + uint16_t(15)); + exp = static_cast(exp + 15); u = uint16_t(((exp & 0x1f) << 10)); u = uint16_t(u | (mantissa >> 13)); } else { diff --git a/include/cutlass/layout/permute.h b/include/cutlass/layout/permute.h index 912eb2c8cf..e66581cdac 100644 --- a/include/cutlass/layout/permute.h +++ b/include/cutlass/layout/permute.h @@ -318,7 +318,7 @@ class Tensor4DPermuteBMM0213RowMajor : public PermuteBase { LongIndex operator()(MatrixCoord coord) const { // The batch index for BMM - Index BMM_batch_idx = blockIdx.z; + auto BMM_batch_idx = Index(blockIdx.z); // [i,j,k,l] -> [i,k,j,l] Index l = coord.column(); @@ -381,7 +381,7 @@ class Tensor4DPermuteBMM0213RowMajorInverse : public PermuteBase { LongIndex operator()(MatrixCoord coord) const { // The batch index for BMM - Index BMM_batch_idx = blockIdx.z; + auto BMM_batch_idx = Index(blockIdx.z); // The following assumes grouping [(D0)->batch, (D2)->row, (D1,D3)->col] Index l = coord.column() % D3_; @@ -453,7 +453,7 @@ class Tensor4DPermuteBMM0321ColumnMajor : public PermuteBase { CUTLASS_HOST_DEVICE LongIndex operator()(MatrixCoord coord) const { - Index BMM_batch_idx = blockIdx.z; + auto BMM_batch_idx = Index(blockIdx.z); // [i,j,k,l] -> [i,k,j,l] Index l = coord.column(); @@ -514,7 +514,7 @@ class Tensor4DPermuteBMM0321ColumnMajorInverse : public PermuteBase { CUTLASS_HOST_DEVICE LongIndex operator()(MatrixCoord coord) const { - Index BMM_batch_idx = blockIdx.z; + auto BMM_batch_idx = Index(blockIdx.z); // The following assumes grouping [(D0)->batch, (D1,D2)->row, (D3)->col] Index l = coord.column(); diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h index aa4e3f1a12..b0f093d70a 100644 --- a/include/cutlass/predicate_vector.h +++ b/include/cutlass/predicate_vector.h @@ -159,8 +159,8 @@ struct PredicateVector { int byte = (idx / kPredicatesPerByte); int bit_offset = (idx % kPredicatesPerByte); - word = byte / sizeof(Storage); - int byte_offset = (byte % sizeof(Storage)); + word = byte / int(sizeof(Storage)); + int byte_offset = (byte % int(sizeof(Storage))); bit = byte_offset * 8 + bit_offset + kPredicateStart; } diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index c1599561a5..9cffd255c9 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -156,7 +156,7 @@ struct Testbed { else if (dist_kind == cutlass::Distribution::Sequential) { cutlass::reference::host::BlockFillSequential( - view.data(), view.capacity()); + view.data(), int64_t(view.capacity())); } else { EXPECT_TRUE(false) << "Not implemented"; diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index 8dc92db0e5..b9115efe2c 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -133,7 +133,7 @@ struct TestbedUniversal { else if (dist_kind == cutlass::Distribution::Sequential) { cutlass::reference::host::BlockFillSequential( - view.data(), view.capacity()); + view.data(), int64_t(view.capacity())); } else { EXPECT_TRUE(false) << "Not implemented";