Merge branch 'main' into workflow_changes

NVIDIA · Oct 16, 2024 · 44bcca2 · 44bcca2
2 parents 40bc50f + 949d444
commit 44bcca2
Show file tree

Hide file tree

Showing 12 changed files with 1,023 additions and 445 deletions.
diff --git a/docker/build/assets.Dockerfile b/docker/build/assets.Dockerfile
@@ -286,6 +286,7 @@ RUN cd /cuda-quantum && source scripts/configure_build.sh && \
         # The test is marked correctly as requiring nvcc, but since nvcc
         # is available during the build we need to filter it manually.
         filtered=" --filter-out MixedLanguage/cuda-1"; \
+	filtered+="|AST-Quake/calling_convention"; \
     fi && \
     "$LLVM_INSTALL_PREFIX/bin/llvm-lit" -v build/test \
         --param nvqpp_site_config=build/test/lit.site.cfg.py ${filtered} && \

diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
@@ -236,6 +236,10 @@ createMonotonicLoop(mlir::OpBuilder &builder, mlir::Location loc,
 
 bool hasHiddenSRet(mlir::FunctionType funcTy);
 
+/// Check a function to see if argument 0 has the `sret` attribute. Typically,
+/// one may find this on a host-side entry point function. Returns `false` when
+/// \p funcOp has no arguments or argument 0 carries no attribute dictionary.
+bool hasSRet(mlir::func::FuncOp funcOp);
+
 /// Convert the function type \p funcTy to a signature compatible with the code
 /// on the host side. This will add hidden arguments, such as the `this`
 /// pointer, convert some results to `sret` pointers, etc.
@@ -251,7 +255,8 @@ bool isX86_64(mlir::ModuleOp);
 bool isAArch64(mlir::ModuleOp);
 
 /// A small structure may be passed as two arguments on the host side. (e.g., on
-/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`.
+/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`. Note
+/// also, some small structs may be packed into a single register.
 bool structUsesTwoArguments(mlir::Type ty);
 
 std::optional<std::int64_t> getIntIfConstant(mlir::Value value);

diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp
@@ -18,6 +18,9 @@ using namespace mlir;
 
 namespace cudaq::opt {
 
+// The common small struct size limit, in bits, for the architectures cudaq
+// supports. It is compared against struct bit sizes when deciding whether a
+// struct may be packed into registers rather than passed through memory.
+static constexpr unsigned CommonSmallStructSize = 128;
+
 bool factory::isX86_64(ModuleOp module) {
   std::string triple;
   if (auto ta = module->getAttr(targetTripleAttrName))
@@ -302,33 +305,6 @@ cc::LoopOp factory::createMonotonicLoop(
   return loop;
 }
 
-// FIXME: some ABIs may return a small struct in registers rather than via an
-// sret pointer.
-//
-// On x86_64,
-//   pair of:  argument         return value    packed from msb to lsb
-//    i32   :   i64              i64             (second, first)
-//    i64   :   i64, i64         { i64, i64 }
-//    f32   :   <2 x float>      <2 x float>
-//    f64   :   double, double   { double, double }
-//
-// On aarch64,
-//   pair of:  argument         return value    packed from msb to lsb
-//    i32   :   i64              i64             (second, first)
-//    i64   :   [2 x i64]        [2 x i64]
-//    f32   :   [2 x float]      { float, float }
-//    f64   :   [2 x double]     { double, double }
-bool factory::hasHiddenSRet(FunctionType funcTy) {
-  // If a function has more than 1 result, the results are promoted to a
-  // structured return argument. Otherwise, if there is 1 result and it is an
-  // aggregate type, then it is promoted to a structured return argument.
-  auto numResults = funcTy.getNumResults();
-  return numResults > 1 ||
-         (numResults == 1 && funcTy.getResult(0)
-                                 .isa<cc::SpanLikeType, cc::StructType,
-                                      cc::ArrayType, cc::CallableType>());
-}
-
 cc::StructType factory::stlStringType(MLIRContext *ctx) {
   auto i8Ty = IntegerType::get(ctx, 8);
   auto ptrI8Ty = cc::PointerType::get(i8Ty);
@@ -361,8 +337,8 @@ Type factory::getSRetElementType(FunctionType funcTy) {
   auto *ctx = funcTy.getContext();
   if (funcTy.getNumResults() > 1)
     return cc::StructType::get(ctx, funcTy.getResults());
-  if (isa<cc::SpanLikeType>(funcTy.getResult(0)))
-    return getDynamicBufferType(ctx);
+  if (auto spanTy = dyn_cast<cc::SpanLikeType>(funcTy.getResult(0)))
+    return stlVectorType(spanTy.getElementType());
   return funcTy.getResult(0);
 }
 
@@ -403,33 +379,50 @@ static Type convertToHostSideType(Type ty) {
 // function tries to simulate GCC argument passing conventions. classify() also
 // has a number of FIXME comments, where it diverges from the referenced ABI.
 // Empirical evidence shows that on x86_64, integers and floats are packed in
-// integers of size 32 or 64 together, unless the float member fits by itself.
+// integers of size 8, 16, 24, 32 or 64 together, unless the float member fits
+// by itself.
 static bool shouldExpand(SmallVectorImpl<Type> &packedTys,
-                         cc::StructType structTy) {
+                         cc::StructType structTy, unsigned scaling = 8) {
   if (structTy.isEmpty())
     return false;
   auto *ctx = structTy.getContext();
   unsigned bits = 0;
+  const auto scaleBy = scaling - 1;
+  auto scaleBits = [&](unsigned size) {
+    if (size < 32)
+      size = (size + scaleBy) & ~scaleBy;
+    if (size > 32 && size <= 64)
+      size = 64;
+    return size;
+  };
 
   // First split the members into a "lo" set and a "hi" set.
   SmallVector<Type> set1;
   SmallVector<Type> set2;
   for (auto ty : structTy.getMembers()) {
     if (auto intTy = dyn_cast<IntegerType>(ty)) {
-      bits += intTy.getWidth();
-      if (bits <= 64)
+      auto addBits = scaleBits(intTy.getWidth());
+      if (bits + addBits <= 64) {
+        bits += addBits;
         set1.push_back(ty);
-      else
+      } else {
+        bits = std::max(bits, 64u) + addBits;
         set2.push_back(ty);
+      }
     } else if (auto fltTy = dyn_cast<FloatType>(ty)) {
-      bits += fltTy.getWidth();
-      if (bits <= 64)
+      auto addBits = fltTy.getWidth();
+      if (bits + addBits <= 64) {
+        bits += addBits;
         set1.push_back(ty);
-      else
+      } else {
+        bits = std::max(bits, 64u) + addBits;
         set2.push_back(ty);
+      }
     } else {
       return false;
     }
+    if (bits > CommonSmallStructSize)
+      return false;
   }
 
   // Process the sets. If the set has anything integral, use integer. If the set
@@ -441,28 +434,83 @@ static bool shouldExpand(SmallVectorImpl<Type> &packedTys,
         return true;
     return false;
   };
+  auto intSetSize = [&](auto theSet) {
+    unsigned size = 0;
+    for (auto ty : theSet)
+      size += scaleBits(ty.getIntOrFloatBitWidth());
+    return size;
+  };
   auto processMembers = [&](auto theSet, unsigned packIdx) {
     if (useInt(theSet)) {
-      packedTys[packIdx] = IntegerType::get(ctx, bits > 32 ? 64 : 32);
+      auto size = intSetSize(theSet);
+      if (size <= 32)
+        packedTys[packIdx] = IntegerType::get(ctx, size);
+      else
+        packedTys[packIdx] = IntegerType::get(ctx, 64);
     } else if (theSet.size() == 1) {
       packedTys[packIdx] = theSet[0];
     } else {
+      assert(theSet[0] == FloatType::getF32(ctx) && "must be float");
       packedTys[packIdx] =
           VectorType::get(ArrayRef<std::int64_t>{2}, theSet[0]);
     }
   };
   assert(!set1.empty() && "struct must have members");
   packedTys.resize(set2.empty() ? 1 : 2);
   processMembers(set1, 0);
-  if (!set2.empty())
-    processMembers(set2, 1);
+  if (set2.empty())
+    return false;
+  processMembers(set2, 1);
   return true;
 }
 
+/// Returns true if argument 0 of \p funcOp carries the LLVM dialect
+/// struct-return (`sret`) attribute.
+///
+/// Returns false when the function has no arguments or when argument 0 has no
+/// attribute dictionary.
+bool factory::hasSRet(func::FuncOp funcOp) {
+  // Only the first argument is inspected; later arguments are never checked.
+  if (funcOp.getNumArguments() > 0)
+    if (auto dict = funcOp.getArgAttrDict(0))
+      return dict.contains(LLVM::LLVMDialect::getStructRetAttrName());
+  return false;
+}
+
+// Determine whether the result(s) of \p funcTy must be returned through a
+// hidden structured return (`sret`) pointer argument on the host side.
+//
+// The tables below record how a pair of same-typed values is passed and
+// returned on each architecture.
+//
+// On x86_64,
+//   pair of:  argument         return value    packed from msb to lsb
+//    i32   :   i64              i64             (second, first)
+//    i64   :   i64, i64         { i64, i64 }
+//    f32   :   <2 x float>      <2 x float>
+//    f64   :   double, double   { double, double }
+//    ptr   :   ptr, ptr         { ptr, ptr }
+//
+// On aarch64,
+//   pair of:  argument         return value    packed from msb to lsb
+//    i32   :   i64              i64             (second, first)
+//    i64   :   [2 x i64]        [2 x i64]
+//    f32   :   [2 x float]      { float, float }
+//    f64   :   [2 x double]     { double, double }
+//    ptr   :   [2 x i64]        [2 x i64]
+bool factory::hasHiddenSRet(FunctionType funcTy) {
+  // No results means nothing to return, so no sret. More than 1 result is
+  // always promoted to a structured return argument. With exactly 1 result,
+  // span-like, array, and callable types are promoted to a structured return
+  // argument, while a struct is promoted only if it cannot be packed into
+  // registers. Any other single result type is returned directly.
+  auto numResults = funcTy.getNumResults();
+  if (numResults == 0)
+    return false;
+  if (numResults > 1)
+    return true;
+  auto resTy = funcTy.getResult(0);
+  if (resTy.isa<cc::SpanLikeType, cc::ArrayType, cc::CallableType>())
+    return true;
+  if (auto strTy = dyn_cast<cc::StructType>(resTy)) {
+    // shouldExpand() returns true only when the struct splits into two packed
+    // values. When it returns false but has filled in `packedTys`, the struct
+    // packs into a single value. Either way the struct travels in registers.
+    // NOTE(review): this uses shouldExpand()'s default scaling and does not
+    // consult the target, unlike the aarch64 path in toHostSideFuncType() --
+    // confirm that the two always agree.
+    SmallVector<Type> packedTys;
+    bool inRegisters = shouldExpand(packedTys, strTy) || !packedTys.empty();
+    return !inRegisters;
+  }
+  return false;
+}
+
 bool factory::structUsesTwoArguments(mlir::Type ty) {
   // Unchecked! This is only valid if target is X86-64.
   auto structTy = dyn_cast<cc::StructType>(ty);
-  if (!structTy || structTy.getBitSize() == 0 || structTy.getBitSize() > 128)
+  if (!structTy || structTy.getBitSize() == 0 ||
+      structTy.getBitSize() > CommonSmallStructSize)
     return false;
   SmallVector<Type> unused;
   return shouldExpand(unused, structTy);
@@ -478,6 +526,17 @@ static bool onlyArithmeticMembers(cc::StructType structTy) {
   return true;
 }
 
+// Return the bit width of the widest member of \p structTy, with a floor of 8
+// bits (the result is 8 for an empty struct or one whose members are all
+// narrower than 8 bits).
+// Unchecked precondition: structTy must be entirely arithmetic.
+static unsigned getLargestWidth(cc::StructType structTy) {
+  // Start at 8 so the result is never smaller than a byte.
+  unsigned largest = 8;
+  for (auto ty : structTy.getMembers()) {
+    // Relies on the precondition: every member is an integer or float type.
+    auto width = ty.getIntOrFloatBitWidth();
+    if (width > largest)
+      largest = width;
+  }
+  return largest;
+}
+
 // When the kernel comes from a class, there is always a default `this` argument
 // to the kernel entry function. The CUDA-Q spec doesn't allow the kernel
 // object to contain data members (yet), so we can ignore the `this` pointer.
@@ -486,18 +545,50 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
   auto *ctx = funcTy.getContext();
   SmallVector<Type> inputTys;
   bool hasSRet = false;
-  if (factory::hasHiddenSRet(funcTy)) {
-    // When the kernel is returning a std::vector<T> result, the result is
-    // returned via a sret argument in the first position. When this argument
-    // is added, the this pointer becomes the second argument. Both are opaque
-    // pointers at this point.
-    auto eleTy = convertToHostSideType(getSRetElementType(funcTy));
-    inputTys.push_back(cc::PointerType::get(eleTy));
-    hasSRet = true;
+  Type resultTy;
+  auto i64Ty = IntegerType::get(ctx, 64);
+  if (funcTy.getNumResults() == 1)
+    if (auto strTy = dyn_cast<cc::StructType>(funcTy.getResult(0)))
+      if (strTy.getBitSize() != 0 &&
+          strTy.getBitSize() <= CommonSmallStructSize) {
+        if (isX86_64(module)) {
+          // X86_64: Byte addressable scaling (packed registers). Default is a
+          // struct.
+          SmallVector<Type, 2> packedTys;
+          if (shouldExpand(packedTys, strTy) || !packedTys.empty()) {
+            if (packedTys.size() == 1)
+              resultTy = packedTys[0];
+            else
+              resultTy = cc::StructType::get(ctx, packedTys);
+          }
+        } else if (isAArch64(module) && onlyArithmeticMembers(strTy)) {
+          // AARCH64: Padded registers. Default is a two-element array.
+          unsigned largest = getLargestWidth(strTy);
+          SmallVector<Type, 2> packedTys;
+          if (shouldExpand(packedTys, strTy, largest) || !packedTys.empty()) {
+            if (packedTys.size() == 1)
+              resultTy = packedTys[0];
+            else
+              resultTy = cc::ArrayType::get(ctx, packedTys[0], 2);
+          }
+        }
+      }
+  if (!resultTy && funcTy.getNumResults()) {
+    if (factory::hasHiddenSRet(funcTy)) {
+      // When the kernel is returning a std::vector<T> result, the result is
+      // returned via a sret argument in the first position. When this argument
+      // is added, the this pointer becomes the second argument. Both are opaque
+      // pointers at this point.
+      auto eleTy = convertToHostSideType(getSRetElementType(funcTy));
+      inputTys.push_back(cc::PointerType::get(eleTy));
+      hasSRet = true;
+    } else {
+      assert(funcTy.getNumResults() == 1);
+      resultTy = funcTy.getResult(0);
+    }
   }
   // If this kernel is a plain old function or a static member function, we
   // don't want to add a hidden `this` argument.
-  auto i64Ty = IntegerType::get(ctx, 64);
   auto ptrTy = cc::PointerType::get(IntegerType::get(ctx, 8));
   if (addThisPtr)
     inputTys.push_back(ptrTy);
@@ -509,22 +600,33 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
       // On x86_64 and aarch64, a struct that is smaller than 128 bits may be
       // passed in registers as separate arguments. See classifyArgumentType()
       // in CodeGen/TargetInfo.cpp.
-      if (strTy.getBitSize() != 0 && strTy.getBitSize() <= 128) {
+      if (strTy.getBitSize() != 0 &&
+          strTy.getBitSize() <= CommonSmallStructSize) {
         if (isX86_64(module)) {
           SmallVector<Type, 2> packedTys;
           if (shouldExpand(packedTys, strTy)) {
             for (auto ty : packedTys)
               inputTys.push_back(ty);
             continue;
+          } else if (!packedTys.empty()) {
+            for (auto ty : packedTys)
+              inputTys.push_back(ty);
+            continue;
           }
         } else {
           assert(isAArch64(module) && "aarch64 expected");
           if (onlyArithmeticMembers(strTy)) {
             // Empirical evidence shows that on aarch64, arguments are packed
-            // into a single i64 or a [2 x i64] typed value based on the size of
-            // the struct. This is regardless of whether the value(s) are
-            // floating-point or not.
-            if (strTy.getBitSize() > 64)
+            // into a single i64 or a [2 x i64] typed value based on the size
+            // of the struct. The exception is when there are 2 elements and
+            // they are both float or both double.
+            if ((strTy.getMembers().size() == 2) &&
+                (strTy.getMember(0) == strTy.getMember(1)) &&
+                ((strTy.getMember(0) == Float32Type::get(ctx)) ||
+                 (strTy.getMember(0) == Float64Type::get(ctx))))
+              inputTys.push_back(
+                  cc::ArrayType::get(ctx, strTy.getMember(0), 2));
+            else if (strTy.getBitSize() > 64)
               inputTys.push_back(cc::ArrayType::get(ctx, i64Ty, 2));
             else
               inputTys.push_back(i64Ty);
@@ -542,8 +644,8 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
   // and it hasn't been converted to a hidden sret argument.
   if (funcTy.getNumResults() == 0 || hasSRet)
     return FunctionType::get(ctx, inputTys, {});
-  assert(funcTy.getNumResults() == 1);
-  return FunctionType::get(ctx, inputTys, funcTy.getResults());
+  assert(funcTy.getNumResults() == 1 && resultTy);
+  return FunctionType::get(ctx, inputTys, resultTy);
 }
 
 bool factory::isStdVecArg(Type type) {