From 56fbaf956f35d3fa7cfa0c151d1dcf4e554af9cd Mon Sep 17 00:00:00 2001
From: Harald van Dijk
Date: Sat, 18 Jan 2025 01:08:35 +0000
Subject: [PATCH] [compiler] Do not mix kernels with different sub-group sizes.

Per the OpenCL 3.0 API spec, section 3.2.1 "Mapping Work-Items onto an
NDRange", all sub-groups within a work-group must be the same size, apart
from the sub-group with the maximum index, which may be smaller if the size
of the work-group is not evenly divisible by the size of the sub-groups.

We were not meeting this requirement: in cases where we would not or could
not generate a predicated vectorized kernel, we would execute the scalar
kernel in a loop for any remaining work-items, possibly resulting in
multiple sub-groups that are smaller than the maximum sub-group size.

To avoid this situation, we need to avoid mixing vector and scalar kernels
if those kernels use different sub-group sizes. If we can handle all items
with vector kernels, possibly with predication, continue to do so. If the
vector and scalar kernels do not depend on the sub-group size, also continue
to handle this as before. If the vector and scalar kernels do depend on the
sub-group size, and the vector kernel cannot handle all work-items, switch
to the scalar kernel for all work-items.

This includes a small optimization: if we know the kernel does not use
sub-group information, we avoid setting sub-group IDs.

This includes one change to createLoop which permits nullptr PHIs. They are
skipped over, and are useful because PHIs must be referred to by index in
the callback function; this allows indices to remain constant even when the
caller has multiple optional PHIs.

This also includes one bugfix to ControlFlowConversionPass to fix a crash
seen now, where we used the result of createMasked{Load,Store} before
checking whether it succeeded.

This also includes one improvement to CompileKernelToBin.cmake: if the
executed command fails, it is now printed in a format that can be copied
and pasted.
---
 .../source/pass_functions.cpp               |   4 +
 .../source/work_item_loops_pass.cpp         | 321 ++++++++++--------
 .../test/lit/passes/barriers-cfg-reduce.ll  |   4 +-
 .../test/lit/passes/subgroup-loop-unroll.ll |  18 +-
 .../control_flow_conversion_pass.cpp        |   3 +-
 .../UnitCL/cmake/CompileKernelToBin.cmake   |  21 +-
 6 files changed, 196 insertions(+), 175 deletions(-)

diff --git a/modules/compiler/compiler_pipeline/source/pass_functions.cpp b/modules/compiler/compiler_pipeline/source/pass_functions.cpp
index 1287e1009..d13a17515 100644
--- a/modules/compiler/compiler_pipeline/source/pass_functions.cpp
+++ b/modules/compiler/compiler_pipeline/source/pass_functions.cpp
@@ -519,6 +519,9 @@ llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
 
   // Set up all of our user PHIs
   for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    // For convenience to callers, permit nullptr and skip over it.
+ if (!currIVs[i]) continue; + auto *const phi = loopIR.CreatePHI(currIVs[i]->getType(), 2); llvm::cast(phi)->addIncoming(currIVs[i], entryIR.GetInsertBlock()); @@ -542,6 +545,7 @@ llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit, // Update all of our PHIs for (unsigned i = 0, e = currIVs.size(); i != e; i++) { + if (!currIVs[i]) continue; llvm::cast(currIVs[i])->addIncoming(nextIVs[i], latch); } diff --git a/modules/compiler/compiler_pipeline/source/work_item_loops_pass.cpp b/modules/compiler/compiler_pipeline/source/work_item_loops_pass.cpp index 0fa90f90e..6ea8d484f 100644 --- a/modules/compiler/compiler_pipeline/source/work_item_loops_pass.cpp +++ b/modules/compiler/compiler_pipeline/source/work_item_loops_pass.cpp @@ -203,8 +203,8 @@ struct ScheduleGenerator { AllocaInst *nextID = nullptr; Value *mainLoopLimit = nullptr; Value *peel = nullptr; + bool noExplicitSubgroups = false; bool emitTail = true; - bool isVectorPredicated = false; bool wrapperHasMain = false; bool wrapperHasTail = false; @@ -726,8 +726,10 @@ struct ScheduleGenerator { mainPreheaderBB->moveAfter(block); mainExitBB->moveAfter(mainPreheaderBB); - subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB); - subgroupMergePhi->addIncoming(i32Zero, block); + if (!noExplicitSubgroups) { + subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB); + subgroupMergePhi->addIncoming(i32Zero, block); + } auto *const needMain = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, zero, @@ -744,7 +746,9 @@ struct ScheduleGenerator { wrapperHasMain = true; // Subgroup induction variables compiler::utils::CreateLoopOpts outer_opts; - outer_opts.IVs = {i32Zero}; + if (!noExplicitSubgroups) { + outer_opts.IVs = {i32Zero}; + } // looping through num groups in the third (outermost) dimension mainExitBB = compiler::utils::createLoop( @@ -788,32 +792,41 @@ struct ScheduleGenerator { MutableArrayRef ivsNext0) -> BasicBlock * { IRBuilder<> ir(block); - // set our subgroup id - ir.CreateCall(set_subgroup_id, {ivs0[0]}) - ->setCallingConv(set_subgroup_id->getCallingConv()); + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } createWorkItemLoopBody(barrierMain, ir, block, barrierID, dim_0, dim_1, dim_2, accum, VF); - nextSubgroupIV = - ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); - ivsNext0[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); + ivsNext0[0] = nextSubgroupIV; + } return block; }); - // Don't forget to update the subgroup IV phi. - ivsNext1[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } return exit0; }); - // Don't forget to update the subgroup IV phi. - ivsNext2[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext2[0] = nextSubgroupIV; - if (subgroupMergePhi) { - subgroupMergePhi->addIncoming(nextSubgroupIV, exit1); + if (subgroupMergePhi) { + subgroupMergePhi->addIncoming(nextSubgroupIV, exit1); + } } return exit1; @@ -861,7 +874,9 @@ struct ScheduleGenerator { wrapperHasTail = true; // Subgroup induction variables compiler::utils::CreateLoopOpts outer_opts; - outer_opts.IVs = {subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV}; + if (!noExplicitSubgroups) { + outer_opts.IVs = {subgroupMergePhi ? 
subgroupMergePhi : nextSubgroupIV}; + } // looping through num groups in the third (outermost) dimension tailExitBB = compiler::utils::createLoop( @@ -899,7 +914,7 @@ struct ScheduleGenerator { MutableArrayRef ivsNext0) -> BasicBlock * { IRBuilder<> ir(block); - if (set_subgroup_id) { + if (!noExplicitSubgroups) { // set our subgroup id ir.CreateCall(set_subgroup_id, {ivs0[0]}) ->setCallingConv( @@ -910,21 +925,27 @@ struct ScheduleGenerator { *barrierTail, ir, block, barrierID, dim_0, dim_1, dim_2, accum, /*VF*/ nullptr, mainLoopLimit); - nextSubgroupIV = - ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); - ivsNext0[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); + ivsNext0[0] = nextSubgroupIV; + } return block; }); - // Don't forget to update the subgroup IV phi. - ivsNext1[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } return exit0; }); - // Don't forget to update the subgroup IV phi. - ivsNext2[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext2[0] = nextSubgroupIV; + } return exit1; }); @@ -960,11 +981,11 @@ struct ScheduleGenerator { // The subgroup induction variable, set to the value of the subgroup ID at // the end of the last loop (i.e. beginning of the next loop) - Value *nextSubgroupIV = i32Zero; + Value *nextSubgroupIV = noExplicitSubgroups ? nullptr : i32Zero; // The work-group scan induction variable, set to the current scan value at // the end of the last loop (i.e. beginning of the next loop) - Value *nextScanIV = accum; + Value *nextScanIV = isScan ? accum : nullptr; // We need to ensure any subgroup IV is defined on the path in which // the vector loop is skipped. @@ -973,12 +994,8 @@ struct ScheduleGenerator { PHINode *scanMergePhi = nullptr; compiler::utils::CreateLoopOpts outer_opts; - outer_opts.IVs.push_back(i32Zero); - outer_opts.loopIVNames.push_back("sg.z"); - if (isScan) { - outer_opts.IVs.push_back(nextScanIV); - outer_opts.loopIVNames.push_back("scan.z"); - } + outer_opts.IVs = {nextSubgroupIV, nextScanIV}; + outer_opts.loopIVNames = {"sg.z", "scan.z"}; // looping through num groups in the third (outermost) dimension return compiler::utils::createLoop( @@ -993,10 +1010,7 @@ struct ScheduleGenerator { compiler::utils::CreateLoopOpts middle_opts; middle_opts.IVs = ivs2.vec(); - middle_opts.loopIVNames.push_back("sg.y"); - if (isScan) { - middle_opts.loopIVNames.push_back("scan.y"); - } + middle_opts.loopIVNames = {"sg.y", "scan.y"}; // looping through num groups in the second dimension BasicBlock *exit1 = compiler::utils::createLoop( @@ -1023,7 +1037,9 @@ struct ScheduleGenerator { // No main iterations at all! 
mainPreheaderBB = nullptr; mainExitBB = block; - nextSubgroupIV = ivs1[0]; + if (!noExplicitSubgroups) { + nextSubgroupIV = ivs1[0]; + } if (isScan) { nextScanIV = ivs1[1]; } @@ -1037,9 +1053,11 @@ struct ScheduleGenerator { mainPreheaderBB->moveAfter(block); mainExitBB->moveAfter(mainPreheaderBB); - subgroupMergePhi = - PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB); - subgroupMergePhi->addIncoming(ivs1[0], block); + if (!noExplicitSubgroups) { + subgroupMergePhi = + PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB); + subgroupMergePhi->addIncoming(ivs1[0], block); + } if (isScan) { scanMergePhi = PHINode::Create(accum->getType(), 2, @@ -1072,10 +1090,7 @@ struct ScheduleGenerator { compiler::utils::CreateLoopOpts inner_vf_opts; inner_vf_opts.indexInc = VF; inner_vf_opts.IVs = ivs1.vec(); - inner_vf_opts.loopIVNames.push_back("sg.x.main"); - if (isScan) { - inner_vf_opts.loopIVNames.push_back("scan.y.main"); - } + inner_vf_opts.loopIVNames = {"sg.x.main", "scan.x.main"}; mainExitBB = compiler::utils::createLoop( mainPreheaderBB, mainExitBB, zero, mainLoopLimit, @@ -1085,7 +1100,7 @@ struct ScheduleGenerator { MutableArrayRef ivsNext0) -> BasicBlock * { IRBuilder<> ir(block); - if (set_subgroup_id) { + if (!noExplicitSubgroups) { // set our subgroup id ir.CreateCall(set_subgroup_id, {ivs0[0]}) ->setCallingConv( @@ -1112,10 +1127,12 @@ struct ScheduleGenerator { barrierID, dim_0, dim_1, dim_2, accum, VF); - nextSubgroupIV = - ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1), - "sg.x.main.inc"); - ivsNext0[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1), + "sg.x.main.inc"); + ivsNext0[0] = nextSubgroupIV; + } // Move the exit after the loop block, as it reads more // logically. @@ -1129,10 +1146,12 @@ struct ScheduleGenerator { if (subgroupMergePhi) { subgroupMergePhi->addIncoming(nextSubgroupIV, mainLoopBB); + nextSubgroupIV = subgroupMergePhi; } if (scanMergePhi) { scanMergePhi->addIncoming(nextScanIV, mainLoopBB); + nextScanIV = scanMergePhi; } } assert(mainExitBB && "didn't create a loop exit block!"); @@ -1178,17 +1197,13 @@ struct ScheduleGenerator { assert(barrierTail); wrapperHasTail = true; // Subgroup induction variables - SmallVector subgroupIVs0 = { - subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV}; - if (isScan) { - subgroupIVs0.push_back(scanMergePhi ? 
scanMergePhi - : nextScanIV); - } + SmallVector subgroupIVs0 = {nextSubgroupIV, + nextScanIV}; BasicBlock *tailLoopBB = nullptr; if (barrierTail->getVFInfo().IsVectorPredicated) { IRBuilder<> ir(tailPreheaderBB); - if (set_subgroup_id) { + if (!noExplicitSubgroups) { // set our subgroup id ir.CreateCall(set_subgroup_id, {subgroupIVs0[0]}) ->setCallingConv(set_subgroup_id->getCallingConv()); @@ -1215,9 +1230,12 @@ struct ScheduleGenerator { barrierID, zero, dim_1, dim_2, accum, /*VF*/ nullptr, mainLoopLimit); - nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0], - ConstantInt::get(i32Ty, 1), - "sg.x.tail.inc"); + if (!noExplicitSubgroups) { + nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0], + ConstantInt::get(i32Ty, 1), + "sg.x.tail.inc"); + } + assert(tailExitBB); ir.CreateBr(tailExitBB); tailLoopBB = tailPreheaderBB; @@ -1226,10 +1244,8 @@ struct ScheduleGenerator { inner_scalar_opts.disableVectorize = true; inner_scalar_opts.IVs.assign(subgroupIVs0.begin(), subgroupIVs0.end()); - inner_scalar_opts.loopIVNames.push_back("sg.x.tail"); - if (isScan) { - inner_scalar_opts.loopIVNames.push_back("scan.x.tail"); - } + inner_scalar_opts.loopIVNames = {"sg.x.tail", + "scan.x.tail"}; tailExitBB = compiler::utils::createLoop( tailPreheaderBB, tailExitBB, zero, peel, @@ -1239,7 +1255,7 @@ struct ScheduleGenerator { MutableArrayRef ivsNext0) -> BasicBlock * { IRBuilder<> ir(block); - if (set_subgroup_id) { + if (!noExplicitSubgroups) { // set our subgroup id ir.CreateCall(set_subgroup_id, {ivs0[0]}) ->setCallingConv( @@ -1269,10 +1285,12 @@ struct ScheduleGenerator { *barrierTail, ir, block, barrierID, dim_0, dim_1, dim_2, accum, /*VF*/ nullptr, mainLoopLimit); - nextSubgroupIV = - ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1), - "sg.x.tail.inc"); - ivsNext0[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + nextSubgroupIV = ir.CreateAdd( + ivs0[0], ConstantInt::get(i32Ty, 1), + "sg.x.tail.inc"); + ivsNext0[0] = nextSubgroupIV; + } tailLoopBB = block; // Move the exit after the loop block, as it reads @@ -1309,21 +1327,29 @@ struct ScheduleGenerator { ->addIncoming(scanMergePhi, mainExitBB); } } - // Don't forget to update the subgroup IV phi. - ivsNext1[0] = nextSubgroupIV; + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + if (isScan) { // ... or the scan IV phi. ivsNext1[1] = nextScanIV; } + return tailExitBB; }); - // Don't forget to update the subgroup IV phi. - ivsNext2[0] = nextSubgroupIV; + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext2[0] = nextSubgroupIV; + } if (isScan) { // ... or the scan IV phi. ivsNext2[1] = nextScanIV; } + return exit1; }); } @@ -1478,50 +1504,96 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( // happening. // We want to insert a call to __mux__set_max_sub_group_size after these // assumptions, to keep track of the last one we've inserted. - Instruction *setMaxSubgroupSizeInsertPt = nullptr; for (auto i = 0; i < 3; i++) { auto *const nonZero = entryIR.CreateICmpNE( localSizeDim[i], ConstantInt::get(localSizeDim[i]->getType(), 0)); - setMaxSubgroupSizeInsertPt = entryIR.CreateAssumption(nonZero); + entryIR.CreateAssumption(nonZero); } - const bool isVectorPredicated = barrierMain.getVFInfo().IsVectorPredicated; + // There are four cases: + // + // 1. If !emitTail: in this case, only the main function will be called. 
The + // main function may be a scalar function, may be a predicated vector + // function, or may be an unpredicated vector function where the local size is + // known to be a multiple of the vectorization factor. + // + // 2. Otherwise, if tailInfo->IsVectorPredicated: in this case, the main + // function will be unpredicated and will be called for any multiples of vf, + // and one tail call will handle any remainder. vf of the main function and + // the tail function are the same. + // + // 3. Otherwise, if hasNoExplicitSubgroups(refF): in this case, the main + // function will be unpredicated and will be called for any multiples of vf, + // and one tail loop will handle any remainder. vf of the main function is + // used. + // + // 4. Otherwise: if local_size_x is a multiple of the main function's vf, the + // main function will handle the full loop and the main function's vf is used, + // else the tail function will handle the full loop and the tail function's vf + // is used. + // + // Unless hasNoExplicitSubgroups(refF), the subgroups are calculated as + // + // get_max_sub_group_size() = min(vf, local_size_x) + // get_num_sub_groups() = ((local_size_x + vector_width - 1) / vf) + // * local_size_y * local_size_z + // + // If hasNoExplicitSubgroups(refF) (even for cases 1 and 2), the subgroups are + // not calculated. + + const bool noExplicitSubgroups = hasNoExplicitSubgroups(refF); Value *mainLoopLimit = localSizeDim[workItemDim0]; Value *peel = nullptr; + + Value *effectiveVF = VF; + if (emitTail) { - peel = entryIR.CreateSRem(mainLoopLimit, VF, "peel"); + auto *const rem = entryIR.CreateSRem(mainLoopLimit, VF, "rem"); + if (tailInfo->IsVectorPredicated || noExplicitSubgroups) { + peel = rem; + } else { + // We must have no more than one iteration with a subgroup size below the + // maximum subgroup size. To meet this requirement, if the tail is scalar + // and the vector size does not divide the workgroup size, do not use the + // vectorized kernel at all. + auto *const remcond = entryIR.CreateICmpNE( + rem, Constant::getNullValue(rem->getType()), "remcond"); + peel = entryIR.CreateSelect( + remcond, mainLoopLimit, + Constant::getNullValue(mainLoopLimit->getType()), "peel"); + effectiveVF = entryIR.CreateSelect( + remcond, materializeVF(entryIR, barrierTail->getVFInfo().vf), VF); + } mainLoopLimit = entryIR.CreateSub(mainLoopLimit, peel, "mainLoopLimit"); } - // Set the number of subgroups in this kernel - { + // Set the subgroup maximum size and number of subgroups in this kernel + // wrapper. + if (!noExplicitSubgroups) { + auto setMaxSubgroupSizeFn = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M); + assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size"); auto setNumSubgroupsFn = BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetNumSubGroups, M); assert(setNumSubgroupsFn && "Missing __mux_set_num_sub_groups"); - // First, compute Z * Y - auto *const numSubgroupsZY = entryIR.CreateMul( - localSizeDim[workItemDim2], localSizeDim[workItemDim1], "sg.zy"); - // Now multiply by the number of subgroups in the X dimension. - auto *numSubgroupsX = entryIR.CreateUDiv(mainLoopLimit, VF, "sg.main.x"); - // Add on any tail iterations here. - if (peel) { - numSubgroupsX = entryIR.CreateAdd(numSubgroupsX, peel, "sg.x"); - } else if (isVectorPredicated) { - // Vector predication will use an extra subgroup to mop up any remainder. 
- auto *const leftover = entryIR.CreateSRem(mainLoopLimit, VF, "peel"); - auto *hasLeftover = entryIR.CreateICmp( - CmpInst::ICMP_NE, leftover, ConstantInt::get(leftover->getType(), 0), - "sg.has.vp"); - hasLeftover = entryIR.CreateZExt(hasLeftover, numSubgroupsX->getType()); - numSubgroupsX = entryIR.CreateAdd(numSubgroupsX, hasLeftover, "sg.x"); - } - auto *numSubgroups = - entryIR.CreateMul(numSubgroupsZY, numSubgroupsX, "sg.zyx"); - if (numSubgroups->getType() != i32Ty) { - numSubgroups = entryIR.CreateTrunc(numSubgroups, i32Ty); - } - entryIR.CreateCall(setNumSubgroupsFn, {numSubgroups}); + auto *const localSizeInVecDim = localSizeDim[workItemDim0]; + auto *const localSizeInNonVecDim = entryIR.CreateMul( + localSizeDim[workItemDim1], localSizeDim[workItemDim2], "wg.yz"); + auto *maxSubgroupSize = entryIR.CreateBinaryIntrinsic( + Intrinsic::umin, localSizeInVecDim, effectiveVF, {}, "sg.x"); + entryIR.CreateCall(setMaxSubgroupSizeFn, + {entryIR.CreateTrunc(maxSubgroupSize, i32Ty)}); + auto *const numSubgroupsInVecDim = entryIR.CreateUDiv( + entryIR.CreateAdd( + localSizeInVecDim, + entryIR.CreateSub(effectiveVF, + ConstantInt::get(effectiveVF->getType(), 1))), + effectiveVF, "sgs.x"); + auto *const numSubgroups = + entryIR.CreateMul(numSubgroupsInVecDim, localSizeInNonVecDim, "sgs"); + entryIR.CreateCall(setNumSubgroupsFn, + {entryIR.CreateTrunc(numSubgroups, i32Ty)}); } if (barrierMain.hasLiveVars()) { @@ -1530,7 +1602,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( // This catches cases where we need two loop iterations, e.g., VF=4 and // size=7, where rounding down would give one. Value *numerator = mainLoopLimit; - if (isVectorPredicated) { + if (mainInfo.IsVectorPredicated) { Value *const vf_minus_1 = entryIR.CreateSub(VF, ConstantInt::get(VF->getType(), 1)); numerator = entryIR.CreateAdd(mainLoopLimit, vf_minus_1); @@ -1546,7 +1618,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( // barriers, even when the main kernel does not. if (emitTail && barrierTail->hasLiveVars()) { Value *size0 = peel; - if (barrierTail->getVFInfo().IsVectorPredicated) { + if (tailInfo->IsVectorPredicated) { // If the tail is predicated, it will only have a single (vectorized) item // along the X axis, or none. auto *const hasLeftover = entryIR.CreateICmp( @@ -1584,8 +1656,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( schedule.wrapperDbgLoc = wrapperDbgLoc; schedule.nextID = nextID; schedule.mainLoopLimit = mainLoopLimit; + schedule.noExplicitSubgroups = noExplicitSubgroups; schedule.emitTail = emitTail; - schedule.isVectorPredicated = isVectorPredicated; schedule.peel = peel; // Make call instruction for first new kernel. It follows wrapper function's @@ -1726,45 +1798,6 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( bbs[kBarrier_EndID]->moveAfter(&new_wrapper->back()); bbs[kBarrier_EndID]->setName("kernel.exit"); - // Set the subgroup maximum size in this kernel wrapper. - // There are three cases: - // - // 1. With no vectorization: - // get_max_sub_group_size() = mux sub-group size - // - // 2. With predicated vectorization: - // get_max_sub_group_size() = min(vector_width, - // local_size_in_vectorization_dimension) - // - // 3. Without predicated vectorization: - // get_max_sub_group_size() = local_size_in_vectorization_dimension - // < vector_width ? mux sub-group size : vector_width - { - // Reset the insertion point back to the wrapper entry block, after VF was - // materialized. 
- entryIR.SetInsertPoint(setMaxSubgroupSizeInsertPt); - auto setMaxSubgroupSizeFn = - BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M); - assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size"); - // Assume no vectorization to begin with i.e. get_max_sub_group_size() = mux - // sub-group size. - Value *maxSubgroupSize = entryIR.getInt32(getMuxSubgroupSize(refF)); - if (schedule.wrapperHasMain) { - auto *localSizeInVecDim = localSizeDim[workItemDim0]; - auto *cmp = entryIR.CreateICmpULT(localSizeInVecDim, VF); - if (isVectorPredicated) { - maxSubgroupSize = entryIR.CreateSelect(cmp, localSizeInVecDim, VF); - } else { - maxSubgroupSize = entryIR.CreateSelect( - cmp, ConstantInt::get(VF->getType(), getMuxSubgroupSize(refF)), VF); - } - if (maxSubgroupSize->getType() != i32Ty) { - maxSubgroupSize = entryIR.CreateTrunc(maxSubgroupSize, i32Ty); - } - } - entryIR.CreateCall(setMaxSubgroupSizeFn, {maxSubgroupSize}); - } - // Remap any constant expression which take a reference to the old function // FIXME: What about the main function? for (auto *user : make_early_inc_range(refF.users())) { diff --git a/modules/compiler/test/lit/passes/barriers-cfg-reduce.ll b/modules/compiler/test/lit/passes/barriers-cfg-reduce.ll index b087fca7a..ed367fbc5 100644 --- a/modules/compiler/test/lit/passes/barriers-cfg-reduce.ll +++ b/modules/compiler/test/lit/passes/barriers-cfg-reduce.ll @@ -33,11 +33,11 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256: ; CHECK: %[[LD:.+]] = load i32, ptr %[[VAL]], align 4 ; CHECK: %[[ACCUM_NEXT]] = add i32 %[[ACCUM]], %[[LD]] ; CHECK: %[[IDX_NEXT]] = add i64 %[[IDX]], 1 -; CHECK: %[[LOOP_COND:.+]] = icmp ult i64 %24, 262144 +; CHECK: %[[LOOP_COND:.+]] = icmp ult i64 %[[IDX_NEXT]], 262144 ; CHECK: br i1 %[[LOOP_COND]], label %[[REDUCE_LOOP]], label %[[REDUCE_EXIT:[^,]+]] ; CHECK: [[REDUCE_EXIT]]: -; CHECK: %reduce = phi i32 [ %23, %[[REDUCE_LOOP]] ] +; CHECK: %reduce = phi i32 [ %[[ACCUM_NEXT]], %[[REDUCE_LOOP]] ] declare i64 @__mux_get_global_id(i32 %x) declare i32 @__mux_work_group_reduce_add_i32(i32 %id, i32 %x) diff --git a/modules/compiler/test/lit/passes/subgroup-loop-unroll.ll b/modules/compiler/test/lit/passes/subgroup-loop-unroll.ll index 43ca4d10a..a483c53b7 100644 --- a/modules/compiler/test/lit/passes/subgroup-loop-unroll.ll +++ b/modules/compiler/test/lit/passes/subgroup-loop-unroll.ll @@ -65,18 +65,6 @@ attributes #4 = { alwaysinline norecurse nounwind } !13 = !{i32 32, i32 0, i32 0, i32 0} !20 = !{!13, ptr @sub_group_all_builtin} -; CHECK-LABEL: sw.bb2: -; CHECK: br label %loopIR13 - -; CHECK-LABEL: loopIR13: -; CHECK: %[[PHI:.+]] = phi i64 [ 0, %sw.bb2 ], [ %[[INC:.+]], %loopIR13 ] -; CHECK: %[[PHI_ACCUM:.+]] = phi i1 [ true, %sw.bb2 ], [ %[[ACCUM:.+]], %loopIR13 ] -; CHECK: %[[BARRIER:.+]] = getelementptr inbounds %__vecz_v32_sub_group_all_builtin_live_mem_info, ptr %live_variables, i64 %[[PHI]] -; CHECK: %[[ITEM:.+]] = getelementptr inbounds %__vecz_v32_sub_group_all_builtin_live_mem_info, ptr %[[BARRIER]], i32 0, i32 0 -; CHECK: %[[LD:.+]] = load i1, ptr %[[ITEM]], align 1 -; CHECK: %[[ACCUM]] = and i1 %[[PHI_ACCUM]], %[[LD]] -; CHECK: %[[CMP:.+]] = icmp ult i64 %[[INC]], 2 -; CHECK: br i1 %[[CMP]], label %loopIR13, label %exitIR14 - -; CHECK-LABEL: exitIR14: -; CHECK: %WGC_reduce = phi i1 [ %[[ACCUM]], %loopIR13 ] +; The vectorization factor does not divide the required workgroup size so we +; must fall back to using the scalar kernel. 
+; CHECK-NOT: call i32 @__vecz_v
diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
index 67c256ffb..2b5c3aa9e 100644
--- a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -1302,9 +1302,10 @@ bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
         Ctx, memOp.getDataOperand(), memOp.getPointerOperand(), wideMask,
         /*VL*/ nullptr, memOp.getAlignment(), I->getName());
   }
+  VECZ_FAIL_IF(!newVal);
+
   newVal->insertBefore(I->getIterator());
 
-  VECZ_FAIL_IF(!newVal);
   if (!I->getType()->isVoidTy()) {
     I->replaceAllUsesWith(newVal);
   }
diff --git a/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake b/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake
index d01c04642..032883b42 100644
--- a/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake
+++ b/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake
@@ -239,18 +239,13 @@ if(NOT clc_result EQUAL 0)
     file(WRITE ${OUTPUT_FILE} "// clc could not compile optional 'mayfail' "
       "requirement kernel for '${DEVICE_NAME}' - stderr:\n${clc_error}")
   else()
-    # execute_process() doesn't print the failing command, so attempt to
-    # reconstruct it here
-    message(FATAL_ERROR
-      "clc failed with status '${clc_result}':
-      ${CLC_EXECUTABLE}
-        -d '${DEVICE_NAME}'
-        -cl-kernel-arg-info
-        -cl-std=CL${CLC_CL_STD}
-        ${CLC_OPTIONS_LIST}
-        ${DEFS_LIST}
-        -o '${OUTPUT_FILE}'
-        -- '${INPUT_FILE}'
-      ${clc_error}")
+    # execute_process() doesn't print the command, so attempt to reconstruct it here
+    set(clc_command ${CLC_EXECUTABLE} -d '${DEVICE_NAME}' -cl-kernel-arg-info
+      -cl-std=CL${CLC_CL_STD} ${CLC_OPTIONS_LIST} ${DEFS_LIST}
+      -o '${OUTPUT_FILE}' -- '${INPUT_FILE}')
+    list(JOIN clc_command " " clc_command)
+    message(NOTICE "${clc_command}")
+    message(NOTICE "${clc_error}")
+    message(FATAL_ERROR "clc returned error code ${clc_result}")
   endif()
 endif()
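
Illustrative sketch of the main/tail split decision described in the commit
message, for the case where the kernel uses explicit sub-groups and the tail
kernel is scalar. The names (LoopSplit, splitWorkItems) are invented and plain
integers stand in for the IR values; the wrapper emits the equivalent
icmp/select sequence. When the tail is vector-predicated, or the kernel uses
no explicit sub-group information, the previous behaviour
(peel = local_size_x % vf) is kept.

    #include <cassert>
    #include <cstdint>

    // Sketch only (invented names): which kernel handles the X dimension when
    // the tail kernel is scalar and the kernel uses explicit sub-groups.
    struct LoopSplit {
      uint64_t mainLoopLimit;  // work-items run through the vector kernel
      uint64_t peel;           // work-items run through the tail (scalar) kernel
      uint64_t effectiveVF;    // factor used for the sub-group accounting
    };

    static LoopSplit splitWorkItems(uint64_t localSizeX, uint64_t mainVF,
                                    uint64_t tailVF) {
      if (localSizeX % mainVF == 0) {
        // The vector kernel covers the whole local size; every sub-group has
        // size mainVF, so the layout is legal.
        return {localSizeX, 0, mainVF};
      }
      // A scalar tail after the vector loop would create several size-1
      // sub-groups, so the scalar kernel handles all work-items instead.
      return {0, localSizeX, tailVF};
    }

    int main() {
      const LoopSplit a = splitWorkItems(8, 4, 1);
      assert(a.mainLoopLimit == 8 && a.peel == 0 && a.effectiveVF == 4);
      const LoopSplit b = splitWorkItems(7, 4, 1);
      assert(b.mainLoopLimit == 0 && b.peel == 7 && b.effectiveVF == 1);
      return 0;
    }

With a vectorization factor of 4, a local size of 8 keeps the vector kernel
for every work-item, while a local size of 7 runs the scalar kernel for all
seven work-items, so at most one sub-group per work-group is smaller than the
maximum sub-group size.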
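
Sketch of the sub-group accounting the wrapper now performs up front via
__mux_set_max_sub_group_size and __mux_set_num_sub_groups, following the
formulas in the new comment block. The helper name and plain integer types
are assumptions for illustration; the pass builds the same arithmetic (umin,
ceiling division, trunc to i32) as IR.

    #include <algorithm>
    #include <cstdint>

    struct SubgroupInfo {
      uint32_t maxSubgroupSize;
      uint32_t numSubgroups;
    };

    // Hypothetical helper mirroring the values the wrapper sets when the
    // kernel uses explicit sub-groups.
    static SubgroupInfo subgroupInfo(uint64_t effectiveVF, uint64_t lsx,
                                     uint64_t lsy, uint64_t lsz) {
      // get_max_sub_group_size() = min(vf, local_size_x)
      const uint64_t maxSize = std::min(effectiveVF, lsx);
      // get_num_sub_groups() =
      //   ceil(local_size_x / vf) * local_size_y * local_size_z
      const uint64_t numX = (lsx + effectiveVF - 1) / effectiveVF;
      return {static_cast<uint32_t>(maxSize),
              static_cast<uint32_t>(numX * lsy * lsz)};
    }

For example, an 8x2x1 work-group with an effective vf of 4 reports a maximum
sub-group size of 4 and 4 sub-groups, while a 7x2x1 work-group after the
scalar fallback (effective vf 1) reports 1 and 14.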