ROCm
diff --git a/‎flang/lib/Lower/Runtime.cpp‎
Lines changed: 47 additions & 4 deletions b/‎flang/lib/Lower/Runtime.cpp‎
Lines changed: 47 additions & 4 deletions
diff --git a/‎flang/test/Lower/pause-statement.f90‎
Lines changed: 26 additions & 2 deletions b/‎flang/test/Lower/pause-statement.f90‎
Lines changed: 26 additions & 2 deletions
diff --git a/‎llvm/include/llvm/MC/MCObjectFileInfo.h‎
Lines changed: 0 additions & 7 deletions b/‎llvm/include/llvm/MC/MCObjectFileInfo.h‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎llvm/lib/MC/MCObjectFileInfo.cpp‎
Lines changed: 0 additions & 4 deletions b/‎llvm/lib/MC/MCObjectFileInfo.cpp‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎llvm/lib/Transforms/Utils/BasicBlockUtils.cpp‎
Lines changed: 16 additions & 0 deletions b/‎llvm/lib/Transforms/Utils/BasicBlockUtils.cpp‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp‎
Lines changed: 23 additions & 11 deletions b/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp‎
Lines changed: 23 additions & 11 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp‎
Lines changed: 4 additions & 3 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll‎
Lines changed: 116 additions & 0 deletions b/‎llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll‎
Lines changed: 42 additions & 0 deletions b/‎llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll‎
Lines changed: 42 additions & 0 deletions
@@ -169,12 +169,55 @@ void Fortran::lower::genUnlockStatement(
 
 void Fortran::lower::genPauseStatement(
     Fortran::lower::AbstractConverter &converter,
-    const Fortran::parser::PauseStmt &) {
+    const Fortran::parser::PauseStmt &stmt) {
+
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::Location loc = converter.getCurrentLocation();
-  mlir::func::FuncOp callee =
-      fir::runtime::getRuntimeFunc<mkRTKey(PauseStatement)>(loc, builder);
-  fir::CallOp::create(builder, loc, callee, mlir::ValueRange{});
+  Fortran::lower::StatementContext stmtCtx;
+
+  llvm::SmallVector<mlir::Value> operands;
+  mlir::func::FuncOp callee;
+  mlir::FunctionType calleeType;
+
+  if (stmt.v.has_value()) {
+    const auto &code = stmt.v.value();
+    auto expr =
+        converter.genExprValue(*Fortran::semantics::GetExpr(code), stmtCtx);
+    expr.match(
+        // Character-valued expression -> call PauseStatementText (CHAR, LEN)
+        [&](const fir::CharBoxValue &x) {
+          callee = fir::runtime::getRuntimeFunc<mkRTKey(PauseStatementText)>(
+              loc, builder);
+          calleeType = callee.getFunctionType();
+
+          operands.push_back(
+              builder.createConvert(loc, calleeType.getInput(0), x.getAddr()));
+          operands.push_back(
+              builder.createConvert(loc, calleeType.getInput(1), x.getLen()));
+        },
+        // Unboxed value -> call PauseStatementInt which accepts an integer.
+        [&](fir::UnboxedValue x) {
+          callee = fir::runtime::getRuntimeFunc<mkRTKey(PauseStatementInt)>(
+              loc, builder);
+          calleeType = callee.getFunctionType();
+          assert(calleeType.getNumInputs() >= 1);
+          mlir::Value cast =
+              builder.createConvert(loc, calleeType.getInput(0), x);
+          operands.push_back(cast);
+        },
+        [&](auto) {
+          fir::emitFatalError(loc, "unhandled expression in PAUSE");
+        });
+  } else {
+    callee =
+        fir::runtime::getRuntimeFunc<mkRTKey(PauseStatement)>(loc, builder);
+    calleeType = callee.getFunctionType();
+  }
+
+  fir::CallOp::create(builder, loc, callee, operands);
+
+  // NOTE: PAUSE does not terminate the current block. The program may resume
+  // and continue normal execution, so we do not emit control-flow terminators.
 }
 
 void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder,
 
@@ -2,7 +2,31 @@
 
 ! CHECK-LABEL: pause_test
 subroutine pause_test()
-  ! CHECK: fir.call @_Fortran{{.*}}PauseStatement()
-  ! CHECK-NEXT: return
   pause
+  ! CHECK: fir.call @_FortranA{{.*}}PauseStatement()
+  ! CHECK-NEXT: return
+end subroutine
+
+! CHECK-LABEL: pause_code
+subroutine pause_code()
+  pause 42
+  ! CHECK: %[[c42:.*]] = arith.constant 42 : i32
+  ! CHECK: fir.call @_FortranA{{.*}}PauseStatementInt(%[[c42]])
+  ! CHECK-NEXT: return
 end subroutine
+
+! CHECK-LABEL: pause_msg
+subroutine pause_msg()
+  pause "hello"
+  ! CHECK-DAG: %[[five:.*]] = arith.constant 5 : index
+  ! CHECK-DAG: %[[addr:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,5>>
+  ! CHECK-DAG: %[[str:.*]]:2 = hlfir.declare %[[addr]] typeparams %[[five]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQ{{.*}}"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
+  ! CHECK-DAG: %[[buff:.*]] = fir.convert %[[str]]#0 : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
+  ! CHECK-DAG: %[[len:.*]] = fir.convert %[[five]] : (index) -> i64
+  ! CHECK: fir.call @_FortranA{{.*}}PauseStatementText(%[[buff]], %[[len]])
+  ! CHECK-NEXT: return
+end subroutine
+
+! CHECK-DAG: func private @_FortranA{{.*}}PauseStatement
+! CHECK-DAG: func private @_FortranA{{.*}}PauseStatementInt
+! CHECK-DAG: func private @_FortranA{{.*}}PauseStatementText
@@ -29,10 +29,6 @@ class MCSection;
 
 class LLVM_ABI MCObjectFileInfo {
 protected:
-  /// True if target object file supports a weak_definition of constant 0 for an
-  /// omitted EH frame.
-  bool SupportsWeakOmittedEHFrame = false;
-
   /// True if the target object file supports emitting a compact unwind section
   /// without an associated EH frame section.
   bool SupportsCompactUnwindWithoutEHFrame = false;
@@ -260,9 +256,6 @@ class LLVM_ABI MCObjectFileInfo {
   virtual ~MCObjectFileInfo();
   MCContext &getContext() const { return *Ctx; }
 
-  bool getSupportsWeakOmittedEHFrame() const {
-    return SupportsWeakOmittedEHFrame;
-  }
   bool getSupportsCompactUnwindWithoutEHFrame() const {
     return SupportsCompactUnwindWithoutEHFrame;
   }
 
@@ -61,9 +61,6 @@ static bool useCompactUnwind(const Triple &T) {
 }
 
 void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
-  // MachO
-  SupportsWeakOmittedEHFrame = false;
-
   EHFrameSection = Ctx->getMachOSection(
       "__TEXT", "__eh_frame",
       MachO::S_COALESCED | MachO::S_ATTR_NO_TOC |
@@ -1090,7 +1087,6 @@ void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC,
   Ctx = &MCCtx;
 
   // Common.
-  SupportsWeakOmittedEHFrame = true;
   SupportsCompactUnwindWithoutEHFrame = false;
   OmitDwarfIfHaveCompactUnwind = false;
 
 
@@ -92,6 +92,15 @@ emptyAndDetachBlock(BasicBlock *BB,
          "applying corresponding DTU updates.");
 }
 
+static bool HasLoopOrEntryConvergenceToken(const BasicBlock *BB) {
+  for (const Instruction &I : *BB) {
+    const ConvergenceControlInst *CCI = dyn_cast<ConvergenceControlInst>(&I);
+    if (CCI && (CCI->isLoop() || CCI->isEntry()))
+      return true;
+  }
+  return false;
+}
+
 void llvm::detachDeadBlocks(ArrayRef<BasicBlock *> BBs,
                             SmallVectorImpl<DominatorTree::UpdateType> *Updates,
                             bool KeepOneInputPHIs) {
@@ -259,6 +268,13 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
     if (llvm::is_contained(PN.incoming_values(), &PN))
       return false;
 
+  // Don't break if both the basic block and the predecessor contain loop or
+  // entry convergent intrinsics, since there may only be one convergence token
+  // per block.
+  if (HasLoopOrEntryConvergenceToken(BB) &&
+      HasLoopOrEntryConvergenceToken(PredBB))
+    return false;
+
   LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
                     << PredBB->getName() << "\n");
 
 
@@ -7187,17 +7187,29 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
                         *CM.PSE.getSE(), OrigLoop);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
-  // Verify that the VPlan-based and legacy cost models agree, except for VPlans
-  // with early exits and plans with additional VPlan simplifications. The
-  // legacy cost model doesn't properly model costs for such loops.
-  assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
-          !Legal->getLAI()->getSymbolicStrides().empty() ||
-          planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
-                                                CostCtx, OrigLoop,
-                                                BestFactor.Width) ||
-          planContainsAdditionalSimplifications(
-              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
-         " VPlan cost model and legacy cost model disagreed");
+  // Verify that the VPlan-based and legacy cost models agree, except for
+  // * VPlans with early exits,
+  // * VPlans with additional VPlan simplifications,
+  // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
+  //   vp_scatter/vp_gather).
+  // The legacy cost model doesn't properly model costs for such loops.
+  bool UsesEVLGatherScatter =
+      any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
+                 BestPlan.getVectorLoopRegion()->getEntry())),
+             [](VPBasicBlock *VPBB) {
+               return any_of(*VPBB, [](VPRecipeBase &R) {
+                 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
+                        !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
+               });
+             });
+  assert(
+      (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
+       !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
+       planContainsAdditionalSimplifications(
+           getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
+       planContainsAdditionalSimplifications(
+           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
+      " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be computed.");
 #endif
 
@@ -1080,9 +1080,10 @@ bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) {
                            FindIVPhiR->getRecurrenceKind()))
       return false;
 
-    assert(match(IVOp, m_TruncOrSelf(m_VPValue(IVOp))) &&
-           isa<VPWidenIntOrFpInductionRecipe>(IVOp) &&
-           "other select operand must be a (truncated) wide induction");
+    // TODO: Support cases where IVOp is the IV increment.
+    if (!match(IVOp, m_TruncOrSelf(m_VPValue(IVOp))) ||
+        !isa<VPWidenIntOrFpInductionRecipe>(IVOp))
+      return false;
 
     CmpInst::Predicate RdxPredicate = [RdxKind]() {
       switch (RdxKind) {
 
@@ -219,3 +219,119 @@ loop:
 exit:
   ret void
 }
+
+; Test for https://github.com/llvm/llvm-project/issues/169948.
+define i8 @mixed_gather_scatters(ptr %A, ptr %B, ptr %C) #0 {
+; RVA23-LABEL: @mixed_gather_scatters(
+; RVA23-NEXT:  entry:
+; RVA23-NEXT:    br label [[VECTOR_PH:%.*]]
+; RVA23:       vector.ph:
+; RVA23-NEXT:    br label [[VECTOR_BODY:%.*]]
+; RVA23:       vector.body:
+; RVA23-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; RVA23-NEXT:    [[AVL:%.*]] = phi i32 [ 10, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RVA23-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
+; RVA23-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A:%.*]], align 8
+; RVA23-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP1]], i64 0
+; RVA23-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; RVA23-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; RVA23-NEXT:    [[TMP2:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], zeroinitializer
+; RVA23-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i1> [[TMP2]] to <vscale x 2 x i8>
+; RVA23-NEXT:    [[TMP4:%.*]] = or <vscale x 2 x i8> [[VEC_PHI]], [[TMP3]]
+; RVA23-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B:%.*]], align 8
+; RVA23-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP5]], i64 0
+; RVA23-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; RVA23-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; RVA23-NEXT:    [[TMP6:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_GATHER3]], zeroinitializer
+; RVA23-NEXT:    [[TMP7:%.*]] = zext <vscale x 2 x i1> [[TMP6]] to <vscale x 2 x i8>
+; RVA23-NEXT:    [[TMP8:%.*]] = or <vscale x 2 x i8> [[TMP4]], [[TMP7]]
+; RVA23-NEXT:    [[TMP9:%.*]] = or <vscale x 2 x i8> [[TMP8]], splat (i8 1)
+; RVA23-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C:%.*]], align 8
+; RVA23-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP10]], i64 0
+; RVA23-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT4]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; RVA23-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; RVA23-NEXT:    [[TMP11:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_GATHER6]], zeroinitializer
+; RVA23-NEXT:    [[TMP12:%.*]] = zext <vscale x 2 x i1> [[TMP11]] to <vscale x 2 x i8>
+; RVA23-NEXT:    [[TMP13:%.*]] = or <vscale x 2 x i8> [[TMP9]], [[TMP12]]
+; RVA23-NEXT:    [[TMP14]] = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> [[TMP13]], <vscale x 2 x i8> [[VEC_PHI]], i32 [[TMP0]])
+; RVA23-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
+; RVA23-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; RVA23-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; RVA23:       middle.block:
+; RVA23-NEXT:    [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> [[TMP14]])
+; RVA23-NEXT:    br label [[EXIT:%.*]]
+; RVA23:       exit:
+; RVA23-NEXT:    ret i8 [[TMP16]]
+;
+; RVA23ZVL1024B-LABEL: @mixed_gather_scatters(
+; RVA23ZVL1024B-NEXT:  entry:
+; RVA23ZVL1024B-NEXT:    br label [[VECTOR_PH:%.*]]
+; RVA23ZVL1024B:       vector.ph:
+; RVA23ZVL1024B-NEXT:    br label [[VECTOR_BODY:%.*]]
+; RVA23ZVL1024B:       vector.body:
+; RVA23ZVL1024B-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; RVA23ZVL1024B-NEXT:    [[AVL:%.*]] = phi i32 [ 10, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RVA23ZVL1024B-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 1, i1 true)
+; RVA23ZVL1024B-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A:%.*]], align 8
+; RVA23ZVL1024B-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[TMP1]], i64 0
+; RVA23ZVL1024B-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; RVA23ZVL1024B-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
+; RVA23ZVL1024B-NEXT:    [[TMP2:%.*]] = icmp sgt <vscale x 1 x i64> [[WIDE_MASKED_GATHER]], zeroinitializer
+; RVA23ZVL1024B-NEXT:    [[TMP3:%.*]] = zext <vscale x 1 x i1> [[TMP2]] to <vscale x 1 x i8>
+; RVA23ZVL1024B-NEXT:    [[TMP4:%.*]] = or <vscale x 1 x i8> [[VEC_PHI]], [[TMP3]]
+; RVA23ZVL1024B-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B:%.*]], align 8
+; RVA23ZVL1024B-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[TMP5]], i64 0
+; RVA23ZVL1024B-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; RVA23ZVL1024B-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
+; RVA23ZVL1024B-NEXT:    [[TMP6:%.*]] = icmp sgt <vscale x 1 x i64> [[WIDE_MASKED_GATHER3]], zeroinitializer
+; RVA23ZVL1024B-NEXT:    [[TMP7:%.*]] = zext <vscale x 1 x i1> [[TMP6]] to <vscale x 1 x i8>
+; RVA23ZVL1024B-NEXT:    [[TMP8:%.*]] = or <vscale x 1 x i8> [[TMP4]], [[TMP7]]
+; RVA23ZVL1024B-NEXT:    [[TMP9:%.*]] = or <vscale x 1 x i8> [[TMP8]], splat (i8 1)
+; RVA23ZVL1024B-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C:%.*]], align 8
+; RVA23ZVL1024B-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[TMP10]], i64 0
+; RVA23ZVL1024B-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT4]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; RVA23ZVL1024B-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[BROADCAST_SPLAT5]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
+; RVA23ZVL1024B-NEXT:    [[TMP11:%.*]] = icmp sgt <vscale x 1 x i64> [[WIDE_MASKED_GATHER6]], zeroinitializer
+; RVA23ZVL1024B-NEXT:    [[TMP12:%.*]] = zext <vscale x 1 x i1> [[TMP11]] to <vscale x 1 x i8>
+; RVA23ZVL1024B-NEXT:    [[TMP13:%.*]] = or <vscale x 1 x i8> [[TMP9]], [[TMP12]]
+; RVA23ZVL1024B-NEXT:    [[TMP14]] = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i8> [[TMP13]], <vscale x 1 x i8> [[VEC_PHI]], i32 [[TMP0]])
+; RVA23ZVL1024B-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
+; RVA23ZVL1024B-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; RVA23ZVL1024B-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; RVA23ZVL1024B:       middle.block:
+; RVA23ZVL1024B-NEXT:    [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> [[TMP14]])
+; RVA23ZVL1024B-NEXT:    br label [[EXIT:%.*]]
+; RVA23ZVL1024B:       exit:
+; RVA23ZVL1024B-NEXT:    ret i8 [[TMP16]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %accum = phi i8 [ 0, %entry ], [ %or.4, %loop ]
+  %ptr.0 = load ptr, ptr %A, align 8
+  %val.0 = load i64, ptr %ptr.0, align 8
+  %cmp.0 = icmp sgt i64 %val.0, 0
+  %ext.0 = zext i1 %cmp.0 to i8
+  %or.0 = or i8 %accum, %ext.0
+  %ptr.1 = load ptr, ptr %B, align 8
+  %val.1 = load i64, ptr %ptr.1, align 8
+  %cmp.1 = icmp sgt i64 %val.1, 0
+  %ext.1 = zext i1 %cmp.1 to i8
+  %or.1 = or i8 %or.0, %ext.1
+  %or.2 = or i8 %or.1, 1
+  %ptr.4 = load ptr, ptr %C, align 8
+  %val.4 = load i64, ptr %ptr.4, align 8
+  %cmp.4 = icmp sgt i64 %val.4, 0
+  %ext.4 = zext i1 %cmp.4 to i8
+  %or.4 = or i8 %or.2, %ext.4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv, 9
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret i8 %or.4
+}
+
+attributes #0 = { "target-features"="+zve64x,+zvl256b" }
@@ -656,5 +656,47 @@ exit:
   ret i64 %res
 }
 
+define i64 @test_vectorize_select_smax_idx_inc(ptr %src, i64 %n) {
+; CHECK-LABEL: define i64 @test_vectorize_select_smax_idx_inc(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i64 [[MAX_VAL]], [[L]]
+; CHECK-NEXT:    [[MAX_VAL_NEXT]] = tail call i64 @llvm.smax.i64(i64 [[MAX_VAL]], i64 [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV_NEXT]], i64 [[MAX_IDX]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max.idx = phi i64 [ 0, %entry ], [ %max.idx.next, %loop ]
+  %max.val = phi i64 [ 0, %entry ], [ %max.val.next, %loop ]
+  %gep = getelementptr i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep
+  %cmp = icmp sle i64 %max.val, %l
+  %max.val.next = tail call i64 @llvm.smax.i64(i64 %max.val, i64 %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %max.idx.next = select i1 %cmp, i64 %iv.next, i64 %max.idx
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  %res = phi i64 [ %max.idx.next, %loop ]
+  ret i64 %res
+}
+
 declare i64 @llvm.smax.i64(i64, i64)
 declare i16 @llvm.smax.i16(i16, i16)