@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
173173STATISTIC (LoopsVectorized, " Number of loops vectorized" );
174174STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
175175STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
176+ STATISTIC (CSAsVectorized,
177+ " Number of conditional scalar assignments vectorized" );
176178
177179static cl::opt<bool > EnableEpilogueVectorization (
178180 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -501,6 +503,10 @@ class InnerLoopVectorizer {
501503 // / Fix the vectorized code, taking care of header phi's, and more.
502504 void fixVectorizedLoop (VPTransformState &State);
503505
506+ // / For all vectorized CSAs, replace uses of the live-out scalar from the original
507+ // / loop with the scalar extracted from the vector loop.
508+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
509+
504510 // Return true if any runtime check is added.
505511 bool areSafetyChecksAdded () { return AddedSafetyChecks; }
506512
@@ -2934,6 +2940,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29342940 TargetTransformInfo::TCK_RecipThroughput);
29352941}
29362942
2943+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) { // For each CSA state in Plan, add the scalar extracted by the vector code as an extra incoming value of the exit-block PHIs that use the CSA's original value.
2944+ for (const auto &CSA : Plan.getCSAStates ()) { // CSA.second is the per-CSA state object stored in the plan.
2945+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2946+ assert (VPDataUpdate &&
2947+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2948+ Value *V = VPDataUpdate->getUnderlyingValue (); // IR value the data-update recipe was created from; its users are scanned below.
2949+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2950+ /* NeedsScalar=*/ true ); // NOTE(review): the only visible setter of the extract recipe is skipped for scalar VFs — confirm this function is never reached with such a plan.
2951+ // Fix LCSSAPhis: first collect the PHI users of V that live in LoopExitBlock, then give each of them ExtractedScalar as the incoming value for the LoopMiddleBlock edge.
2952+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2953+ for (User *U : V->users ())
2954+ if (auto *Phi = dyn_cast<PHINode>(U);
2955+ Phi && Phi->getParent () == LoopExitBlock)
2956+ ToFix.insert (Phi);
2957+ for (PHINode *Phi : ToFix)
2958+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2959+ }
2960+ }
2961+
29372962void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State) {
29382963 // Fix widened non-induction PHIs by setting up the PHI operands.
29392964 if (EnableVPlanNativePath)
@@ -2969,6 +2994,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29692994 fixupIVUsers (Entry.first , Entry.second ,
29702995 getOrCreateVectorTripCount (nullptr ),
29712996 IVEndValues[Entry.first ], LoopMiddleBlock, State);
2997+ fixCSALiveOuts (State, Plan);
29722998 }
29732999
29743000 for (Instruction *PI : PredicatedInstructions)
@@ -4494,6 +4520,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44944520 case VPDef::VPEVLBasedIVPHISC:
44954521 case VPDef::VPPredInstPHISC:
44964522 case VPDef::VPBranchOnMaskSC:
4523+ case VPRecipeBase::VPCSADataUpdateSC:
4524+ case VPRecipeBase::VPCSAExtractScalarSC:
4525+ case VPRecipeBase::VPCSAHeaderPHISC:
44974526 continue ;
44984527 case VPDef::VPReductionSC:
44994528 case VPDef::VPActiveLaneMaskPHISC:
@@ -8675,9 +8704,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
86758704 return Recipe;
86768705
86778706 VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8678- assert ((Legal->isReductionVariable (Phi) ||
8679- Legal->isFixedOrderRecurrence (Phi)) &&
8680- " can only widen reductions and fixed-order recurrences here" );
86818707 VPValue *StartV = Operands[0 ];
86828708 if (Legal->isReductionVariable (Phi)) {
86838709 const RecurrenceDescriptor &RdxDesc =
@@ -8687,12 +8713,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
86878713 PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
86888714 CM.isInLoopReduction (Phi),
86898715 CM.useOrderedReductions (RdxDesc));
8690- } else {
8716+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
86918717 // TODO: Currently fixed-order recurrences are modeled as chains of
86928718 // first-order recurrences. If there are no users of the intermediate
86938719 // recurrences in the chain, the fixed order recurrence should be modeled
86948720 // directly, enabling more efficient codegen.
86958721 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8722+ } else if (Legal->isCSAPhi (Phi)) {
8723+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8724+ VPValue *InitData = State->getVPInitData ();
8725+ // When the VF=getFixed(1), InitData is just InitScalar.
8726+ if (!InitData)
8727+ InitData = State->getVPInitScalar ();
8728+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8729+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8730+ } else {
8731+ llvm_unreachable (
8732+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
86968733 }
86978734
86988735 PhisToFix.push_back (PhiRecipe);
@@ -8726,6 +8763,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87268763 make_range (Operands.begin (), Operands.end ()));
87278764
87288765 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8766+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8767+ return CSADescriptor::isCSASelect (CSA.second , SI);
8768+ });
8769+ if (CSADescIt != Legal->getCSAs ().end ()) {
8770+ PHINode *CSAPhi = CSADescIt->first ;
8771+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8772+ VPValue *VPDataPhi = State->getPhiRecipe ();
8773+ auto *R = new VPCSADataUpdateRecipe (
8774+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8775+ State->setDataUpdate (R);
8776+ return R;
8777+ }
8778+
87298779 return new VPWidenSelectRecipe (
87308780 *SI, make_range (Operands.begin (), Operands.end ()));
87318781 }
@@ -8738,6 +8788,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87388788 return tryToWiden (Instr, Operands, VPBB);
87398789}
87408790
8791+ // / Add the CSA recipes that can be created before any instruction of the input IR
8792+ // / is processed and introduced into VPlan: for every CSA phi in CSAs, register a VPCSAState in Plan and, for non-scalar VFs, create the initial mask/data recipes in PreheaderVPBB and the mask phi in HeaderVPBB.
8793+ static void
8794+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8795+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8796+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8797+ VPlan &Plan) {
8798+
8799+ // Don't build the full CSA for VF=ElementCount::getFixed(1). IsScalarVF is the decision getDecisionAndClampRange reports for the VF.isScalar() predicate over Range.
8800+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8801+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8802+
8803+ for (const auto &CSA : CSAs) {
8804+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8805+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ())); // The CSA phi's incoming value from the original loop preheader, modelled as a plan live-in.
8806+
8807+ // Scalar VF builds the scalar version of the loop. In that case,
8808+ // neither maintenance of the mask nor extraction in the middle block is needed, so the state only records the initial scalar.
8809+ if (IsScalarVF) {
8810+ VPCSAState *S = new VPCSAState (VPInitScalar);
8811+ Plan.addCSAState (CSA.first , S);
8812+ continue ;
8813+ }
8814+ // Vector VF: create csa.init.mask (no operands) and csa.init.data (fed by the initial scalar) and append both to the preheader block.
8815+ auto *VPInitMask =
8816+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8817+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8818+ {VPInitScalar}, DL, " csa.init.data" );
8819+ PreheaderVPBB->appendRecipe (VPInitMask);
8820+ PreheaderVPBB->appendRecipe (VPInitData);
8821+ // The mask phi takes VPInitMask as its operand and is appended to the header block.
8822+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8823+ DL, " csa.mask.phi" );
8824+ HeaderVPBB->appendRecipe (VPMaskPhi);
8825+ // Register the initial scalar, initial data and mask phi for this CSA phi. NOTE(review): ownership of S presumably passes to Plan via addCSAState — confirm.
8826+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8827+ Plan.addCSAState (CSA.first , S);
8828+ }
8829+ }
8830+
8831+ // / Add the CSA recipes that must be created after every instruction of the input IR
8832+ // / has been processed and introduced into VPlan: for each CSA, build the any-active and mask-select recipes next to the recipe of its assignment, attach them to the data-update recipe, and insert the scalar extraction into MiddleVPBB.
8833+ static void
8834+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8835+ const LoopVectorizationLegality::CSAList &CSAs,
8836+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8837+ VPlan &Plan) {
8838+ // Don't build CSA recipes for VF=ElementCount::getFixed(1): return early when the scalar-VF decision holds for Range.
8839+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8840+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8841+ return ;
8842+
8843+ for (const auto &CSA : CSAs) {
8844+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ; // NOTE(review): the find() result is dereferenced without an end() check — relies on a state having been registered for every CSA phi.
8845+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8846+
8847+ assert (VPDataUpdate &&
8848+ " VPDataUpdate must have been introduced prior to postprocess" );
8849+ assert (CSA.second .getCond () &&
8850+ " CSADescriptor must know how to describe the condition" );
8851+ auto GetVPValue = [&](Value *I) { // Maps an IR instruction to the single VPValue defined by the recipe RecipeBuilder holds for it.
8852+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8853+ };
8854+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8855+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8856+
8857+ // The CSA optimization wants to use a condition such that when it is
8858+ // true, a new value is assigned. However, it is possible that a true lane
8859+ // in WidenedCond corresponds to selection of the initial value instead.
8860+ // In that case, we must use the negation of WidenedCond,
8861+ // i.e. select cond new_val old_val versus select cond.not old_val new_val. That case is detected below by the select's true value being the CSA phi itself.
8862+ VPValue *CondToUse = WidenedCond;
8863+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8864+ CSA.first ) {
8865+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8866+ VPNotCond->insertBefore (
8867+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8868+ CondToUse = VPNotCond;
8869+ }
8870+ // csa.cond.anyactive takes CondToUse as its only operand and is placed directly before the recipe that defines the CSA assignment.
8871+ auto *VPAnyActive = new VPInstruction (
8872+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8873+ VPAnyActive->insertBefore (
8874+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8875+ // csa.mask.sel takes CondToUse, the mask phi and VPAnyActive as operands; it is placed right after VPAnyActive, and both are then handed to the data-update recipe.
8876+ auto *VPMaskSel = new VPInstruction (
8877+ VPInstruction::CSAMaskSel,
8878+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8879+ VPMaskSel->insertAfter (VPAnyActive);
8880+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8881+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8882+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8883+ // The extraction is inserted at the first non-phi position of the middle block.
8884+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8885+
8886+ // Update CSAState with the newly created extract and any-active recipes so they can be looked up from the state afterwards.
8887+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8888+ CSAState->setVPAnyActive (VPAnyActive);
8889+ }
8890+ }
8891+
87418892void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
87428893 ElementCount MaxVF) {
87438894 assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8830,7 +8981,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
88308981// increments.
88318982static SetVector<VPIRInstruction *> collectUsersInExitBlock (
88328983 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8833- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8984+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8985+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
88348986 auto *MiddleVPBB = Plan.getMiddleBlock ();
88358987 // No edge from the middle block to the unique exit block has been inserted
88368988 // and there is nothing to fix from vector loop; phis should have incoming
@@ -8862,6 +9014,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
88629014 return P && Inductions.contains (P);
88639015 })))
88649016 continue ;
9017+ // Exit values for CSAs are computed and updated outside of VPlan and
9018+ // independent of induction recipes.
9019+ // TODO: Compute CSA exit values in VPlan, use VPLiveOuts to update
9020+ // live-outs.
9021+ if (isa<VPCSADataUpdateRecipe>(V) &&
9022+ (isa<Instruction>(IncomingValue) &&
9023+ any_of (IncomingValue->users (), [&CSAs](User *U) {
9024+ auto *P = dyn_cast<PHINode>(U);
9025+ return P && CSAs.contains (P);
9026+ })))
9027+ continue ;
88659028 ExitUsersToFix.insert (ExitIRI);
88669029 ExitIRI->addOperand (V);
88679030 }
@@ -9038,6 +9201,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90389201 bool HasNUW = Style == TailFoldingStyle::None;
90399202 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
90409203
9204+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9205+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9206+ Range, *Plan);
9207+
90419208 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
90429209
90439210 // ---------------------------------------------------------------------------
@@ -9155,6 +9322,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91559322 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
91569323 }
91579324
9325+ VPBasicBlock *MiddleVPBB =
9326+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9327+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9328+ Range, *Plan);
9329+
91589330 // After here, VPBB should not be used.
91599331 VPBB = nullptr ;
91609332
@@ -9165,8 +9337,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91659337 RecipeBuilder.fixHeaderPhis ();
91669338
91679339 addScalarResumePhis (RecipeBuilder, *Plan);
9168- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock (
9169- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9340+ SetVector<VPIRInstruction *> ExitUsersToFix =
9341+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9342+ Legal->getInductionVars (), Legal->getCSAs ());
91709343 addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
91719344 addUsersInExitBlock (*Plan, ExitUsersToFix);
91729345 // ---------------------------------------------------------------------------
@@ -10235,6 +10408,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1023510408 auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
1023610409 *BestMainPlan, MainILV, DT, false );
1023710410 ++LoopsVectorized;
10411+ CSAsVectorized += LVL.getCSAs ().size ();
1023810412
1023910413 // Second pass vectorizes the epilogue and adjusts the control flow
1024010414 // edges from the first pass.
@@ -10330,6 +10504,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1033010504 PSI, Checks, BestPlan);
1033110505 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1033210506 ++LoopsVectorized;
10507+ CSAsVectorized += LVL.getCSAs ().size ();
1033310508
1033410509 // Add metadata to disable runtime unrolling a scalar loop when there
1033510510 // are no runtime checks about strides and memory. A scalar loop that is
0 commit comments