//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live out phi's in the exiting block
  // via middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only calculates the instruction cost with
  // underlying instruction. Removing the WidenMem here will prevent
  // force-target-instruction-cost overwriting the cost of recipe with
  // underlying instruction which is inconsistent with the legacy model.
  // TODO: Remove WidenMem from this function when we don't need to compare to
  // the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // returns 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform, avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0));
    Value *Op1 = State.get(getOperand(1));
    Value *Op2 = State.get(getOperand(2));
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(0, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPIteration(0, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    Type *PhiTy = OrigPhi->getType();
    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0),
                      VPIteration(0, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPIteration(0, Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPIteration(0, Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
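    // Only the first lane of the operands is needed when only the first lane
    // of this VPInstruction's result is used.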
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
         "Only PHINodes can have extra operands");
  if (getNumOperands() == 1) {
    VPValue *ExitValue = getOperand(0);
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPIteration(0, Lane));
    auto *Phi = cast<PHINode>(&I);
    Phi->addIncoming(V, PredBB);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;

  if (getNumOperands() != 0) {
    assert(getNumOperands() == 1 && "can have at most 1 operand");
    O << " (extra operand: ";
    printOperands(O, SlotTracker);
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (UseIntrinsic &&
      isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
    TysForDecl.push_back(VectorType::get(
        CalledScalarFn->getReturnType()->getScalarType(), State.VF));
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(arg_operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (UseIntrinsic &&
        isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
      Arg = State.get(I.value(), VPIteration(0, 0));
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
      Arg = State.get(I.value(), VPIteration(0, 0));
    else
      Arg = State.get(I.value());
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
      TysForDecl.push_back(Arg->getType());
    Args.push_back(Arg);
  }

  Function *VectorF;
  if (UseIntrinsic) {
    // Use vector version of the intrinsic.
    Module *M = State.Builder.GetInsertBlock()->getModule();
    VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
    assert(VectorF && "Can't retrieve vector intrinsic.");
  } else {
#ifndef NDEBUG
    assert(Variant != nullptr && "Can't create vector function.");
#endif
    VectorF = Variant;
  }

  auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

  if (isa<FPMathOperator>(V))
    V->copyFastMathFlags(CI);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
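  // No vector library variant is available here, so cost the call as the
  // corresponding widened intrinsic.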
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  Value *Cond = InvarCond ? InvarCond : State.get(getCond());
  Value *Op0 = State.get(getOperand(1));
  Value *Op1 = State.get(getOperand(2));
  Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
  State.set(this, Sel);
  State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(State.get(VPOp));

    Value *V = Builder.CreateNAryOp(Opcode, Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V))
      setFlags(VecOp);

    // Use this vector value for all users of the original instruction.
    State.set(this, V);
    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));

    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
      if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
        Builder.setFastMathFlags(I->getFastMathFlags());
      C = Builder.CreateFCmp(getPredicate(), A, B);
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    State.set(this, C);
    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
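    // Query TTI for operand info of the live-in RHS; an RHS of otherwise
    // unknown kind that is defined outside the loop regions is treated as
    // uniform.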
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());

  assert(State.get(getOperand(0))->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept FMF flags.
  // TODO: Enable other flags when support is added.
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp." << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  Value *SplatVF;
  if (VPValue *SplatVFOperand = getSplatVFValue()) {
    // The recipe has been unrolled. In that case, fetch the splat value for
    // the induction increment.
    SplatVF = State.get(SplatVFOperand);
  } else {
    // Multiply the vectorization factor by the step using integer or
    // floating-point arithmetic as appropriate.
    Type *StepType = Step->getType();
    Value *RuntimeVF = State.get(getVFValue(), {0, 0});
    if (Step->getType()->isFloatingPointTy())
      RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
    else
      RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

    // Create a vector splat to use in the induction update.
    //
    // FIXME: If the step is non-constant, we create the vector splat with
    //        IRBuilder. IRBuilder can constant-fold the multiply, but it
    //        doesn't handle a constant vector splat.
    SplatVF = isa<Constant>(Mul)
                  ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                  : Builder.CreateVectorSplat(State.VF, Mul);
  }

  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  State.set(this, VecInd);

  Instruction *LastInduction = cast<Instruction>(
      Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
  if (isa<TruncInst>(EntryVal))
    State.addMetadata(LastInduction, EntryVal);
  LastInduction->setDebugLoc(EntryVal->getDebugLoc());

  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);

  O << ", ";
  getVFValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
1571 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1572 if (hasFastMathFlags()) 1573 State.Builder.setFastMathFlags(getFastMathFlags()); 1574 1575 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1576 /// variable on which to base the steps, \p Step is the size of the step. 1577 1578 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1579 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1580 IRBuilderBase &Builder = State.Builder; 1581 1582 // Ensure step has the same type as that of scalar IV. 1583 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1584 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1585 1586 // We build scalar steps for both integer and floating-point induction 1587 // variables. Here, we determine the kind of arithmetic we will perform. 1588 Instruction::BinaryOps AddOp; 1589 Instruction::BinaryOps MulOp; 1590 if (BaseIVTy->isIntegerTy()) { 1591 AddOp = Instruction::Add; 1592 MulOp = Instruction::Mul; 1593 } else { 1594 AddOp = InductionOpcode; 1595 MulOp = Instruction::FMul; 1596 } 1597 1598 // Determine the number of scalars we need to generate for each unroll 1599 // iteration. 1600 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1601 // Compute the scalar steps and save the results in State. 1602 Type *IntStepTy = 1603 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1604 Type *VecIVTy = nullptr; 1605 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1606 if (!FirstLaneOnly && State.VF.isScalable()) { 1607 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1608 UnitStepVec = 1609 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1610 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1611 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1612 } 1613 1614 unsigned StartLane = 0; 1615 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1616 if (State.Instance) { 1617 StartLane = State.Instance->Lane.getKnownLane(); 1618 EndLane = StartLane + 1; 1619 } 1620 Value *StartIdx0 = 1621 createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); 1622 1623 if (!FirstLaneOnly && State.VF.isScalable()) { 1624 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1625 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1626 if (BaseIVTy->isFloatingPointTy()) 1627 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1628 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1629 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1630 State.set(this, Add); 1631 // It's useful to record the lane values too for the known minimum number 1632 // of elements so we do those below. This improves the code quality when 1633 // trying to extract the first element, for example. 1634 } 1635 1636 if (BaseIVTy->isFloatingPointTy()) 1637 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1638 1639 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1640 Value *StartIdx = Builder.CreateBinOp( 1641 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1642 // The step returned by `createStepForVF` is a runtime-evaluated value 1643 // when VF is scalable. Otherwise, it should be folded into a Constant. 
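// As an illustration of the per-lane values produced below (the numbers are
// examples, not the generated values): with a fixed VF of 4 and unroll part 1,
// StartIdx0 is 4, so lane L receives
//   BaseIV + (4 + L) * Step    for L = 0..3,
// i.e. BaseIV + (VF * Part + Lane) * Step in general.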
1644 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1645 "Expected StartIdx to be folded to a constant when VF is not " 1646 "scalable"); 1647 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1648 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1649 State.set(this, Add, VPIteration(0, Lane)); 1650 } 1651 } 1652 1653 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1654 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1655 VPSlotTracker &SlotTracker) const { 1656 O << Indent; 1657 printAsOperand(O, SlotTracker); 1658 O << " = SCALAR-STEPS "; 1659 printOperands(O, SlotTracker); 1660 } 1661 #endif 1662 1663 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1664 assert(State.VF.isVector() && "not widening"); 1665 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1666 // Construct a vector GEP by widening the operands of the scalar GEP as 1667 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1668 // results in a vector of pointers when at least one operand of the GEP 1669 // is vector-typed. Thus, to keep the representation compact, we only use 1670 // vector-typed operands for loop-varying values. 1671 1672 if (areAllOperandsInvariant()) { 1673 // If we are vectorizing, but the GEP has only loop-invariant operands, 1674 // the GEP we build (by only using vector-typed operands for 1675 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1676 // produce a vector of pointers, we need to either arbitrarily pick an 1677 // operand to broadcast, or broadcast a clone of the original GEP. 1678 // Here, we broadcast a clone of the original. 1679 // 1680 // TODO: If at some point we decide to scalarize instructions having 1681 // loop-invariant operands, this special case will no longer be 1682 // required. We would add the scalarization decision to 1683 // collectLoopScalars() and teach getVectorValue() to broadcast 1684 // the lane-zero scalar value. 1685 SmallVector<Value *> Ops; 1686 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1687 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1688 1689 auto *NewGEP = 1690 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1691 ArrayRef(Ops).drop_front(), "", isInBounds()); 1692 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1693 State.set(this, Splat); 1694 State.addMetadata(Splat, GEP); 1695 } else { 1696 // If the GEP has at least one loop-varying operand, we are sure to 1697 // produce a vector of pointers unless VF is scalar. 1698 // The pointer operand of the new GEP. If it's loop-invariant, we 1699 // won't broadcast it. 1700 auto *Ptr = isPointerLoopInvariant() 1701 ? State.get(getOperand(0), VPIteration(0, 0)) 1702 : State.get(getOperand(0)); 1703 1704 // Collect all the indices for the new GEP. If any index is 1705 // loop-invariant, we won't broadcast it. 1706 SmallVector<Value *, 4> Indices; 1707 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1708 VPValue *Operand = getOperand(I); 1709 if (isIndexLoopInvariant(I - 1)) 1710 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1711 else 1712 Indices.push_back(State.get(Operand)); 1713 } 1714 1715 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1716 // but it should be a vector, otherwise. 
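// For illustration only (types and names are made up), a GEP with a
// loop-invariant base and one widened index might come out as:
//   %vgep = getelementptr inbounds i32, ptr %base, <4 x i64> %widened.idx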
1717 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1718 Indices, "", isInBounds()); 1719 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1720 "NewGEP is not a pointer vector"); 1721 State.set(this, NewGEP); 1722 State.addMetadata(NewGEP, GEP); 1723 } 1724 } 1725 1726 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1727 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1728 VPSlotTracker &SlotTracker) const { 1729 O << Indent << "WIDEN-GEP "; 1730 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1731 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1732 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 1733 1734 O << " "; 1735 printAsOperand(O, SlotTracker); 1736 O << " = getelementptr"; 1737 printFlags(O); 1738 printOperands(O, SlotTracker); 1739 } 1740 #endif 1741 1742 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1743 auto &Builder = State.Builder; 1744 State.setDebugLocFrom(getDebugLoc()); 1745 unsigned CurrentPart = getUnrollPart(*this); 1746 // Use i32 for the gep index type when the value is constant, 1747 // or query DataLayout for a more suitable index type otherwise. 1748 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 1749 Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0) 1750 ? DL.getIndexType(IndexedTy->getPointerTo()) 1751 : Builder.getInt32Ty(); 1752 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1753 bool InBounds = isInBounds(); 1754 1755 Value *ResultPtr = nullptr; 1756 if (IsReverse) { 1757 // If the address is consecutive but reversed, then the 1758 // wide store needs to start at the last vector element. 1759 // RunTimeVF = VScale * VF.getKnownMinValue() 1760 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1761 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1762 // NumElt = -CurrentPart * RunTimeVF 1763 Value *NumElt = Builder.CreateMul( 1764 ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); 1765 // LastLane = 1 - RunTimeVF 1766 Value *LastLane = 1767 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1768 ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1769 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds); 1770 } else { 1771 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 1772 ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1773 } 1774 1775 State.set(this, ResultPtr, /*IsScalar*/ true); 1776 } 1777 1778 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1779 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1780 VPSlotTracker &SlotTracker) const { 1781 O << Indent; 1782 printAsOperand(O, SlotTracker); 1783 O << " = vector-pointer "; 1784 if (IsReverse) 1785 O << "(reverse) "; 1786 1787 printOperands(O, SlotTracker); 1788 } 1789 #endif 1790 1791 void VPBlendRecipe::execute(VPTransformState &State) { 1792 assert(isNormalized() && "Expected blend to be normalized!"); 1793 State.setDebugLocFrom(getDebugLoc()); 1794 // We know that all PHIs in non-header blocks are converted into 1795 // selects, so we don't have to worry about the insertion order and we 1796 // can just use the builder. 1797 // At this point we generate the predication tree. There may be 1798 // duplications since this is a simple recursive scan, but future 1799 // optimizations will clean it up. 
1800 1801 unsigned NumIncoming = getNumIncomingValues(); 1802 1803 // Generate a sequence of selects of the form: 1804 // SELECT(Mask3, In3, 1805 // SELECT(Mask2, In2, 1806 // SELECT(Mask1, In1, 1807 // In0))) 1808 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1809 // are essentially undef are taken from In0. 1810 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1811 Value *Result = nullptr; 1812 for (unsigned In = 0; In < NumIncoming; ++In) { 1813 // We might have single edge PHIs (blocks) - use an identity 1814 // 'select' for the first PHI operand. 1815 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 1816 if (In == 0) 1817 Result = In0; // Initialize with the first incoming value. 1818 else { 1819 // Select between the current value and the previous incoming edge 1820 // based on the incoming mask. 1821 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 1822 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 1823 } 1824 } 1825 State.set(this, Result, OnlyFirstLaneUsed); 1826 } 1827 1828 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1829 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1830 VPSlotTracker &SlotTracker) const { 1831 O << Indent << "BLEND "; 1832 printAsOperand(O, SlotTracker); 1833 O << " ="; 1834 if (getNumIncomingValues() == 1) { 1835 // Not a User of any mask: not really blending, this is a 1836 // single-predecessor phi. 1837 O << " "; 1838 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1839 } else { 1840 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1841 O << " "; 1842 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1843 if (I == 0) 1844 continue; 1845 O << "/"; 1846 getMask(I)->printAsOperand(O, SlotTracker); 1847 } 1848 } 1849 } 1850 #endif 1851 1852 void VPReductionRecipe::execute(VPTransformState &State) { 1853 assert(!State.Instance && "Reduction being replicated."); 1854 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 1855 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1856 // Propagate the fast-math flags carried by the underlying instruction. 1857 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1858 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1859 Value *NewVecOp = State.get(getVecOp()); 1860 if (VPValue *Cond = getCondOp()) { 1861 Value *NewCond = State.get(Cond, State.VF.isScalar()); 1862 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 1863 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 1864 1865 Value *Start; 1866 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 1867 Start = RdxDesc.getRecurrenceStartValue(); 1868 else 1869 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 1870 RdxDesc.getFastMathFlags()); 1871 if (State.VF.isVector()) 1872 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 1873 1874 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 1875 NewVecOp = Select; 1876 } 1877 Value *NewRed; 1878 Value *NextInChain; 1879 if (IsOrdered) { 1880 if (State.VF.isVector()) 1881 NewRed = 1882 createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); 1883 else 1884 NewRed = State.Builder.CreateBinOp( 1885 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 1886 NewVecOp); 1887 PrevInChain = NewRed; 1888 NextInChain = NewRed; 1889 } else { 1890 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 1891 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 1892 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1893 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 1894 NewRed, PrevInChain); 1895 else 1896 NextInChain = State.Builder.CreateBinOp( 1897 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 1898 } 1899 State.set(this, NextInChain, /*IsScalar*/ true); 1900 } 1901 1902 void VPReductionEVLRecipe::execute(VPTransformState &State) { 1903 assert(!State.Instance && "Reduction being replicated."); 1904 1905 auto &Builder = State.Builder; 1906 // Propagate the fast-math flags carried by the underlying instruction. 1907 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 1908 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1909 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1910 1911 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1912 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 1913 Value *VecOp = State.get(getVecOp()); 1914 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 1915 1916 VectorBuilder VBuilder(Builder); 1917 VBuilder.setEVL(EVL); 1918 Value *Mask; 1919 // TODO: move the all-true mask generation into VectorBuilder. 1920 if (VPValue *CondOp = getCondOp()) 1921 Mask = State.get(CondOp); 1922 else 1923 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 1924 VBuilder.setMask(Mask); 1925 1926 Value *NewRed; 1927 if (isOrdered()) { 1928 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 1929 } else { 1930 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 1931 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1932 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 1933 else 1934 NewRed = Builder.CreateBinOp( 1935 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); 1936 } 1937 State.set(this, NewRed, /*IsScalar*/ true); 1938 } 1939 1940 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1941 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 1942 VPSlotTracker &SlotTracker) const { 1943 O << Indent << "REDUCE "; 1944 printAsOperand(O, SlotTracker); 1945 O << " = "; 1946 getChainOp()->printAsOperand(O, SlotTracker); 1947 O << " +"; 1948 if (isa<FPMathOperator>(getUnderlyingInstr())) 1949 O << getUnderlyingInstr()->getFastMathFlags(); 1950 O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1951 getVecOp()->printAsOperand(O, SlotTracker); 1952 if (isConditional()) { 1953 O << ", "; 1954 getCondOp()->printAsOperand(O, SlotTracker); 1955 } 1956 O << ")"; 1957 if (RdxDesc.IntermediateStore) 1958 O << " (with final reduction value stored in invariant address sank " 1959 "outside of loop)"; 1960 } 1961 1962 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1963 VPSlotTracker &SlotTracker) const { 1964 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1965 O << Indent << "REDUCE "; 1966 printAsOperand(O, SlotTracker); 1967 O << " = "; 1968 getChainOp()->printAsOperand(O, SlotTracker); 1969 O << " +"; 1970 if (isa<FPMathOperator>(getUnderlyingInstr())) 1971 O << getUnderlyingInstr()->getFastMathFlags(); 1972 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1973 getVecOp()->printAsOperand(O, SlotTracker); 1974 O << ", "; 1975 getEVL()->printAsOperand(O, SlotTracker); 1976 if (isConditional()) { 1977 O << ", "; 1978 getCondOp()->printAsOperand(O, SlotTracker); 1979 } 1980 O << ")"; 1981 if (RdxDesc.IntermediateStore) 1982 O << " (with final reduction value stored in invariant address sank " 1983 "outside of loop)"; 1984 } 1985 #endif 1986 1987 bool VPReplicateRecipe::shouldPack() const { 1988 // Find if the recipe is used by a widened recipe via an intervening 1989 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 1990 return any_of(users(), [](const VPUser *U) { 1991 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 1992 return any_of(PredR->users(), [PredR](const VPUser *U) { 1993 return !U->usesScalars(PredR); 1994 }); 1995 return false; 1996 }); 1997 } 1998 1999 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2000 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2001 VPSlotTracker &SlotTracker) const { 2002 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2003 2004 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2005 printAsOperand(O, SlotTracker); 2006 O << " = "; 2007 } 2008 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2009 O << "call"; 2010 printFlags(O); 2011 O << "@" << CB->getCalledFunction()->getName() << "("; 2012 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2013 O, [&O, &SlotTracker](VPValue *Op) { 2014 Op->printAsOperand(O, SlotTracker); 2015 }); 2016 O << ")"; 2017 } else { 2018 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2019 printFlags(O); 2020 printOperands(O, SlotTracker); 2021 } 2022 2023 if (shouldPack()) 2024 O << " (S->V)"; 2025 } 2026 #endif 2027 2028 Value *VPScalarCastRecipe ::generate(VPTransformState &State) { 2029 assert(vputils::onlyFirstLaneUsed(this) && 2030 "Codegen only implemented for first lane."); 2031 switch (Opcode) { 2032 case Instruction::SExt: 2033 case Instruction::ZExt: 2034 case Instruction::Trunc: { 2035 // Note: SExt/ZExt not used yet. 
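// Illustrative result for the Trunc case (operand and types are examples):
//   %cast = trunc i64 %scalar.op to i32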
2036 Value *Op = State.get(getOperand(0), VPIteration(0, 0)); 2037 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2038 } 2039 default: 2040 llvm_unreachable("opcode not implemented yet"); 2041 } 2042 } 2043 2044 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2045 State.set(this, generate(State), VPIteration(0, 0)); 2046 } 2047 2048 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2049 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2050 VPSlotTracker &SlotTracker) const { 2051 O << Indent << "SCALAR-CAST "; 2052 printAsOperand(O, SlotTracker); 2053 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2054 printOperands(O, SlotTracker); 2055 O << " to " << *ResultTy; 2056 } 2057 #endif 2058 2059 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2060 assert(State.Instance && "Branch on Mask works only on single instance."); 2061 2062 unsigned Lane = State.Instance->Lane.getKnownLane(); 2063 2064 Value *ConditionBit = nullptr; 2065 VPValue *BlockInMask = getMask(); 2066 if (BlockInMask) { 2067 ConditionBit = State.get(BlockInMask); 2068 if (ConditionBit->getType()->isVectorTy()) 2069 ConditionBit = State.Builder.CreateExtractElement( 2070 ConditionBit, State.Builder.getInt32(Lane)); 2071 } else // Block in mask is all-one. 2072 ConditionBit = State.Builder.getTrue(); 2073 2074 // Replace the temporary unreachable terminator with a new conditional branch, 2075 // whose two destinations will be set later when they are created. 2076 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2077 assert(isa<UnreachableInst>(CurrentTerminator) && 2078 "Expected to replace unreachable terminator with conditional branch."); 2079 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2080 CondBr->setSuccessor(0, nullptr); 2081 ReplaceInstWithInst(CurrentTerminator, CondBr); 2082 } 2083 2084 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2085 assert(State.Instance && "Predicated instruction PHI works per instance."); 2086 Instruction *ScalarPredInst = 2087 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 2088 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2089 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2090 assert(PredicatingBB && "Predicated block has no single predecessor."); 2091 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2092 "operand must be VPReplicateRecipe"); 2093 2094 // By current pack/unpack logic we need to generate only a single phi node: if 2095 // a vector value for the predicated instruction exists at this point it means 2096 // the instruction has vector users only, and a phi for the vector value is 2097 // needed. In this case the recipe of the predicated instruction is marked to 2098 // also do that packing, thereby "hoisting" the insert-element sequence. 2099 // Otherwise, a phi node for the scalar value is needed. 2100 if (State.hasVectorValue(getOperand(0))) { 2101 Value *VectorValue = State.get(getOperand(0)); 2102 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2103 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2104 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2105 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 
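// Sketch of the phi built above (block and value names are illustrative):
//   %vphi = phi <4 x i32> [ %vec.before.insert, %predicating.bb ],
//                         [ %vec.with.new.elt,  %predicated.bb ]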
2106 if (State.hasVectorValue(this)) 2107 State.reset(this, VPhi); 2108 else 2109 State.set(this, VPhi); 2110 // NOTE: Currently we need to update the value of the operand, so the next 2111 // predicated iteration inserts its generated value in the correct vector. 2112 State.reset(getOperand(0), VPhi); 2113 } else { 2114 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2115 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2116 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2117 PredicatingBB); 2118 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2119 if (State.hasScalarValue(this, *State.Instance)) 2120 State.reset(this, Phi, *State.Instance); 2121 else 2122 State.set(this, Phi, *State.Instance); 2123 // NOTE: Currently we need to update the value of the operand, so the next 2124 // predicated iteration inserts its generated value in the correct vector. 2125 State.reset(getOperand(0), Phi, *State.Instance); 2126 } 2127 } 2128 2129 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2130 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2131 VPSlotTracker &SlotTracker) const { 2132 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2133 printAsOperand(O, SlotTracker); 2134 O << " = "; 2135 printOperands(O, SlotTracker); 2136 } 2137 #endif 2138 2139 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2140 VPCostContext &Ctx) const { 2141 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2142 const Align Alignment = 2143 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2144 unsigned AS = 2145 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2146 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2147 2148 if (!Consecutive) { 2149 // TODO: Using the original IR may not be accurate. 2150 // Currently, ARM will use the underlying IR to calculate gather/scatter 2151 // instruction cost. 2152 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2153 assert(!Reverse && 2154 "Inconsecutive memory access should not have the order."); 2155 return Ctx.TTI.getAddressComputationCost(Ty) + 2156 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2157 IsMasked, Alignment, CostKind, 2158 &Ingredient); 2159 } 2160 2161 InstructionCost Cost = 0; 2162 if (IsMasked) { 2163 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2164 AS, CostKind); 2165 } else { 2166 TTI::OperandValueInfo OpInfo = 2167 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2168 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2169 CostKind, OpInfo, &Ingredient); 2170 } 2171 if (!Reverse) 2172 return Cost; 2173 2174 return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2175 cast<VectorType>(Ty), {}, CostKind, 0); 2176 } 2177 2178 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2179 auto *LI = cast<LoadInst>(&Ingredient); 2180 2181 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2182 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2183 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2184 bool CreateGather = !isConsecutive(); 2185 2186 auto &Builder = State.Builder; 2187 State.setDebugLocFrom(getDebugLoc()); 2188 Value *Mask = nullptr; 2189 if (auto *VPMask = getMask()) { 2190 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2191 // of a null all-one mask is a null mask. 
2192 Mask = State.get(VPMask); 2193 if (isReverse()) 2194 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2195 } 2196 2197 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); 2198 Value *NewLI; 2199 if (CreateGather) { 2200 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2201 "wide.masked.gather"); 2202 } else if (Mask) { 2203 NewLI = 2204 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2205 PoisonValue::get(DataTy), "wide.masked.load"); 2206 } else { 2207 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2208 } 2209 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2210 State.addMetadata(NewLI, LI); 2211 if (Reverse) 2212 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2213 State.set(this, NewLI); 2214 } 2215 2216 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2217 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2218 VPSlotTracker &SlotTracker) const { 2219 O << Indent << "WIDEN "; 2220 printAsOperand(O, SlotTracker); 2221 O << " = load "; 2222 printOperands(O, SlotTracker); 2223 } 2224 #endif 2225 2226 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2227 /// dependence w/o affecting the result. 2228 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2229 Value *EVL, const Twine &Name) { 2230 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2231 Value *AllTrueMask = 2232 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2233 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2234 {Operand, AllTrueMask, EVL}, nullptr, Name); 2235 } 2236 2237 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2238 auto *LI = cast<LoadInst>(&Ingredient); 2239 2240 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2241 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2242 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2243 bool CreateGather = !isConsecutive(); 2244 2245 auto &Builder = State.Builder; 2246 State.setDebugLocFrom(getDebugLoc()); 2247 CallInst *NewLI; 2248 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2249 Value *Addr = State.get(getAddr(), !CreateGather); 2250 Value *Mask = nullptr; 2251 if (VPValue *VPMask = getMask()) { 2252 Mask = State.get(VPMask); 2253 if (isReverse()) 2254 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2255 } else { 2256 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2257 } 2258 2259 if (CreateGather) { 2260 NewLI = 2261 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2262 nullptr, "wide.masked.gather"); 2263 } else { 2264 VectorBuilder VBuilder(Builder); 2265 VBuilder.setEVL(EVL).setMask(Mask); 2266 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2267 Instruction::Load, DataTy, Addr, "vp.op.load")); 2268 } 2269 NewLI->addParamAttr( 2270 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2271 State.addMetadata(NewLI, LI); 2272 Instruction *Res = NewLI; 2273 if (isReverse()) 2274 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2275 State.set(this, Res); 2276 } 2277 2278 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2279 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2280 VPSlotTracker &SlotTracker) const { 2281 O << Indent << "WIDEN "; 2282 printAsOperand(O, SlotTracker); 2283 O << " = vp.load "; 2284 printOperands(O, SlotTracker); 2285 } 2286 #endif 2287 2288 void 
VPWidenStoreRecipe::execute(VPTransformState &State) { 2289 auto *SI = cast<StoreInst>(&Ingredient); 2290 2291 VPValue *StoredVPValue = getStoredValue(); 2292 bool CreateScatter = !isConsecutive(); 2293 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2294 2295 auto &Builder = State.Builder; 2296 State.setDebugLocFrom(getDebugLoc()); 2297 2298 Value *Mask = nullptr; 2299 if (auto *VPMask = getMask()) { 2300 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2301 // of a null all-one mask is a null mask. 2302 Mask = State.get(VPMask); 2303 if (isReverse()) 2304 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2305 } 2306 2307 Value *StoredVal = State.get(StoredVPValue); 2308 if (isReverse()) { 2309 // If we store to reverse consecutive memory locations, then we need 2310 // to reverse the order of elements in the stored value. 2311 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2312 // We don't want to update the value in the map as it might be used in 2313 // another expression. So don't call resetVectorValue(StoredVal). 2314 } 2315 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); 2316 Instruction *NewSI = nullptr; 2317 if (CreateScatter) 2318 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2319 else if (Mask) 2320 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2321 else 2322 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2323 State.addMetadata(NewSI, SI); 2324 } 2325 2326 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2327 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2328 VPSlotTracker &SlotTracker) const { 2329 O << Indent << "WIDEN store "; 2330 printOperands(O, SlotTracker); 2331 } 2332 #endif 2333 2334 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2335 auto *SI = cast<StoreInst>(&Ingredient); 2336 2337 VPValue *StoredValue = getStoredValue(); 2338 bool CreateScatter = !isConsecutive(); 2339 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2340 2341 auto &Builder = State.Builder; 2342 State.setDebugLocFrom(getDebugLoc()); 2343 2344 CallInst *NewSI = nullptr; 2345 Value *StoredVal = State.get(StoredValue); 2346 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2347 if (isReverse()) 2348 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2349 Value *Mask = nullptr; 2350 if (VPValue *VPMask = getMask()) { 2351 Mask = State.get(VPMask); 2352 if (isReverse()) 2353 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2354 } else { 2355 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2356 } 2357 Value *Addr = State.get(getAddr(), !CreateScatter); 2358 if (CreateScatter) { 2359 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2360 Intrinsic::vp_scatter, 2361 {StoredVal, Addr, Mask, EVL}); 2362 } else { 2363 VectorBuilder VBuilder(Builder); 2364 VBuilder.setEVL(EVL).setMask(Mask); 2365 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2366 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2367 {StoredVal, Addr})); 2368 } 2369 NewSI->addParamAttr( 2370 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2371 State.addMetadata(NewSI, SI); 2372 } 2373 2374 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2375 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2376 VPSlotTracker &SlotTracker) const { 2377 O << Indent << "WIDEN vp.store "; 2378 printOperands(O, SlotTracker); 2379 } 2380 #endif 2381 2382 
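/// Cast \p V, a vector value, to type \p DstVTy, whose elements have the same
/// size as V's elements but may not be directly castable (e.g. pointer vs.
/// floating-point elements). When no direct bit/pointer cast exists, the cast
/// is performed in two steps through an integer vector of matching width;
/// with 64-bit pointers, for example (illustrative types only):
///   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x ptr>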
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2383 VectorType *DstVTy, const DataLayout &DL) { 2384 // Verify that V is a vector type with same number of elements as DstVTy. 2385 auto VF = DstVTy->getElementCount(); 2386 auto *SrcVecTy = cast<VectorType>(V->getType()); 2387 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2388 Type *SrcElemTy = SrcVecTy->getElementType(); 2389 Type *DstElemTy = DstVTy->getElementType(); 2390 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2391 "Vector elements must have same size"); 2392 2393 // Do a direct cast if element types are castable. 2394 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2395 return Builder.CreateBitOrPointerCast(V, DstVTy); 2396 } 2397 // V cannot be directly casted to desired vector type. 2398 // May happen when V is a floating point vector but DstVTy is a vector of 2399 // pointers or vice-versa. Handle this using a two-step bitcast using an 2400 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2401 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2402 "Only one type should be a pointer type"); 2403 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2404 "Only one type should be a floating point type"); 2405 Type *IntTy = 2406 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2407 auto *VecIntTy = VectorType::get(IntTy, VF); 2408 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2409 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2410 } 2411 2412 /// Return a vector containing interleaved elements from multiple 2413 /// smaller input vectors. 2414 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2415 const Twine &Name) { 2416 unsigned Factor = Vals.size(); 2417 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2418 2419 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2420 #ifndef NDEBUG 2421 for (Value *Val : Vals) 2422 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2423 #endif 2424 2425 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2426 // must use intrinsics to interleave. 2427 if (VecTy->isScalableTy()) { 2428 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2429 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2430 Vals, 2431 /*FMFSource=*/nullptr, Name); 2432 } 2433 2434 // Fixed length. Start by concatenating all vectors into a wide vector. 2435 Value *WideVec = concatenateVectors(Builder, Vals); 2436 2437 // Interleave the elements into the wide vector. 2438 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2439 return Builder.CreateShuffleVector( 2440 WideVec, createInterleaveMask(NumElts, Factor), Name); 2441 } 2442 2443 // Try to vectorize the interleave group that \p Instr belongs to. 2444 // 2445 // E.g. Translate following interleaved load group (factor = 3): 2446 // for (i = 0; i < N; i+=3) { 2447 // R = Pic[i]; // Member of index 0 2448 // G = Pic[i+1]; // Member of index 1 2449 // B = Pic[i+2]; // Member of index 2 2450 // ... 
// do something to R, G, B 2451 // } 2452 // To: 2453 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2454 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2455 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2456 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2457 // 2458 // Or translate following interleaved store group (factor = 3): 2459 // for (i = 0; i < N; i+=3) { 2460 // ... do something to R, G, B 2461 // Pic[i] = R; // Member of index 0 2462 // Pic[i+1] = G; // Member of index 1 2463 // Pic[i+2] = B; // Member of index 2 2464 // } 2465 // To: 2466 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2467 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2468 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2469 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2470 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2471 void VPInterleaveRecipe::execute(VPTransformState &State) { 2472 assert(!State.Instance && "Interleave group being replicated."); 2473 const InterleaveGroup<Instruction> *Group = IG; 2474 Instruction *Instr = Group->getInsertPos(); 2475 2476 // Prepare for the vector type of the interleaved load/store. 2477 Type *ScalarTy = getLoadStoreType(Instr); 2478 unsigned InterleaveFactor = Group->getFactor(); 2479 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2480 2481 // Prepare for the new pointers. 2482 unsigned Index = Group->getIndex(Instr); 2483 2484 // TODO: extend the masked interleaved-group support to reversed access. 2485 VPValue *BlockInMask = getMask(); 2486 assert((!BlockInMask || !Group->isReverse()) && 2487 "Reversed masked interleave-group not supported."); 2488 2489 Value *Idx; 2490 // If the group is reverse, adjust the index to refer to the last vector lane 2491 // instead of the first. We adjust the index from the first vector lane, 2492 // rather than directly getting the pointer for lane VF - 1, because the 2493 // pointer operand of the interleaved access is supposed to be uniform. 2494 if (Group->isReverse()) { 2495 Value *RuntimeVF = 2496 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2497 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2498 Idx = State.Builder.CreateMul(Idx, 2499 State.Builder.getInt32(Group->getFactor())); 2500 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); 2501 Idx = State.Builder.CreateNeg(Idx); 2502 } else 2503 Idx = State.Builder.getInt32(-Index); 2504 2505 VPValue *Addr = getAddr(); 2506 Value *ResAddr = State.get(Addr, VPIteration(0, 0)); 2507 if (auto *I = dyn_cast<Instruction>(ResAddr)) 2508 State.setDebugLocFrom(I->getDebugLoc()); 2509 2510 // Notice current instruction could be any index. Need to adjust the address 2511 // to the member of index 0. 2512 // 2513 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2514 // b = A[i]; // Member of index 0 2515 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2516 // 2517 // E.g. A[i+1] = a; // Member of index 1 2518 // A[i] = b; // Member of index 0 2519 // A[i+2] = c; // Member of index 2 (Current instruction) 2520 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
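// E.g. (illustrative, member index 2, non-reversed group): the adjustment
// below is a GEP with a negative offset in elements of the member type,
//   %lane0.addr = getelementptr i32, ptr %member2.addr, i32 -2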
2521 2522 bool InBounds = false; 2523 if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) 2524 InBounds = gep->isInBounds(); 2525 ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds); 2526 2527 State.setDebugLocFrom(Instr->getDebugLoc()); 2528 Value *PoisonVec = PoisonValue::get(VecTy); 2529 2530 auto CreateGroupMask = [&BlockInMask, &State, 2531 &InterleaveFactor](Value *MaskForGaps) -> Value * { 2532 if (State.VF.isScalable()) { 2533 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2534 assert(InterleaveFactor == 2 && 2535 "Unsupported deinterleave factor for scalable vectors"); 2536 auto *ResBlockInMask = State.get(BlockInMask); 2537 SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; 2538 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2539 State.VF.getKnownMinValue() * 2, true); 2540 return State.Builder.CreateIntrinsic( 2541 MaskTy, Intrinsic::vector_interleave2, Ops, 2542 /*FMFSource=*/nullptr, "interleaved.mask"); 2543 } 2544 2545 if (!BlockInMask) 2546 return MaskForGaps; 2547 2548 Value *ResBlockInMask = State.get(BlockInMask); 2549 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2550 ResBlockInMask, 2551 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2552 "interleaved.mask"); 2553 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2554 ShuffledMask, MaskForGaps) 2555 : ShuffledMask; 2556 }; 2557 2558 const DataLayout &DL = Instr->getDataLayout(); 2559 // Vectorize the interleaved load group. 2560 if (isa<LoadInst>(Instr)) { 2561 Value *MaskForGaps = nullptr; 2562 if (NeedsMaskForGaps) { 2563 MaskForGaps = createBitMaskForGaps(State.Builder, 2564 State.VF.getKnownMinValue(), *Group); 2565 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2566 } 2567 2568 Instruction *NewLoad; 2569 if (BlockInMask || MaskForGaps) { 2570 Value *GroupMask = CreateGroupMask(MaskForGaps); 2571 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 2572 Group->getAlign(), GroupMask, 2573 PoisonVec, "wide.masked.vec"); 2574 } else 2575 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 2576 Group->getAlign(), "wide.vec"); 2577 Group->addMetadata(NewLoad); 2578 2579 ArrayRef<VPValue *> VPDefs = definedValues(); 2580 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2581 if (VecTy->isScalableTy()) { 2582 assert(InterleaveFactor == 2 && 2583 "Unsupported deinterleave factor for scalable vectors"); 2584 2585 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2586 // so must use intrinsics to deinterleave. 2587 Value *DI = State.Builder.CreateIntrinsic( 2588 Intrinsic::vector_deinterleave2, VecTy, NewLoad, 2589 /*FMFSource=*/nullptr, "strided.vec"); 2590 unsigned J = 0; 2591 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2592 Instruction *Member = Group->getMember(I); 2593 2594 if (!Member) 2595 continue; 2596 2597 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2598 // If this member has different type, cast the result type. 
2599 if (Member->getType() != ScalarTy) { 2600 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2601 StridedVec = 2602 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2603 } 2604 2605 if (Group->isReverse()) 2606 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2607 2608 State.set(VPDefs[J], StridedVec); 2609 ++J; 2610 } 2611 2612 return; 2613 } 2614 2615 // For each member in the group, shuffle out the appropriate data from the 2616 // wide loads. 2617 unsigned J = 0; 2618 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2619 Instruction *Member = Group->getMember(I); 2620 2621 // Skip the gaps in the group. 2622 if (!Member) 2623 continue; 2624 2625 auto StrideMask = 2626 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 2627 Value *StridedVec = 2628 State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); 2629 2630 // If this member has different type, cast the result type. 2631 if (Member->getType() != ScalarTy) { 2632 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 2633 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2634 StridedVec = 2635 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2636 } 2637 2638 if (Group->isReverse()) 2639 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2640 2641 State.set(VPDefs[J], StridedVec); 2642 ++J; 2643 } 2644 return; 2645 } 2646 2647 // The sub vector type for current instruction. 2648 auto *SubVT = VectorType::get(ScalarTy, State.VF); 2649 2650 // Vectorize the interleaved store group. 2651 Value *MaskForGaps = 2652 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 2653 assert((!MaskForGaps || !State.VF.isScalable()) && 2654 "masking gaps for scalable vectors is not yet supported."); 2655 ArrayRef<VPValue *> StoredValues = getStoredValues(); 2656 // Collect the stored vector from each member. 2657 SmallVector<Value *, 4> StoredVecs; 2658 unsigned StoredIdx = 0; 2659 for (unsigned i = 0; i < InterleaveFactor; i++) { 2660 assert((Group->getMember(i) || MaskForGaps) && 2661 "Fail to get a member from an interleaved store group"); 2662 Instruction *Member = Group->getMember(i); 2663 2664 // Skip the gaps in the group. 2665 if (!Member) { 2666 Value *Undef = PoisonValue::get(SubVT); 2667 StoredVecs.push_back(Undef); 2668 continue; 2669 } 2670 2671 Value *StoredVec = State.get(StoredValues[StoredIdx]); 2672 ++StoredIdx; 2673 2674 if (Group->isReverse()) 2675 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 2676 2677 // If this member has different type, cast it to a unified type. 2678 2679 if (StoredVec->getType() != SubVT) 2680 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 2681 2682 StoredVecs.push_back(StoredVec); 2683 } 2684 2685 // Interleave all the smaller vectors into one wider vector. 
2686 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 2687 Instruction *NewStoreInstr; 2688 if (BlockInMask || MaskForGaps) { 2689 Value *GroupMask = CreateGroupMask(MaskForGaps); 2690 NewStoreInstr = State.Builder.CreateMaskedStore( 2691 IVec, ResAddr, Group->getAlign(), GroupMask); 2692 } else 2693 NewStoreInstr = 2694 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign()); 2695 2696 Group->addMetadata(NewStoreInstr); 2697 } 2698 2699 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2700 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 2701 VPSlotTracker &SlotTracker) const { 2702 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 2703 IG->getInsertPos()->printAsOperand(O, false); 2704 O << ", "; 2705 getAddr()->printAsOperand(O, SlotTracker); 2706 VPValue *Mask = getMask(); 2707 if (Mask) { 2708 O << ", "; 2709 Mask->printAsOperand(O, SlotTracker); 2710 } 2711 2712 unsigned OpIdx = 0; 2713 for (unsigned i = 0; i < IG->getFactor(); ++i) { 2714 if (!IG->getMember(i)) 2715 continue; 2716 if (getNumStoreOperands() > 0) { 2717 O << "\n" << Indent << " store "; 2718 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 2719 O << " to index " << i; 2720 } else { 2721 O << "\n" << Indent << " "; 2722 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 2723 O << " = load from index " << i; 2724 } 2725 ++OpIdx; 2726 } 2727 } 2728 #endif 2729 2730 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 2731 Value *Start = getStartValue()->getLiveInIRValue(); 2732 PHINode *Phi = PHINode::Create(Start->getType(), 2, "index"); 2733 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2734 2735 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2736 Phi->addIncoming(Start, VectorPH); 2737 Phi->setDebugLoc(getDebugLoc()); 2738 State.set(this, Phi, /*IsScalar*/ true); 2739 } 2740 2741 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2742 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2743 VPSlotTracker &SlotTracker) const { 2744 O << Indent << "EMIT "; 2745 printAsOperand(O, SlotTracker); 2746 O << " = CANONICAL-INDUCTION "; 2747 printOperands(O, SlotTracker); 2748 } 2749 #endif 2750 2751 bool VPCanonicalIVPHIRecipe::isCanonical( 2752 InductionDescriptor::InductionKind Kind, VPValue *Start, 2753 VPValue *Step) const { 2754 // Must be an integer induction. 2755 if (Kind != InductionDescriptor::IK_IntInduction) 2756 return false; 2757 // Start must match the start value of this canonical induction. 2758 if (Start != getStartValue()) 2759 return false; 2760 2761 // If the step is defined by a recipe, it is not a ConstantInt. 
2762 if (Step->getDefiningRecipe()) 2763 return false; 2764 2765 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 2766 return StepC && StepC->isOne(); 2767 } 2768 2769 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 2770 return IsScalarAfterVectorization && 2771 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 2772 } 2773 2774 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 2775 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 2776 "Not a pointer induction according to InductionDescriptor!"); 2777 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 2778 "Unexpected type."); 2779 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 2780 "Recipe should have been replaced"); 2781 2782 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 2783 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true)); 2784 unsigned CurrentPart = getUnrollPart(*this); 2785 2786 // Build a pointer phi 2787 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 2788 Type *ScStValueType = ScalarStartValue->getType(); 2789 2790 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2791 PHINode *NewPointerPhi = nullptr; 2792 if (CurrentPart == 0) { 2793 NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 2794 CanonicalIV->getIterator()); 2795 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 2796 } else { 2797 // The recipe has been unrolled. In that case, fetch the single pointer phi 2798 // shared among all unrolled parts of the recipe. 2799 auto *GEP = 2800 cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); 2801 NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); 2802 } 2803 2804 // A pointer induction, performed by using a gep 2805 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 2806 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 2807 Type *PhiType = IndDesc.getStep()->getType(); 2808 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 2809 // Add induction update using an incorrect block temporarily. The phi node 2810 // will be fixed after VPlan execution. Note that at this point the latch 2811 // block cannot be used, as it does not exist yet. 2812 // TODO: Model increment value in VPlan, by turning the recipe into a 2813 // multi-def and a subclass of VPHeaderPHIRecipe. 2814 if (CurrentPart == 0) { 2815 // The recipe represents the first part of the pointer induction. Create the 2816 // GEP to increment the phi across all unrolled parts. 2817 unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1; 2818 Value *NumUnrolledElems = 2819 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF)); 2820 2821 Value *InductionGEP = GetElementPtrInst::Create( 2822 State.Builder.getInt8Ty(), NewPointerPhi, 2823 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 2824 InductionLoc); 2825 2826 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 2827 } 2828 2829 // Create actual address geps that use the pointer phi as base and a 2830 // vectorized version of the step value (<step*0, ..., step*N>) as offset. 2831 Type *VecPhiType = VectorType::get(PhiType, State.VF); 2832 Value *StartOffsetScalar = State.Builder.CreateMul( 2833 RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); 2834 Value *StartOffset = 2835 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 2836 // Create a vector of consecutive numbers from zero to VF. 
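// E.g. (illustrative, fixed VF = 4, unroll part 1): StartOffset becomes
//   <4, 4, 4, 4> + <0, 1, 2, 3> = <4, 5, 6, 7>.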
2837 StartOffset = State.Builder.CreateAdd( 2838 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 2839 2840 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, 0)) && 2841 "scalar step must be the same across all parts"); 2842 Value *GEP = State.Builder.CreateGEP( 2843 State.Builder.getInt8Ty(), NewPointerPhi, 2844 State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( 2845 State.VF, ScalarStepValue)), 2846 "vector.gep"); 2847 State.set(this, GEP); 2848 } 2849 2850 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2851 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2852 VPSlotTracker &SlotTracker) const { 2853 assert((getNumOperands() == 2 || getNumOperands() == 4) && 2854 "unexpected number of operands"); 2855 O << Indent << "EMIT "; 2856 printAsOperand(O, SlotTracker); 2857 O << " = WIDEN-POINTER-INDUCTION "; 2858 getStartValue()->printAsOperand(O, SlotTracker); 2859 O << ", " << *IndDesc.getStep(); 2860 if (getNumOperands() == 4) { 2861 O << ", "; 2862 getOperand(2)->printAsOperand(O, SlotTracker); 2863 O << ", "; 2864 getOperand(3)->printAsOperand(O, SlotTracker); 2865 } 2866 } 2867 #endif 2868 2869 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 2870 assert(!State.Instance && "cannot be used in per-lane"); 2871 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2872 SCEVExpander Exp(SE, DL, "induction"); 2873 2874 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 2875 &*State.Builder.GetInsertPoint()); 2876 assert(!State.ExpandedSCEVs.contains(Expr) && 2877 "Same SCEV expanded multiple times"); 2878 State.ExpandedSCEVs[Expr] = Res; 2879 State.set(this, Res, {0, 0}); 2880 } 2881 2882 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2883 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 2884 VPSlotTracker &SlotTracker) const { 2885 O << Indent << "EMIT "; 2886 getVPSingleValue()->printAsOperand(O, SlotTracker); 2887 O << " = EXPAND SCEV " << *Expr; 2888 } 2889 #endif 2890 2891 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 2892 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); 2893 Type *STy = CanonicalIV->getType(); 2894 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 2895 ElementCount VF = State.VF; 2896 Value *VStart = VF.isScalar() 2897 ? CanonicalIV 2898 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 2899 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); 2900 if (VF.isVector()) { 2901 VStep = Builder.CreateVectorSplat(VF, VStep); 2902 VStep = 2903 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 2904 } 2905 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 2906 State.set(this, CanonicalVectorIV); 2907 } 2908 2909 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2910 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 2911 VPSlotTracker &SlotTracker) const { 2912 O << Indent << "EMIT "; 2913 printAsOperand(O, SlotTracker); 2914 O << " = WIDEN-CANONICAL-INDUCTION "; 2915 printOperands(O, SlotTracker); 2916 } 2917 #endif 2918 2919 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 2920 auto &Builder = State.Builder; 2921 // Create a vector from the initial value. 2922 auto *VectorInit = getStartValue()->getLiveInIRValue(); 2923 2924 Type *VecTy = State.VF.isScalar() 2925 ? 
VectorInit->getType() 2926 : VectorType::get(VectorInit->getType(), State.VF); 2927 2928 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2929 if (State.VF.isVector()) { 2930 auto *IdxTy = Builder.getInt32Ty(); 2931 auto *One = ConstantInt::get(IdxTy, 1); 2932 IRBuilder<>::InsertPointGuard Guard(Builder); 2933 Builder.SetInsertPoint(VectorPH->getTerminator()); 2934 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 2935 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 2936 VectorInit = Builder.CreateInsertElement( 2937 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 2938 } 2939 2940 // Create a phi node for the new recurrence. 2941 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur"); 2942 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2943 Phi->addIncoming(VectorInit, VectorPH); 2944 State.set(this, Phi); 2945 } 2946 2947 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2948 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 2949 VPSlotTracker &SlotTracker) const { 2950 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 2951 printAsOperand(O, SlotTracker); 2952 O << " = phi "; 2953 printOperands(O, SlotTracker); 2954 } 2955 #endif 2956 2957 void VPReductionPHIRecipe::execute(VPTransformState &State) { 2958 auto &Builder = State.Builder; 2959 2960 // Reductions do not have to start at zero. They can start with 2961 // any loop invariant values. 2962 VPValue *StartVPV = getStartValue(); 2963 Value *StartV = StartVPV->getLiveInIRValue(); 2964 2965 // In order to support recurrences we need to be able to vectorize Phi nodes. 2966 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 2967 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 2968 // this value when we vectorize all of the instructions that use the PHI. 2969 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 2970 Type *VecTy = ScalarPHI ? StartV->getType() 2971 : VectorType::get(StartV->getType(), State.VF); 2972 2973 BasicBlock *HeaderBB = State.CFG.PrevBB; 2974 assert(State.CurrentVectorLoop->getHeader() == HeaderBB && 2975 "recipe must be in the vector loop header"); 2976 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); 2977 Phi->insertBefore(HeaderBB->getFirstInsertionPt()); 2978 State.set(this, Phi, IsInLoop); 2979 2980 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2981 2982 Value *Iden = nullptr; 2983 RecurKind RK = RdxDesc.getRecurrenceKind(); 2984 unsigned CurrentPart = getUnrollPart(*this); 2985 2986 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 2987 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 2988 // MinMax and AnyOf reductions have the start value as their identity. 2989 if (ScalarPHI) { 2990 Iden = StartV; 2991 } else { 2992 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 2993 Builder.SetInsertPoint(VectorPH->getTerminator()); 2994 StartV = Iden = State.get(StartVPV); 2995 } 2996 } else { 2997 Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), 2998 RdxDesc.getFastMathFlags()); 2999 3000 if (!ScalarPHI) { 3001 if (CurrentPart == 0) { 3002 // Create start and identity vector values for the reduction in the 3003 // preheader. 3004 // TODO: Introduce recipes in VPlan preheader to create initial values. 
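// For an integer add reduction with start value %s and VF 4, the values
// built here are roughly (illustrative):
//   Iden   = <4 x i32> zeroinitializer
//   StartV = insertelement <4 x i32> zeroinitializer, i32 %s, i32 0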
        Iden = Builder.CreateVectorSplat(State.VF, Iden);
        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
        Builder.SetInsertPoint(VectorPH->getTerminator());
        Constant *Zero = Builder.getInt32(0);
        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
      } else {
        Iden = Builder.CreateVectorSplat(State.VF, Iden);
      }
    }
  }

  Phi = cast<PHINode>(State.get(this, IsInLoop));
  Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
  Phi->addIncoming(StartVal, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
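// As with the other header phi recipes above, only the preheader incoming
// value is set here; the loop-carried (latch) incoming value is added later,
// once the latch of the vector loop has been generated.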
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  Phi->addIncoming(Start, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif