1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "VPlan.h" 15 #include "VPlanAnalysis.h" 16 #include "VPlanPatternMatch.h" 17 #include "VPlanUtils.h" 18 #include "llvm/ADT/STLExtras.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/IVDescriptors.h" 22 #include "llvm/IR/BasicBlock.h" 23 #include "llvm/IR/IRBuilder.h" 24 #include "llvm/IR/Instruction.h" 25 #include "llvm/IR/Instructions.h" 26 #include "llvm/IR/Intrinsics.h" 27 #include "llvm/IR/Type.h" 28 #include "llvm/IR/Value.h" 29 #include "llvm/IR/VectorBuilder.h" 30 #include "llvm/Support/Casting.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 35 #include "llvm/Transforms/Utils/LoopUtils.h" 36 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 37 #include <cassert> 38 39 using namespace llvm; 40 41 using VectorParts = SmallVector<Value *, 2>; 42 43 namespace llvm { 44 extern cl::opt<bool> EnableVPlanNativePath; 45 } 46 extern cl::opt<unsigned> ForceTargetInstructionCost; 47 48 #define LV_NAME "loop-vectorize" 49 #define DEBUG_TYPE LV_NAME 50 51 bool VPRecipeBase::mayWriteToMemory() const { 52 switch (getVPDefID()) { 53 case VPInstructionSC: 54 if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode())) 55 return false; 56 switch (cast<VPInstruction>(this)->getOpcode()) { 57 case Instruction::Or: 58 case Instruction::ICmp: 59 case Instruction::Select: 60 case VPInstruction::AnyOf: 61 case VPInstruction::Not: 62 case VPInstruction::CalculateTripCountMinusVF: 63 case VPInstruction::CanonicalIVIncrementForPart: 64 case VPInstruction::ExtractFromEnd: 65 case VPInstruction::FirstOrderRecurrenceSplice: 66 case VPInstruction::LogicalAnd: 67 case VPInstruction::PtrAdd: 68 return false; 69 default: 70 return true; 71 } 72 case VPInterleaveSC: 73 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 74 case VPWidenStoreEVLSC: 75 case VPWidenStoreSC: 76 return true; 77 case VPReplicateSC: 78 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 79 ->mayWriteToMemory(); 80 case VPWidenCallSC: 81 return !cast<VPWidenCallRecipe>(this) 82 ->getCalledScalarFunction() 83 ->onlyReadsMemory(); 84 case VPWidenIntrinsicSC: 85 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory(); 86 case VPBranchOnMaskSC: 87 case VPScalarIVStepsSC: 88 case VPPredInstPHISC: 89 return false; 90 case VPBlendSC: 91 case VPReductionEVLSC: 92 case VPReductionSC: 93 case VPVectorPointerSC: 94 case VPWidenCanonicalIVSC: 95 case VPWidenCastSC: 96 case VPWidenGEPSC: 97 case VPWidenIntOrFpInductionSC: 98 case VPWidenLoadEVLSC: 99 case VPWidenLoadSC: 100 case VPWidenPHISC: 101 case VPWidenSC: 102 case VPWidenEVLSC: 103 case VPWidenSelectSC: { 104 const Instruction *I = 105 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 106 (void)I; 107 assert((!I || !I->mayWriteToMemory()) && 108 "underlying instruction may write to memory"); 109 return false; 110 } 111 default: 112 return true; 113 } 114 } 115 116 bool VPRecipeBase::mayReadFromMemory() const { 117 switch (getVPDefID()) { 118 case VPWidenLoadEVLSC: 119 case VPWidenLoadSC: 120 return true; 121 case VPReplicateSC: 122 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 123 ->mayReadFromMemory(); 124 case VPWidenCallSC: 125 return !cast<VPWidenCallRecipe>(this) 126 ->getCalledScalarFunction() 127 ->onlyWritesMemory(); 128 case VPWidenIntrinsicSC: 129 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory(); 130 case VPBranchOnMaskSC: 131 case VPPredInstPHISC: 132 case VPScalarIVStepsSC: 133 case VPWidenStoreEVLSC: 134 case VPWidenStoreSC: 135 return false; 136 case VPBlendSC: 137 case VPReductionEVLSC: 138 case VPReductionSC: 139 case VPVectorPointerSC: 140 case VPWidenCanonicalIVSC: 141 case VPWidenCastSC: 142 case VPWidenGEPSC: 143 case VPWidenIntOrFpInductionSC: 144 case VPWidenPHISC: 145 case VPWidenSC: 146 case VPWidenEVLSC: 147 case VPWidenSelectSC: { 148 const Instruction *I = 149 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 150 (void)I; 151 assert((!I || !I->mayReadFromMemory()) && 152 "underlying instruction may read from memory"); 153 return false; 154 } 155 default: 156 return true; 157 } 158 } 159 160 bool VPRecipeBase::mayHaveSideEffects() const { 161 switch (getVPDefID()) { 162 case VPDerivedIVSC: 163 case VPPredInstPHISC: 164 case VPScalarCastSC: 165 case VPReverseVectorPointerSC: 166 return false; 167 case VPInstructionSC: 168 return mayWriteToMemory(); 169 case VPWidenCallSC: { 170 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); 171 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); 172 } 173 case VPWidenIntrinsicSC: 174 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); 175 case VPBlendSC: 176 case VPReductionEVLSC: 177 case VPReductionSC: 178 case VPScalarIVStepsSC: 179 case VPVectorPointerSC: 180 case VPWidenCanonicalIVSC: 181 case VPWidenCastSC: 182 case VPWidenGEPSC: 183 case VPWidenIntOrFpInductionSC: 184 case VPWidenPHISC: 185 case VPWidenPointerInductionSC: 186 case VPWidenSC: 187 case VPWidenEVLSC: 188 case VPWidenSelectSC: { 189 const Instruction *I = 190 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 191 (void)I; 192 assert((!I || !I->mayHaveSideEffects()) && 193 "underlying instruction has side-effects"); 194 return false; 195 } 196 case VPInterleaveSC: 197 return mayWriteToMemory(); 198 case VPWidenLoadEVLSC: 199 case VPWidenLoadSC: 200 case VPWidenStoreEVLSC: 201 case VPWidenStoreSC: 202 assert( 203 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == 204 mayWriteToMemory() && 205 "mayHaveSideffects result for ingredient differs from this " 206 "implementation"); 207 return mayWriteToMemory(); 208 case VPReplicateSC: { 209 auto *R = cast<VPReplicateRecipe>(this); 210 return R->getUnderlyingInstr()->mayHaveSideEffects(); 211 } 212 default: 213 return true; 214 } 215 } 216 217 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 218 assert(!Parent && "Recipe already in some VPBasicBlock"); 219 assert(InsertPos->getParent() && 220 "Insertion position not in any VPBasicBlock"); 221 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 222 } 223 224 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 225 iplist<VPRecipeBase>::iterator I) { 226 assert(!Parent && "Recipe already in some VPBasicBlock"); 227 assert(I == BB.end() || I->getParent() == &BB); 228 BB.insert(this, I); 229 } 230 231 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 232 assert(!Parent && "Recipe already in some VPBasicBlock"); 233 assert(InsertPos->getParent() && 234 "Insertion position not in any VPBasicBlock"); 235 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 236 } 237 238 void VPRecipeBase::removeFromParent() { 239 assert(getParent() && "Recipe not in any VPBasicBlock"); 240 getParent()->getRecipeList().remove(getIterator()); 241 Parent = nullptr; 242 } 243 244 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 245 assert(getParent() && "Recipe not in any VPBasicBlock"); 246 return getParent()->getRecipeList().erase(getIterator()); 247 } 248 249 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 250 removeFromParent(); 251 insertAfter(InsertPos); 252 } 253 254 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 255 iplist<VPRecipeBase>::iterator I) { 256 removeFromParent(); 257 insertBefore(BB, I); 258 } 259 260 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 261 // Get the underlying instruction for the recipe, if there is one. It is used 262 // to 263 // * decide if cost computation should be skipped for this recipe, 264 // * apply forced target instruction cost. 265 Instruction *UI = nullptr; 266 if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) 267 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 268 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) 269 UI = IG->getInsertPos(); 270 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) 271 UI = &WidenMem->getIngredient(); 272 273 InstructionCost RecipeCost; 274 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { 275 RecipeCost = 0; 276 } else { 277 RecipeCost = computeCost(VF, Ctx); 278 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && 279 RecipeCost.isValid()) 280 RecipeCost = InstructionCost(ForceTargetInstructionCost); 281 } 282 283 LLVM_DEBUG({ 284 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 285 dump(); 286 }); 287 return RecipeCost; 288 } 289 290 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 291 VPCostContext &Ctx) const { 292 llvm_unreachable("subclasses should implement computeCost"); 293 } 294 295 InstructionCost 296 VPPartialReductionRecipe::computeCost(ElementCount VF, 297 VPCostContext &Ctx) const { 298 std::optional<unsigned> Opcode = std::nullopt; 299 VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); 300 if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR)) 301 Opcode = std::make_optional(WidenR->getOpcode()); 302 303 VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); 304 VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); 305 306 auto GetExtendKind = [](VPRecipeBase *R) { 307 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R); 308 if (!WidenCastR) 309 return TargetTransformInfo::PR_None; 310 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) 311 return TargetTransformInfo::PR_ZeroExtend; 312 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) 313 return TargetTransformInfo::PR_SignExtend; 314 return TargetTransformInfo::PR_None; 315 }; 316 317 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); 318 auto *ExtTy = Ctx.Types.inferScalarType(ExtAR->getOperand(0)); 319 320 return Ctx.TTI.getPartialReductionCost(getOpcode(), ExtTy, PhiType, VF, 321 GetExtendKind(ExtAR), 322 GetExtendKind(ExtBR), Opcode); 323 } 324 325 void VPPartialReductionRecipe::execute(VPTransformState &State) { 326 State.setDebugLocFrom(getDebugLoc()); 327 auto &Builder = State.Builder; 328 329 assert(getOpcode() == Instruction::Add && 330 "Unhandled partial reduction opcode"); 331 332 Value *BinOpVal = State.get(getOperand(0)); 333 Value *PhiVal = State.get(getOperand(1)); 334 assert(PhiVal && BinOpVal && "Phi and Mul must be set"); 335 336 Type *RetTy = PhiVal->getType(); 337 338 CallInst *V = Builder.CreateIntrinsic( 339 RetTy, Intrinsic::experimental_vector_partial_reduce_add, 340 {PhiVal, BinOpVal}, nullptr, "partial.reduce"); 341 342 State.set(this, V); 343 } 344 345 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 346 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, 347 VPSlotTracker &SlotTracker) const { 348 O << Indent << "PARTIAL-REDUCE "; 349 printAsOperand(O, SlotTracker); 350 O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; 351 printOperands(O, SlotTracker); 352 } 353 #endif 354 355 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 356 assert(OpType == OperationType::FPMathOp && 357 "recipe doesn't have fast math flags"); 358 FastMathFlags Res; 359 Res.setAllowReassoc(FMFs.AllowReassoc); 360 Res.setNoNaNs(FMFs.NoNaNs); 361 Res.setNoInfs(FMFs.NoInfs); 362 Res.setNoSignedZeros(FMFs.NoSignedZeros); 363 Res.setAllowReciprocal(FMFs.AllowReciprocal); 364 Res.setAllowContract(FMFs.AllowContract); 365 Res.setApproxFunc(FMFs.ApproxFunc); 366 return Res; 367 } 368 369 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 370 void VPSingleDefRecipe::dump() const { VPDef::dump(); } 371 #endif 372 373 template <unsigned PartOpIdx> 374 VPValue * 375 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { 376 if (U.getNumOperands() == PartOpIdx + 1) 377 return U.getOperand(PartOpIdx); 378 return nullptr; 379 } 380 381 template <unsigned PartOpIdx> 382 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { 383 if (auto *UnrollPartOp = getUnrollPartOperand(U)) 384 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); 385 return 0; 386 } 387 388 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 389 VPValue *A, VPValue *B, DebugLoc DL, 390 const Twine &Name) 391 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 392 Pred, DL), 393 Opcode(Opcode), Name(Name.str()) { 394 assert(Opcode == Instruction::ICmp && 395 "only ICmp predicates supported at the moment"); 396 } 397 398 VPInstruction::VPInstruction(unsigned Opcode, 399 std::initializer_list<VPValue *> Operands, 400 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 401 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 402 Opcode(Opcode), Name(Name.str()) { 403 // Make sure the VPInstruction is a floating-point operation. 404 assert(isFPMathOp() && "this op can't take fast-math flags"); 405 } 406 407 bool VPInstruction::doesGeneratePerAllLanes() const { 408 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 409 } 410 411 bool VPInstruction::canGenerateScalarForFirstLane() const { 412 if (Instruction::isBinaryOp(getOpcode())) 413 return true; 414 if (isSingleScalar() || isVectorToScalar()) 415 return true; 416 switch (Opcode) { 417 case Instruction::ICmp: 418 case Instruction::Select: 419 case VPInstruction::BranchOnCond: 420 case VPInstruction::BranchOnCount: 421 case VPInstruction::CalculateTripCountMinusVF: 422 case VPInstruction::CanonicalIVIncrementForPart: 423 case VPInstruction::PtrAdd: 424 case VPInstruction::ExplicitVectorLength: 425 case VPInstruction::AnyOf: 426 return true; 427 default: 428 return false; 429 } 430 } 431 432 Value *VPInstruction::generatePerLane(VPTransformState &State, 433 const VPLane &Lane) { 434 IRBuilderBase &Builder = State.Builder; 435 436 assert(getOpcode() == VPInstruction::PtrAdd && 437 "only PtrAdd opcodes are supported for now"); 438 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 439 State.get(getOperand(1), Lane), Name); 440 } 441 442 Value *VPInstruction::generate(VPTransformState &State) { 443 IRBuilderBase &Builder = State.Builder; 444 445 if (Instruction::isBinaryOp(getOpcode())) { 446 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 447 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 448 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 449 auto *Res = 450 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 451 if (auto *I = dyn_cast<Instruction>(Res)) 452 setFlags(I); 453 return Res; 454 } 455 456 switch (getOpcode()) { 457 case VPInstruction::Not: { 458 Value *A = State.get(getOperand(0)); 459 return Builder.CreateNot(A, Name); 460 } 461 case Instruction::ICmp: { 462 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 463 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 464 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 465 return Builder.CreateCmp(getPredicate(), A, B, Name); 466 } 467 case Instruction::Select: { 468 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 469 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); 470 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); 471 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); 472 return Builder.CreateSelect(Cond, Op1, Op2, Name); 473 } 474 case VPInstruction::ActiveLaneMask: { 475 // Get first lane of vector induction variable. 476 Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); 477 // Get the original loop tripcount. 478 Value *ScalarTC = State.get(getOperand(1), VPLane(0)); 479 480 // If this part of the active lane mask is scalar, generate the CMP directly 481 // to avoid unnecessary extracts. 482 if (State.VF.isScalar()) 483 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 484 Name); 485 486 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 487 auto *PredTy = VectorType::get(Int1Ty, State.VF); 488 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 489 {PredTy, ScalarTC->getType()}, 490 {VIVElem0, ScalarTC}, nullptr, Name); 491 } 492 case VPInstruction::FirstOrderRecurrenceSplice: { 493 // Generate code to combine the previous and current values in vector v3. 494 // 495 // vector.ph: 496 // v_init = vector(..., ..., ..., a[-1]) 497 // br vector.body 498 // 499 // vector.body 500 // i = phi [0, vector.ph], [i+4, vector.body] 501 // v1 = phi [v_init, vector.ph], [v2, vector.body] 502 // v2 = a[i, i+1, i+2, i+3]; 503 // v3 = vector(v1(3), v2(0, 1, 2)) 504 505 auto *V1 = State.get(getOperand(0)); 506 if (!V1->getType()->isVectorTy()) 507 return V1; 508 Value *V2 = State.get(getOperand(1)); 509 return Builder.CreateVectorSplice(V1, V2, -1, Name); 510 } 511 case VPInstruction::CalculateTripCountMinusVF: { 512 unsigned UF = getParent()->getPlan()->getUF(); 513 Value *ScalarTC = State.get(getOperand(0), VPLane(0)); 514 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); 515 Value *Sub = Builder.CreateSub(ScalarTC, Step); 516 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 517 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 518 return Builder.CreateSelect(Cmp, Sub, Zero); 519 } 520 case VPInstruction::ExplicitVectorLength: { 521 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 522 // be outside of the main loop. 523 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); 524 // Compute EVL 525 assert(AVL->getType()->isIntegerTy() && 526 "Requested vector length should be an integer."); 527 528 assert(State.VF.isScalable() && "Expected scalable vector factor."); 529 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 530 531 Value *EVL = State.Builder.CreateIntrinsic( 532 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 533 {AVL, VFArg, State.Builder.getTrue()}); 534 return EVL; 535 } 536 case VPInstruction::CanonicalIVIncrementForPart: { 537 unsigned Part = getUnrollPart(*this); 538 auto *IV = State.get(getOperand(0), VPLane(0)); 539 assert(Part != 0 && "Must have a positive part"); 540 // The canonical IV is incremented by the vectorization factor (num of 541 // SIMD elements) times the unroll part. 542 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 543 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 544 hasNoSignedWrap()); 545 } 546 case VPInstruction::BranchOnCond: { 547 Value *Cond = State.get(getOperand(0), VPLane(0)); 548 // Replace the temporary unreachable terminator with a new conditional 549 // branch, hooking it up to backward destination for exiting blocks now and 550 // to forward destination(s) later when they are created. 551 BranchInst *CondBr = 552 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 553 CondBr->setSuccessor(0, nullptr); 554 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 555 556 if (!getParent()->isExiting()) 557 return CondBr; 558 559 VPRegionBlock *ParentRegion = getParent()->getParent(); 560 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 561 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 562 return CondBr; 563 } 564 case VPInstruction::BranchOnCount: { 565 // First create the compare. 566 Value *IV = State.get(getOperand(0), /*IsScalar*/ true); 567 Value *TC = State.get(getOperand(1), /*IsScalar*/ true); 568 Value *Cond = Builder.CreateICmpEQ(IV, TC); 569 570 // Now create the branch. 571 auto *Plan = getParent()->getPlan(); 572 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 573 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 574 575 // Replace the temporary unreachable terminator with a new conditional 576 // branch, hooking it up to backward destination (the header) now and to the 577 // forward destination (the exit/middle block) later when it is created. 578 // Note that CreateCondBr expects a valid BB as first argument, so we need 579 // to set it to nullptr later. 580 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 581 State.CFG.VPBB2IRBB[Header]); 582 CondBr->setSuccessor(0, nullptr); 583 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 584 return CondBr; 585 } 586 case VPInstruction::ComputeReductionResult: { 587 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 588 // and will be removed by breaking up the recipe further. 589 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 590 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 591 // Get its reduction variable descriptor. 592 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 593 594 RecurKind RK = RdxDesc.getRecurrenceKind(); 595 596 Type *PhiTy = OrigPhi->getType(); 597 // The recipe's operands are the reduction phi, followed by one operand for 598 // each part of the reduction. 599 unsigned UF = getNumOperands() - 1; 600 VectorParts RdxParts(UF); 601 for (unsigned Part = 0; Part < UF; ++Part) 602 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); 603 604 // If the vector reduction can be performed in a smaller type, we truncate 605 // then extend the loop exit value to enable InstCombine to evaluate the 606 // entire expression in the smaller type. 607 // TODO: Handle this in truncateToMinBW. 608 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 609 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 610 for (unsigned Part = 0; Part < UF; ++Part) 611 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 612 } 613 // Reduce all of the unrolled parts into a single vector. 614 Value *ReducedPartRdx = RdxParts[0]; 615 unsigned Op = RdxDesc.getOpcode(); 616 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 617 Op = Instruction::Or; 618 619 if (PhiR->isOrdered()) { 620 ReducedPartRdx = RdxParts[UF - 1]; 621 } else { 622 // Floating-point operations should have some FMF to enable the reduction. 623 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 624 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 625 for (unsigned Part = 1; Part < UF; ++Part) { 626 Value *RdxPart = RdxParts[Part]; 627 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 628 ReducedPartRdx = Builder.CreateBinOp( 629 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 630 else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) 631 ReducedPartRdx = 632 createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart); 633 else 634 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 635 } 636 } 637 638 // Create the reduction after the loop. Note that inloop reductions create 639 // the target reduction in the loop using a Reduction recipe. 640 if ((State.VF.isVector() || 641 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || 642 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && 643 !PhiR->isInLoop()) { 644 ReducedPartRdx = 645 createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); 646 // If the reduction can be performed in a smaller type, we need to extend 647 // the reduction to the wider type before we branch to the original loop. 648 if (PhiTy != RdxDesc.getRecurrenceType()) 649 ReducedPartRdx = RdxDesc.isSigned() 650 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 651 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 652 } 653 654 return ReducedPartRdx; 655 } 656 case VPInstruction::ExtractFromEnd: { 657 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); 658 unsigned Offset = CI->getZExtValue(); 659 assert(Offset > 0 && "Offset from end must be positive"); 660 Value *Res; 661 if (State.VF.isVector()) { 662 assert(Offset <= State.VF.getKnownMinValue() && 663 "invalid offset to extract from"); 664 // Extract lane VF - Offset from the operand. 665 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); 666 } else { 667 assert(Offset <= 1 && "invalid offset to extract from"); 668 Res = State.get(getOperand(0)); 669 } 670 if (isa<ExtractElementInst>(Res)) 671 Res->setName(Name); 672 return Res; 673 } 674 case VPInstruction::LogicalAnd: { 675 Value *A = State.get(getOperand(0)); 676 Value *B = State.get(getOperand(1)); 677 return Builder.CreateLogicalAnd(A, B, Name); 678 } 679 case VPInstruction::PtrAdd: { 680 assert(vputils::onlyFirstLaneUsed(this) && 681 "can only generate first lane for PtrAdd"); 682 Value *Ptr = State.get(getOperand(0), VPLane(0)); 683 Value *Addend = State.get(getOperand(1), VPLane(0)); 684 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); 685 } 686 case VPInstruction::ResumePhi: { 687 Value *IncomingFromVPlanPred = 688 State.get(getOperand(0), /* IsScalar */ true); 689 Value *IncomingFromOtherPreds = 690 State.get(getOperand(1), /* IsScalar */ true); 691 auto *NewPhi = 692 Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name); 693 BasicBlock *VPlanPred = 694 State.CFG 695 .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])]; 696 NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred); 697 for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) { 698 if (OtherPred == VPlanPred) 699 continue; 700 NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred); 701 } 702 return NewPhi; 703 } 704 case VPInstruction::AnyOf: { 705 Value *A = State.get(getOperand(0)); 706 return Builder.CreateOrReduce(A); 707 } 708 709 default: 710 llvm_unreachable("Unsupported opcode for instruction"); 711 } 712 } 713 714 bool VPInstruction::isVectorToScalar() const { 715 return getOpcode() == VPInstruction::ExtractFromEnd || 716 getOpcode() == VPInstruction::ComputeReductionResult || 717 getOpcode() == VPInstruction::AnyOf; 718 } 719 720 bool VPInstruction::isSingleScalar() const { 721 return getOpcode() == VPInstruction::ResumePhi; 722 } 723 724 #if !defined(NDEBUG) 725 bool VPInstruction::isFPMathOp() const { 726 // Inspired by FPMathOperator::classof. Notable differences are that we don't 727 // support Call, PHI and Select opcodes here yet. 728 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 729 Opcode == Instruction::FNeg || Opcode == Instruction::FSub || 730 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 731 Opcode == Instruction::FCmp || Opcode == Instruction::Select; 732 } 733 #endif 734 735 void VPInstruction::execute(VPTransformState &State) { 736 assert(!State.Lane && "VPInstruction executing an Lane"); 737 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 738 assert((hasFastMathFlags() == isFPMathOp() || 739 getOpcode() == Instruction::Select) && 740 "Recipe not a FPMathOp but has fast-math flags?"); 741 if (hasFastMathFlags()) 742 State.Builder.setFastMathFlags(getFastMathFlags()); 743 State.setDebugLocFrom(getDebugLoc()); 744 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && 745 (vputils::onlyFirstLaneUsed(this) || 746 isVectorToScalar() || isSingleScalar()); 747 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 748 if (GeneratesPerAllLanes) { 749 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); 750 Lane != NumLanes; ++Lane) { 751 Value *GeneratedValue = generatePerLane(State, VPLane(Lane)); 752 assert(GeneratedValue && "generatePerLane must produce a value"); 753 State.set(this, GeneratedValue, VPLane(Lane)); 754 } 755 return; 756 } 757 758 Value *GeneratedValue = generate(State); 759 if (!hasResult()) 760 return; 761 assert(GeneratedValue && "generate must produce a value"); 762 assert( 763 (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly || 764 State.VF.isScalar()) && 765 "scalar value but not only first lane defined"); 766 State.set(this, GeneratedValue, 767 /*IsScalar*/ GeneratesPerFirstLaneOnly); 768 } 769 770 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 771 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 772 if (Instruction::isBinaryOp(getOpcode())) 773 return vputils::onlyFirstLaneUsed(this); 774 775 switch (getOpcode()) { 776 default: 777 return false; 778 case Instruction::ICmp: 779 case Instruction::Select: 780 case Instruction::Or: 781 case VPInstruction::PtrAdd: 782 // TODO: Cover additional opcodes. 783 return vputils::onlyFirstLaneUsed(this); 784 case VPInstruction::ActiveLaneMask: 785 case VPInstruction::ExplicitVectorLength: 786 case VPInstruction::CalculateTripCountMinusVF: 787 case VPInstruction::CanonicalIVIncrementForPart: 788 case VPInstruction::BranchOnCount: 789 case VPInstruction::BranchOnCond: 790 case VPInstruction::ResumePhi: 791 return true; 792 }; 793 llvm_unreachable("switch should return"); 794 } 795 796 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 797 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 798 if (Instruction::isBinaryOp(getOpcode())) 799 return vputils::onlyFirstPartUsed(this); 800 801 switch (getOpcode()) { 802 default: 803 return false; 804 case Instruction::ICmp: 805 case Instruction::Select: 806 return vputils::onlyFirstPartUsed(this); 807 case VPInstruction::BranchOnCount: 808 case VPInstruction::BranchOnCond: 809 case VPInstruction::CanonicalIVIncrementForPart: 810 return true; 811 }; 812 llvm_unreachable("switch should return"); 813 } 814 815 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 816 void VPInstruction::dump() const { 817 VPSlotTracker SlotTracker(getParent()->getPlan()); 818 print(dbgs(), "", SlotTracker); 819 } 820 821 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 822 VPSlotTracker &SlotTracker) const { 823 O << Indent << "EMIT "; 824 825 if (hasResult()) { 826 printAsOperand(O, SlotTracker); 827 O << " = "; 828 } 829 830 switch (getOpcode()) { 831 case VPInstruction::Not: 832 O << "not"; 833 break; 834 case VPInstruction::SLPLoad: 835 O << "combined load"; 836 break; 837 case VPInstruction::SLPStore: 838 O << "combined store"; 839 break; 840 case VPInstruction::ActiveLaneMask: 841 O << "active lane mask"; 842 break; 843 case VPInstruction::ResumePhi: 844 O << "resume-phi"; 845 break; 846 case VPInstruction::ExplicitVectorLength: 847 O << "EXPLICIT-VECTOR-LENGTH"; 848 break; 849 case VPInstruction::FirstOrderRecurrenceSplice: 850 O << "first-order splice"; 851 break; 852 case VPInstruction::BranchOnCond: 853 O << "branch-on-cond"; 854 break; 855 case VPInstruction::CalculateTripCountMinusVF: 856 O << "TC > VF ? TC - VF : 0"; 857 break; 858 case VPInstruction::CanonicalIVIncrementForPart: 859 O << "VF * Part +"; 860 break; 861 case VPInstruction::BranchOnCount: 862 O << "branch-on-count"; 863 break; 864 case VPInstruction::ExtractFromEnd: 865 O << "extract-from-end"; 866 break; 867 case VPInstruction::ComputeReductionResult: 868 O << "compute-reduction-result"; 869 break; 870 case VPInstruction::LogicalAnd: 871 O << "logical-and"; 872 break; 873 case VPInstruction::PtrAdd: 874 O << "ptradd"; 875 break; 876 case VPInstruction::AnyOf: 877 O << "any-of"; 878 break; 879 default: 880 O << Instruction::getOpcodeName(getOpcode()); 881 } 882 883 printFlags(O); 884 printOperands(O, SlotTracker); 885 886 if (auto DL = getDebugLoc()) { 887 O << ", !dbg "; 888 DL.print(O); 889 } 890 } 891 #endif 892 893 void VPIRInstruction::execute(VPTransformState &State) { 894 assert((isa<PHINode>(&I) || getNumOperands() == 0) && 895 "Only PHINodes can have extra operands"); 896 for (const auto &[Idx, Op] : enumerate(operands())) { 897 VPValue *ExitValue = Op; 898 auto Lane = vputils::isUniformAfterVectorization(ExitValue) 899 ? VPLane::getFirstLane() 900 : VPLane::getLastLaneForVF(State.VF); 901 VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; 902 auto *PredVPBB = Pred->getExitingBasicBlock(); 903 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; 904 // Set insertion point in PredBB in case an extract needs to be generated. 905 // TODO: Model extracts explicitly. 906 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); 907 Value *V = State.get(ExitValue, VPLane(Lane)); 908 auto *Phi = cast<PHINode>(&I); 909 // If there is no existing block for PredBB in the phi, add a new incoming 910 // value. Otherwise update the existing incoming value for PredBB. 911 if (Phi->getBasicBlockIndex(PredBB) == -1) 912 Phi->addIncoming(V, PredBB); 913 else 914 Phi->setIncomingValueForBlock(PredBB, V); 915 } 916 917 // Advance the insert point after the wrapped IR instruction. This allows 918 // interleaving VPIRInstructions and other recipes. 919 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); 920 } 921 922 InstructionCost VPIRInstruction::computeCost(ElementCount VF, 923 VPCostContext &Ctx) const { 924 // The recipe wraps an existing IR instruction on the border of VPlan's scope, 925 // hence it does not contribute to the cost-modeling for the VPlan. 926 return 0; 927 } 928 929 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 930 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, 931 VPSlotTracker &SlotTracker) const { 932 O << Indent << "IR " << I; 933 934 if (getNumOperands() != 0) { 935 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; 936 interleaveComma( 937 enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { 938 Op.value()->printAsOperand(O, SlotTracker); 939 O << " from "; 940 getParent()->getPredecessors()[Op.index()]->printAsOperand(O); 941 }); 942 O << ")"; 943 } 944 } 945 #endif 946 947 void VPWidenCallRecipe::execute(VPTransformState &State) { 948 assert(State.VF.isVector() && "not widening"); 949 State.setDebugLocFrom(getDebugLoc()); 950 951 FunctionType *VFTy = Variant->getFunctionType(); 952 // Add return type if intrinsic is overloaded on it. 953 SmallVector<Value *, 4> Args; 954 for (const auto &I : enumerate(arg_operands())) { 955 Value *Arg; 956 // Some vectorized function variants may also take a scalar argument, 957 // e.g. linear parameters for pointers. This needs to be the scalar value 958 // from the start of the respective part when interleaving. 959 if (!VFTy->getParamType(I.index())->isVectorTy()) 960 Arg = State.get(I.value(), VPLane(0)); 961 else 962 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 963 Args.push_back(Arg); 964 } 965 966 assert(Variant != nullptr && "Can't create vector function."); 967 968 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 969 SmallVector<OperandBundleDef, 1> OpBundles; 970 if (CI) 971 CI->getOperandBundlesAsDefs(OpBundles); 972 973 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); 974 setFlags(V); 975 976 if (!V->getType()->isVoidTy()) 977 State.set(this, V); 978 State.addMetadata(V, CI); 979 } 980 981 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, 982 VPCostContext &Ctx) const { 983 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 984 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), 985 Variant->getFunctionType()->params(), 986 CostKind); 987 } 988 989 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 990 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 991 VPSlotTracker &SlotTracker) const { 992 O << Indent << "WIDEN-CALL "; 993 994 Function *CalledFn = getCalledScalarFunction(); 995 if (CalledFn->getReturnType()->isVoidTy()) 996 O << "void "; 997 else { 998 printAsOperand(O, SlotTracker); 999 O << " = "; 1000 } 1001 1002 O << "call"; 1003 printFlags(O); 1004 O << " @" << CalledFn->getName() << "("; 1005 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 1006 Op->printAsOperand(O, SlotTracker); 1007 }); 1008 O << ")"; 1009 1010 O << " (using library function"; 1011 if (Variant->hasName()) 1012 O << ": " << Variant->getName(); 1013 O << ")"; 1014 } 1015 #endif 1016 1017 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { 1018 assert(State.VF.isVector() && "not widening"); 1019 State.setDebugLocFrom(getDebugLoc()); 1020 1021 SmallVector<Type *, 2> TysForDecl; 1022 // Add return type if intrinsic is overloaded on it. 1023 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) 1024 TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); 1025 SmallVector<Value *, 4> Args; 1026 for (const auto &I : enumerate(operands())) { 1027 // Some intrinsics have a scalar argument - don't replace it with a 1028 // vector. 1029 Value *Arg; 1030 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), 1031 State.TTI)) 1032 Arg = State.get(I.value(), VPLane(0)); 1033 else 1034 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 1035 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), 1036 State.TTI)) 1037 TysForDecl.push_back(Arg->getType()); 1038 Args.push_back(Arg); 1039 } 1040 1041 // Use vector version of the intrinsic. 1042 Module *M = State.Builder.GetInsertBlock()->getModule(); 1043 Function *VectorF = 1044 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); 1045 assert(VectorF && 1046 "Can't retrieve vector intrinsic or vector-predication intrinsics."); 1047 1048 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 1049 SmallVector<OperandBundleDef, 1> OpBundles; 1050 if (CI) 1051 CI->getOperandBundlesAsDefs(OpBundles); 1052 1053 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 1054 1055 setFlags(V); 1056 1057 if (!V->getType()->isVoidTy()) 1058 State.set(this, V); 1059 State.addMetadata(V, CI); 1060 } 1061 1062 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, 1063 VPCostContext &Ctx) const { 1064 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1065 1066 // Some backends analyze intrinsic arguments to determine cost. Use the 1067 // underlying value for the operand if it has one. Otherwise try to use the 1068 // operand of the underlying call instruction, if there is one. Otherwise 1069 // clear Arguments. 1070 // TODO: Rework TTI interface to be independent of concrete IR values. 1071 SmallVector<const Value *> Arguments; 1072 for (const auto &[Idx, Op] : enumerate(operands())) { 1073 auto *V = Op->getUnderlyingValue(); 1074 if (!V) { 1075 // Push all the VP Intrinsic's ops into the Argments even if is nullptr. 1076 // Some VP Intrinsic's cost will assert the number of parameters. 1077 // Mainly appears in the following two scenarios: 1078 // 1. EVL Op is nullptr 1079 // 2. The Argmunt of the VP Intrinsic is also the VP Intrinsic 1080 if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) { 1081 Arguments.push_back(V); 1082 continue; 1083 } 1084 if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { 1085 Arguments.push_back(UI->getArgOperand(Idx)); 1086 continue; 1087 } 1088 Arguments.clear(); 1089 break; 1090 } 1091 Arguments.push_back(V); 1092 } 1093 1094 Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1095 SmallVector<Type *> ParamTys; 1096 for (unsigned I = 0; I != getNumOperands(); ++I) 1097 ParamTys.push_back( 1098 toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); 1099 1100 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. 1101 FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); 1102 IntrinsicCostAttributes CostAttrs( 1103 VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, 1104 dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue())); 1105 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind); 1106 } 1107 1108 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { 1109 return Intrinsic::getBaseName(VectorIntrinsicID); 1110 } 1111 1112 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { 1113 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1114 // Vector predication intrinsics only demand the the first lane the last 1115 // operand (the EVL operand). 1116 return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) && 1117 Op == getOperand(getNumOperands() - 1); 1118 } 1119 1120 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1121 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, 1122 VPSlotTracker &SlotTracker) const { 1123 O << Indent << "WIDEN-INTRINSIC "; 1124 if (ResultTy->isVoidTy()) { 1125 O << "void "; 1126 } else { 1127 printAsOperand(O, SlotTracker); 1128 O << " = "; 1129 } 1130 1131 O << "call"; 1132 printFlags(O); 1133 O << getIntrinsicName() << "("; 1134 1135 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { 1136 Op->printAsOperand(O, SlotTracker); 1137 }); 1138 O << ")"; 1139 } 1140 #endif 1141 1142 void VPHistogramRecipe::execute(VPTransformState &State) { 1143 State.setDebugLocFrom(getDebugLoc()); 1144 IRBuilderBase &Builder = State.Builder; 1145 1146 Value *Address = State.get(getOperand(0)); 1147 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); 1148 VectorType *VTy = cast<VectorType>(Address->getType()); 1149 1150 // The histogram intrinsic requires a mask even if the recipe doesn't; 1151 // if the mask operand was omitted then all lanes should be executed and 1152 // we just need to synthesize an all-true mask. 1153 Value *Mask = nullptr; 1154 if (VPValue *VPMask = getMask()) 1155 Mask = State.get(VPMask); 1156 else 1157 Mask = 1158 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); 1159 1160 // If this is a subtract, we want to invert the increment amount. We may 1161 // add a separate intrinsic in future, but for now we'll try this. 1162 if (Opcode == Instruction::Sub) 1163 IncAmt = Builder.CreateNeg(IncAmt); 1164 else 1165 assert(Opcode == Instruction::Add && "only add or sub supported for now"); 1166 1167 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, 1168 {VTy, IncAmt->getType()}, 1169 {Address, IncAmt, Mask}); 1170 } 1171 1172 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, 1173 VPCostContext &Ctx) const { 1174 // FIXME: Take the gather and scatter into account as well. For now we're 1175 // generating the same cost as the fallback path, but we'll likely 1176 // need to create a new TTI method for determining the cost, including 1177 // whether we can use base + vec-of-smaller-indices or just 1178 // vec-of-pointers. 1179 assert(VF.isVector() && "Invalid VF for histogram cost"); 1180 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); 1181 VPValue *IncAmt = getOperand(1); 1182 Type *IncTy = Ctx.Types.inferScalarType(IncAmt); 1183 VectorType *VTy = VectorType::get(IncTy, VF); 1184 1185 // Assume that a non-constant update value (or a constant != 1) requires 1186 // a multiply, and add that into the cost. 1187 InstructionCost MulCost = 1188 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy); 1189 if (IncAmt->isLiveIn()) { 1190 ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue()); 1191 1192 if (CI && CI->getZExtValue() == 1) 1193 MulCost = TTI::TCC_Free; 1194 } 1195 1196 // Find the cost of the histogram operation itself. 1197 Type *PtrTy = VectorType::get(AddressTy, VF); 1198 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); 1199 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 1200 Type::getVoidTy(Ctx.LLVMCtx), 1201 {PtrTy, IncTy, MaskTy}); 1202 1203 // Add the costs together with the add/sub operation. 1204 return Ctx.TTI.getIntrinsicInstrCost( 1205 ICA, TargetTransformInfo::TCK_RecipThroughput) + 1206 MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy); 1207 } 1208 1209 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1210 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, 1211 VPSlotTracker &SlotTracker) const { 1212 O << Indent << "WIDEN-HISTOGRAM buckets: "; 1213 getOperand(0)->printAsOperand(O, SlotTracker); 1214 1215 if (Opcode == Instruction::Sub) 1216 O << ", dec: "; 1217 else { 1218 assert(Opcode == Instruction::Add); 1219 O << ", inc: "; 1220 } 1221 getOperand(1)->printAsOperand(O, SlotTracker); 1222 1223 if (VPValue *Mask = getMask()) { 1224 O << ", mask: "; 1225 Mask->printAsOperand(O, SlotTracker); 1226 } 1227 } 1228 1229 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 1230 VPSlotTracker &SlotTracker) const { 1231 O << Indent << "WIDEN-SELECT "; 1232 printAsOperand(O, SlotTracker); 1233 O << " = select "; 1234 getOperand(0)->printAsOperand(O, SlotTracker); 1235 O << ", "; 1236 getOperand(1)->printAsOperand(O, SlotTracker); 1237 O << ", "; 1238 getOperand(2)->printAsOperand(O, SlotTracker); 1239 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 1240 } 1241 #endif 1242 1243 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1244 State.setDebugLocFrom(getDebugLoc()); 1245 1246 // The condition can be loop invariant but still defined inside the 1247 // loop. This means that we can't just use the original 'cond' value. 1248 // We have to take the 'vectorized' value and pick the first lane. 1249 // Instcombine will make this a no-op. 1250 auto *InvarCond = 1251 isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; 1252 1253 Value *Cond = InvarCond ? InvarCond : State.get(getCond()); 1254 Value *Op0 = State.get(getOperand(1)); 1255 Value *Op1 = State.get(getOperand(2)); 1256 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1257 State.set(this, Sel); 1258 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1259 } 1260 1261 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, 1262 VPCostContext &Ctx) const { 1263 SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); 1264 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); 1265 Type *ScalarTy = Ctx.Types.inferScalarType(this); 1266 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1267 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1268 1269 VPValue *Op0, *Op1; 1270 using namespace llvm::VPlanPatternMatch; 1271 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && 1272 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || 1273 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { 1274 // select x, y, false --> x & y 1275 // select x, true, y --> x | y 1276 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); 1277 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); 1278 1279 SmallVector<const Value *, 2> Operands; 1280 if (all_of(operands(), 1281 [](VPValue *Op) { return Op->getUnderlyingValue(); })) 1282 Operands.append(SI->op_begin(), SI->op_end()); 1283 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); 1284 return Ctx.TTI.getArithmeticInstrCost( 1285 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind, 1286 {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); 1287 } 1288 1289 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); 1290 if (!ScalarCond) 1291 CondTy = VectorType::get(CondTy, VF); 1292 1293 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 1294 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 1295 Pred = Cmp->getPredicate(); 1296 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred, 1297 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 1298 {TTI::OK_AnyValue, TTI::OP_None}, SI); 1299 } 1300 1301 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 1302 const FastMathFlags &FMF) { 1303 AllowReassoc = FMF.allowReassoc(); 1304 NoNaNs = FMF.noNaNs(); 1305 NoInfs = FMF.noInfs(); 1306 NoSignedZeros = FMF.noSignedZeros(); 1307 AllowReciprocal = FMF.allowReciprocal(); 1308 AllowContract = FMF.allowContract(); 1309 ApproxFunc = FMF.approxFunc(); 1310 } 1311 1312 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1313 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 1314 switch (OpType) { 1315 case OperationType::Cmp: 1316 O << " " << CmpInst::getPredicateName(getPredicate()); 1317 break; 1318 case OperationType::DisjointOp: 1319 if (DisjointFlags.IsDisjoint) 1320 O << " disjoint"; 1321 break; 1322 case OperationType::PossiblyExactOp: 1323 if (ExactFlags.IsExact) 1324 O << " exact"; 1325 break; 1326 case OperationType::OverflowingBinOp: 1327 if (WrapFlags.HasNUW) 1328 O << " nuw"; 1329 if (WrapFlags.HasNSW) 1330 O << " nsw"; 1331 break; 1332 case OperationType::FPMathOp: 1333 getFastMathFlags().print(O); 1334 break; 1335 case OperationType::GEPOp: 1336 if (GEPFlags.isInBounds()) 1337 O << " inbounds"; 1338 else if (GEPFlags.hasNoUnsignedSignedWrap()) 1339 O << " nusw"; 1340 if (GEPFlags.hasNoUnsignedWrap()) 1341 O << " nuw"; 1342 break; 1343 case OperationType::NonNegOp: 1344 if (NonNegFlags.NonNeg) 1345 O << " nneg"; 1346 break; 1347 case OperationType::Other: 1348 break; 1349 } 1350 if (getNumOperands() > 0) 1351 O << " "; 1352 } 1353 #endif 1354 1355 void VPWidenRecipe::execute(VPTransformState &State) { 1356 State.setDebugLocFrom(getDebugLoc()); 1357 auto &Builder = State.Builder; 1358 switch (Opcode) { 1359 case Instruction::Call: 1360 case Instruction::Br: 1361 case Instruction::PHI: 1362 case Instruction::GetElementPtr: 1363 case Instruction::Select: 1364 llvm_unreachable("This instruction is handled by a different recipe."); 1365 case Instruction::UDiv: 1366 case Instruction::SDiv: 1367 case Instruction::SRem: 1368 case Instruction::URem: 1369 case Instruction::Add: 1370 case Instruction::FAdd: 1371 case Instruction::Sub: 1372 case Instruction::FSub: 1373 case Instruction::FNeg: 1374 case Instruction::Mul: 1375 case Instruction::FMul: 1376 case Instruction::FDiv: 1377 case Instruction::FRem: 1378 case Instruction::Shl: 1379 case Instruction::LShr: 1380 case Instruction::AShr: 1381 case Instruction::And: 1382 case Instruction::Or: 1383 case Instruction::Xor: { 1384 // Just widen unops and binops. 1385 SmallVector<Value *, 2> Ops; 1386 for (VPValue *VPOp : operands()) 1387 Ops.push_back(State.get(VPOp)); 1388 1389 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1390 1391 if (auto *VecOp = dyn_cast<Instruction>(V)) 1392 setFlags(VecOp); 1393 1394 // Use this vector value for all users of the original instruction. 1395 State.set(this, V); 1396 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1397 break; 1398 } 1399 case Instruction::Freeze: { 1400 Value *Op = State.get(getOperand(0)); 1401 1402 Value *Freeze = Builder.CreateFreeze(Op); 1403 State.set(this, Freeze); 1404 break; 1405 } 1406 case Instruction::ICmp: 1407 case Instruction::FCmp: { 1408 // Widen compares. Generate vector compares. 1409 bool FCmp = Opcode == Instruction::FCmp; 1410 Value *A = State.get(getOperand(0)); 1411 Value *B = State.get(getOperand(1)); 1412 Value *C = nullptr; 1413 if (FCmp) { 1414 // Propagate fast math flags. 1415 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1416 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) 1417 Builder.setFastMathFlags(I->getFastMathFlags()); 1418 C = Builder.CreateFCmp(getPredicate(), A, B); 1419 } else { 1420 C = Builder.CreateICmp(getPredicate(), A, B); 1421 } 1422 State.set(this, C); 1423 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1424 break; 1425 } 1426 default: 1427 // This instruction is not vectorized by simple widening. 1428 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1429 << Instruction::getOpcodeName(Opcode)); 1430 llvm_unreachable("Unhandled instruction!"); 1431 } // end of switch. 1432 1433 #if !defined(NDEBUG) 1434 // Verify that VPlan type inference results agree with the type of the 1435 // generated values. 1436 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == 1437 State.get(this)->getType() && 1438 "inferred type and type from generated instructions do not match"); 1439 #endif 1440 } 1441 1442 InstructionCost VPWidenRecipe::computeCost(ElementCount VF, 1443 VPCostContext &Ctx) const { 1444 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1445 switch (Opcode) { 1446 case Instruction::FNeg: { 1447 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1448 return Ctx.TTI.getArithmeticInstrCost( 1449 Opcode, VectorTy, CostKind, 1450 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1451 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); 1452 } 1453 1454 case Instruction::UDiv: 1455 case Instruction::SDiv: 1456 case Instruction::SRem: 1457 case Instruction::URem: 1458 // More complex computation, let the legacy cost-model handle this for now. 1459 return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); 1460 case Instruction::Add: 1461 case Instruction::FAdd: 1462 case Instruction::Sub: 1463 case Instruction::FSub: 1464 case Instruction::Mul: 1465 case Instruction::FMul: 1466 case Instruction::FDiv: 1467 case Instruction::FRem: 1468 case Instruction::Shl: 1469 case Instruction::LShr: 1470 case Instruction::AShr: 1471 case Instruction::And: 1472 case Instruction::Or: 1473 case Instruction::Xor: { 1474 VPValue *RHS = getOperand(1); 1475 // Certain instructions can be cheaper to vectorize if they have a constant 1476 // second vector operand. One example of this are shifts on x86. 1477 TargetTransformInfo::OperandValueInfo RHSInfo = { 1478 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; 1479 if (RHS->isLiveIn()) 1480 RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue()); 1481 1482 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && 1483 getOperand(1)->isDefinedOutsideLoopRegions()) 1484 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; 1485 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1486 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1487 1488 SmallVector<const Value *, 4> Operands; 1489 if (CtxI) 1490 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); 1491 return Ctx.TTI.getArithmeticInstrCost( 1492 Opcode, VectorTy, CostKind, 1493 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1494 RHSInfo, Operands, CtxI, &Ctx.TLI); 1495 } 1496 case Instruction::Freeze: { 1497 // This opcode is unknown. Assume that it is the same as 'mul'. 1498 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1499 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 1500 } 1501 case Instruction::ICmp: 1502 case Instruction::FCmp: { 1503 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1504 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 1505 return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), 1506 CostKind, 1507 {TTI::OK_AnyValue, TTI::OP_None}, 1508 {TTI::OK_AnyValue, TTI::OP_None}, CtxI); 1509 } 1510 default: 1511 llvm_unreachable("Unsupported opcode for instruction"); 1512 } 1513 } 1514 1515 void VPWidenEVLRecipe::execute(VPTransformState &State) { 1516 unsigned Opcode = getOpcode(); 1517 // TODO: Support other opcodes 1518 if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) 1519 llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); 1520 1521 State.setDebugLocFrom(getDebugLoc()); 1522 1523 assert(State.get(getOperand(0))->getType()->isVectorTy() && 1524 "VPWidenEVLRecipe should not be used for scalars"); 1525 1526 VPValue *EVL = getEVL(); 1527 Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true); 1528 IRBuilderBase &BuilderIR = State.Builder; 1529 VectorBuilder Builder(BuilderIR); 1530 Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); 1531 1532 SmallVector<Value *, 4> Ops; 1533 for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { 1534 VPValue *VPOp = getOperand(I); 1535 Ops.push_back(State.get(VPOp)); 1536 } 1537 1538 Builder.setMask(Mask).setEVL(EVLArg); 1539 Value *VPInst = 1540 Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); 1541 // Currently vp-intrinsics only accept FMF flags. 1542 // TODO: Enable other flags when support is added. 1543 if (isa<FPMathOperator>(VPInst)) 1544 setFlags(cast<Instruction>(VPInst)); 1545 1546 State.set(this, VPInst); 1547 State.addMetadata(VPInst, 1548 dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1549 } 1550 1551 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1552 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 1553 VPSlotTracker &SlotTracker) const { 1554 O << Indent << "WIDEN "; 1555 printAsOperand(O, SlotTracker); 1556 O << " = " << Instruction::getOpcodeName(Opcode); 1557 printFlags(O); 1558 printOperands(O, SlotTracker); 1559 } 1560 1561 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1562 VPSlotTracker &SlotTracker) const { 1563 O << Indent << "WIDEN "; 1564 printAsOperand(O, SlotTracker); 1565 O << " = vp." << Instruction::getOpcodeName(getOpcode()); 1566 printFlags(O); 1567 printOperands(O, SlotTracker); 1568 } 1569 #endif 1570 1571 void VPWidenCastRecipe::execute(VPTransformState &State) { 1572 State.setDebugLocFrom(getDebugLoc()); 1573 auto &Builder = State.Builder; 1574 /// Vectorize casts. 1575 assert(State.VF.isVector() && "Not vectorizing?"); 1576 Type *DestTy = VectorType::get(getResultType(), State.VF); 1577 VPValue *Op = getOperand(0); 1578 Value *A = State.get(Op); 1579 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 1580 State.set(this, Cast); 1581 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 1582 if (auto *CastOp = dyn_cast<Instruction>(Cast)) 1583 setFlags(CastOp); 1584 } 1585 1586 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, 1587 VPCostContext &Ctx) const { 1588 // TODO: In some cases, VPWidenCastRecipes are created but not considered in 1589 // the legacy cost model, including truncates/extends when evaluating a 1590 // reduction in a smaller type. 1591 if (!getUnderlyingValue()) 1592 return 0; 1593 // Computes the CastContextHint from a recipes that may access memory. 1594 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { 1595 if (VF.isScalar()) 1596 return TTI::CastContextHint::Normal; 1597 if (isa<VPInterleaveRecipe>(R)) 1598 return TTI::CastContextHint::Interleave; 1599 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) 1600 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked 1601 : TTI::CastContextHint::Normal; 1602 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R); 1603 if (WidenMemoryRecipe == nullptr) 1604 return TTI::CastContextHint::None; 1605 if (!WidenMemoryRecipe->isConsecutive()) 1606 return TTI::CastContextHint::GatherScatter; 1607 if (WidenMemoryRecipe->isReverse()) 1608 return TTI::CastContextHint::Reversed; 1609 if (WidenMemoryRecipe->isMasked()) 1610 return TTI::CastContextHint::Masked; 1611 return TTI::CastContextHint::Normal; 1612 }; 1613 1614 VPValue *Operand = getOperand(0); 1615 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1616 // For Trunc/FPTrunc, get the context from the only user. 1617 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && 1618 !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { 1619 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin())) 1620 CCH = ComputeCCH(StoreRecipe); 1621 } 1622 // For Z/Sext, get the context from the operand. 1623 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 1624 Opcode == Instruction::FPExt) { 1625 if (Operand->isLiveIn()) 1626 CCH = TTI::CastContextHint::Normal; 1627 else if (Operand->getDefiningRecipe()) 1628 CCH = ComputeCCH(Operand->getDefiningRecipe()); 1629 } 1630 1631 auto *SrcTy = 1632 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); 1633 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); 1634 // Arm TTI will use the underlying instruction to determine the cost. 1635 return Ctx.TTI.getCastInstrCost( 1636 Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput, 1637 dyn_cast_if_present<Instruction>(getUnderlyingValue())); 1638 } 1639 1640 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1641 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1642 VPSlotTracker &SlotTracker) const { 1643 O << Indent << "WIDEN-CAST "; 1644 printAsOperand(O, SlotTracker); 1645 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1646 printFlags(O); 1647 printOperands(O, SlotTracker); 1648 O << " to " << *getResultType(); 1649 } 1650 #endif 1651 1652 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, 1653 VPCostContext &Ctx) const { 1654 return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 1655 } 1656 1657 /// This function adds 1658 /// (0 * Step, 1 * Step, 2 * Step, ...) 1659 /// to each vector element of Val. 1660 /// \p Opcode is relevant for FP induction variable. 1661 static Value *getStepVector(Value *Val, Value *Step, 1662 Instruction::BinaryOps BinOp, ElementCount VF, 1663 IRBuilderBase &Builder) { 1664 assert(VF.isVector() && "only vector VFs are supported"); 1665 1666 // Create and check the types. 1667 auto *ValVTy = cast<VectorType>(Val->getType()); 1668 ElementCount VLen = ValVTy->getElementCount(); 1669 1670 Type *STy = Val->getType()->getScalarType(); 1671 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1672 "Induction Step must be an integer or FP"); 1673 assert(Step->getType() == STy && "Step has wrong type"); 1674 1675 SmallVector<Constant *, 8> Indices; 1676 1677 // Create a vector of consecutive numbers from zero to VF. 1678 VectorType *InitVecValVTy = ValVTy; 1679 if (STy->isFloatingPointTy()) { 1680 Type *InitVecValSTy = 1681 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 1682 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 1683 } 1684 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 1685 1686 if (STy->isIntegerTy()) { 1687 Step = Builder.CreateVectorSplat(VLen, Step); 1688 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1689 // FIXME: The newly created binary instructions should contain nsw/nuw 1690 // flags, which can be found from the original scalar operations. 1691 Step = Builder.CreateMul(InitVec, Step); 1692 return Builder.CreateAdd(Val, Step, "induction"); 1693 } 1694 1695 // Floating point induction. 1696 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1697 "Binary Opcode should be specified for FP induction"); 1698 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 1699 1700 Step = Builder.CreateVectorSplat(VLen, Step); 1701 Value *MulOp = Builder.CreateFMul(InitVec, Step); 1702 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1703 } 1704 1705 /// A helper function that returns an integer or floating-point constant with 1706 /// value C. 1707 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 1708 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 1709 : ConstantFP::get(Ty, C); 1710 } 1711 1712 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1713 assert(!State.Lane && "Int or FP induction being replicated."); 1714 1715 Value *Start = getStartValue()->getLiveInIRValue(); 1716 const InductionDescriptor &ID = getInductionDescriptor(); 1717 TruncInst *Trunc = getTruncInst(); 1718 IRBuilderBase &Builder = State.Builder; 1719 assert(getPHINode()->getType() == ID.getStartValue()->getType() && 1720 "Types must match"); 1721 assert(State.VF.isVector() && "must have vector VF"); 1722 1723 // The value from the original loop to which we are mapping the new induction 1724 // variable. 1725 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode(); 1726 1727 // Fast-math-flags propagate from the original induction instruction. 1728 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1729 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1730 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1731 1732 // Now do the actual transformations, and start with fetching the step value. 1733 Value *Step = State.get(getStepValue(), VPLane(0)); 1734 1735 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1736 "Expected either an induction phi-node or a truncate of it!"); 1737 1738 // Construct the initial value of the vector IV in the vector loop preheader 1739 auto CurrIP = Builder.saveIP(); 1740 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1741 Builder.SetInsertPoint(VectorPH->getTerminator()); 1742 if (isa<TruncInst>(EntryVal)) { 1743 assert(Start->getType()->isIntegerTy() && 1744 "Truncation requires an integer type"); 1745 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1746 Step = Builder.CreateTrunc(Step, TruncType); 1747 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1748 } 1749 1750 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1751 Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(), 1752 State.VF, State.Builder); 1753 1754 // We create vector phi nodes for both integer and floating-point induction 1755 // variables. Here, we determine the kind of arithmetic we will perform. 1756 Instruction::BinaryOps AddOp; 1757 Instruction::BinaryOps MulOp; 1758 if (Step->getType()->isIntegerTy()) { 1759 AddOp = Instruction::Add; 1760 MulOp = Instruction::Mul; 1761 } else { 1762 AddOp = ID.getInductionOpcode(); 1763 MulOp = Instruction::FMul; 1764 } 1765 1766 Value *SplatVF; 1767 if (VPValue *SplatVFOperand = getSplatVFValue()) { 1768 // The recipe has been unrolled. In that case, fetch the splat value for the 1769 // induction increment. 1770 SplatVF = State.get(SplatVFOperand); 1771 } else { 1772 // Multiply the vectorization factor by the step using integer or 1773 // floating-point arithmetic as appropriate. 1774 Type *StepType = Step->getType(); 1775 Value *RuntimeVF = State.get(getVFValue(), VPLane(0)); 1776 if (Step->getType()->isFloatingPointTy()) 1777 RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); 1778 else 1779 RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); 1780 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1781 1782 // Create a vector splat to use in the induction update. 1783 SplatVF = Builder.CreateVectorSplat(State.VF, Mul); 1784 } 1785 1786 Builder.restoreIP(CurrIP); 1787 1788 // We may need to add the step a number of times, depending on the unroll 1789 // factor. The last of those goes into the PHI. 1790 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1791 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1792 VecInd->setDebugLoc(getDebugLoc()); 1793 State.set(this, VecInd); 1794 1795 Instruction *LastInduction = cast<Instruction>( 1796 Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); 1797 if (isa<TruncInst>(EntryVal)) 1798 State.addMetadata(LastInduction, EntryVal); 1799 LastInduction->setDebugLoc(getDebugLoc()); 1800 1801 VecInd->addIncoming(SteppedStart, VectorPH); 1802 // Add induction update using an incorrect block temporarily. The phi node 1803 // will be fixed after VPlan execution. Note that at this point the latch 1804 // block cannot be used, as it does not exist yet. 1805 // TODO: Model increment value in VPlan, by turning the recipe into a 1806 // multi-def and a subclass of VPHeaderPHIRecipe. 1807 VecInd->addIncoming(LastInduction, VectorPH); 1808 } 1809 1810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1811 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1812 VPSlotTracker &SlotTracker) const { 1813 O << Indent; 1814 printAsOperand(O, SlotTracker); 1815 O << " = WIDEN-INDUCTION "; 1816 printOperands(O, SlotTracker); 1817 1818 if (auto *TI = getTruncInst()) 1819 O << " (truncated to " << *TI->getType() << ")"; 1820 } 1821 #endif 1822 1823 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1824 // The step may be defined by a recipe in the preheader (e.g. if it requires 1825 // SCEV expansion), but for the canonical induction the step is required to be 1826 // 1, which is represented as live-in. 1827 if (getStepValue()->getDefiningRecipe()) 1828 return false; 1829 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1830 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1831 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1832 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1833 getScalarType() == CanIV->getScalarType(); 1834 } 1835 1836 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1837 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1838 VPSlotTracker &SlotTracker) const { 1839 O << Indent; 1840 printAsOperand(O, SlotTracker); 1841 O << " = DERIVED-IV "; 1842 getStartValue()->printAsOperand(O, SlotTracker); 1843 O << " + "; 1844 getOperand(1)->printAsOperand(O, SlotTracker); 1845 O << " * "; 1846 getStepValue()->printAsOperand(O, SlotTracker); 1847 } 1848 #endif 1849 1850 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1851 // Fast-math-flags propagate from the original induction instruction. 1852 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1853 if (hasFastMathFlags()) 1854 State.Builder.setFastMathFlags(getFastMathFlags()); 1855 1856 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1857 /// variable on which to base the steps, \p Step is the size of the step. 1858 1859 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 1860 Value *Step = State.get(getStepValue(), VPLane(0)); 1861 IRBuilderBase &Builder = State.Builder; 1862 1863 // Ensure step has the same type as that of scalar IV. 1864 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1865 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1866 1867 // We build scalar steps for both integer and floating-point induction 1868 // variables. Here, we determine the kind of arithmetic we will perform. 1869 Instruction::BinaryOps AddOp; 1870 Instruction::BinaryOps MulOp; 1871 if (BaseIVTy->isIntegerTy()) { 1872 AddOp = Instruction::Add; 1873 MulOp = Instruction::Mul; 1874 } else { 1875 AddOp = InductionOpcode; 1876 MulOp = Instruction::FMul; 1877 } 1878 1879 // Determine the number of scalars we need to generate for each unroll 1880 // iteration. 1881 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1882 // Compute the scalar steps and save the results in State. 1883 Type *IntStepTy = 1884 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1885 Type *VecIVTy = nullptr; 1886 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1887 if (!FirstLaneOnly && State.VF.isScalable()) { 1888 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1889 UnitStepVec = 1890 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1891 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1892 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1893 } 1894 1895 unsigned StartLane = 0; 1896 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1897 if (State.Lane) { 1898 StartLane = State.Lane->getKnownLane(); 1899 EndLane = StartLane + 1; 1900 } 1901 Value *StartIdx0 = 1902 createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); 1903 1904 if (!FirstLaneOnly && State.VF.isScalable()) { 1905 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1906 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1907 if (BaseIVTy->isFloatingPointTy()) 1908 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1909 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1910 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1911 State.set(this, Add); 1912 // It's useful to record the lane values too for the known minimum number 1913 // of elements so we do those below. This improves the code quality when 1914 // trying to extract the first element, for example. 1915 } 1916 1917 if (BaseIVTy->isFloatingPointTy()) 1918 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1919 1920 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1921 Value *StartIdx = Builder.CreateBinOp( 1922 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1923 // The step returned by `createStepForVF` is a runtime-evaluated value 1924 // when VF is scalable. Otherwise, it should be folded into a Constant. 1925 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1926 "Expected StartIdx to be folded to a constant when VF is not " 1927 "scalable"); 1928 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1929 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1930 State.set(this, Add, VPLane(Lane)); 1931 } 1932 } 1933 1934 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1935 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1936 VPSlotTracker &SlotTracker) const { 1937 O << Indent; 1938 printAsOperand(O, SlotTracker); 1939 O << " = SCALAR-STEPS "; 1940 printOperands(O, SlotTracker); 1941 } 1942 #endif 1943 1944 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1945 assert(State.VF.isVector() && "not widening"); 1946 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1947 // Construct a vector GEP by widening the operands of the scalar GEP as 1948 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1949 // results in a vector of pointers when at least one operand of the GEP 1950 // is vector-typed. Thus, to keep the representation compact, we only use 1951 // vector-typed operands for loop-varying values. 1952 1953 if (areAllOperandsInvariant()) { 1954 // If we are vectorizing, but the GEP has only loop-invariant operands, 1955 // the GEP we build (by only using vector-typed operands for 1956 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1957 // produce a vector of pointers, we need to either arbitrarily pick an 1958 // operand to broadcast, or broadcast a clone of the original GEP. 1959 // Here, we broadcast a clone of the original. 1960 // 1961 // TODO: If at some point we decide to scalarize instructions having 1962 // loop-invariant operands, this special case will no longer be 1963 // required. We would add the scalarization decision to 1964 // collectLoopScalars() and teach getVectorValue() to broadcast 1965 // the lane-zero scalar value. 1966 SmallVector<Value *> Ops; 1967 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1968 Ops.push_back(State.get(getOperand(I), VPLane(0))); 1969 1970 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1971 ArrayRef(Ops).drop_front(), "", 1972 getGEPNoWrapFlags()); 1973 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1974 State.set(this, Splat); 1975 State.addMetadata(Splat, GEP); 1976 } else { 1977 // If the GEP has at least one loop-varying operand, we are sure to 1978 // produce a vector of pointers unless VF is scalar. 1979 // The pointer operand of the new GEP. If it's loop-invariant, we 1980 // won't broadcast it. 1981 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 1982 : State.get(getOperand(0)); 1983 1984 // Collect all the indices for the new GEP. If any index is 1985 // loop-invariant, we won't broadcast it. 1986 SmallVector<Value *, 4> Indices; 1987 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1988 VPValue *Operand = getOperand(I); 1989 if (isIndexLoopInvariant(I - 1)) 1990 Indices.push_back(State.get(Operand, VPLane(0))); 1991 else 1992 Indices.push_back(State.get(Operand)); 1993 } 1994 1995 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1996 // but it should be a vector, otherwise. 1997 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1998 Indices, "", getGEPNoWrapFlags()); 1999 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 2000 "NewGEP is not a pointer vector"); 2001 State.set(this, NewGEP); 2002 State.addMetadata(NewGEP, GEP); 2003 } 2004 } 2005 2006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2007 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 2008 VPSlotTracker &SlotTracker) const { 2009 O << Indent << "WIDEN-GEP "; 2010 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 2011 for (size_t I = 0; I < getNumOperands() - 1; ++I) 2012 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 2013 2014 O << " "; 2015 printAsOperand(O, SlotTracker); 2016 O << " = getelementptr"; 2017 printFlags(O); 2018 printOperands(O, SlotTracker); 2019 } 2020 #endif 2021 2022 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, 2023 unsigned CurrentPart, IRBuilderBase &Builder) { 2024 // Use i32 for the gep index type when the value is constant, 2025 // or query DataLayout for a more suitable index type otherwise. 2026 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 2027 return IsScalable && (IsReverse || CurrentPart > 0) 2028 ? DL.getIndexType(Builder.getPtrTy(0)) 2029 : Builder.getInt32Ty(); 2030 } 2031 2032 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) { 2033 auto &Builder = State.Builder; 2034 State.setDebugLocFrom(getDebugLoc()); 2035 unsigned CurrentPart = getUnrollPart(*this); 2036 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, 2037 CurrentPart, Builder); 2038 2039 // The wide store needs to start at the last vector element. 2040 Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); 2041 if (IndexTy != RunTimeVF->getType()) 2042 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); 2043 // NumElt = -CurrentPart * RunTimeVF 2044 Value *NumElt = Builder.CreateMul( 2045 ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); 2046 // LastLane = 1 - RunTimeVF 2047 Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 2048 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2049 Value *ResultPtr = 2050 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); 2051 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", 2052 getGEPNoWrapFlags()); 2053 2054 State.set(this, ResultPtr, /*IsScalar*/ true); 2055 } 2056 2057 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2058 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2059 VPSlotTracker &SlotTracker) const { 2060 O << Indent; 2061 printAsOperand(O, SlotTracker); 2062 O << " = reverse-vector-pointer"; 2063 printFlags(O); 2064 O << " "; 2065 printOperands(O, SlotTracker); 2066 } 2067 #endif 2068 2069 void VPVectorPointerRecipe::execute(VPTransformState &State) { 2070 auto &Builder = State.Builder; 2071 State.setDebugLocFrom(getDebugLoc()); 2072 unsigned CurrentPart = getUnrollPart(*this); 2073 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, 2074 CurrentPart, Builder); 2075 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2076 2077 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 2078 Value *ResultPtr = 2079 Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); 2080 2081 State.set(this, ResultPtr, /*IsScalar*/ true); 2082 } 2083 2084 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2085 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2086 VPSlotTracker &SlotTracker) const { 2087 O << Indent; 2088 printAsOperand(O, SlotTracker); 2089 O << " = vector-pointer "; 2090 2091 printOperands(O, SlotTracker); 2092 } 2093 #endif 2094 2095 void VPBlendRecipe::execute(VPTransformState &State) { 2096 assert(isNormalized() && "Expected blend to be normalized!"); 2097 State.setDebugLocFrom(getDebugLoc()); 2098 // We know that all PHIs in non-header blocks are converted into 2099 // selects, so we don't have to worry about the insertion order and we 2100 // can just use the builder. 2101 // At this point we generate the predication tree. There may be 2102 // duplications since this is a simple recursive scan, but future 2103 // optimizations will clean it up. 2104 2105 unsigned NumIncoming = getNumIncomingValues(); 2106 2107 // Generate a sequence of selects of the form: 2108 // SELECT(Mask3, In3, 2109 // SELECT(Mask2, In2, 2110 // SELECT(Mask1, In1, 2111 // In0))) 2112 // Note that Mask0 is never used: lanes for which no path reaches this phi and 2113 // are essentially undef are taken from In0. 2114 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 2115 Value *Result = nullptr; 2116 for (unsigned In = 0; In < NumIncoming; ++In) { 2117 // We might have single edge PHIs (blocks) - use an identity 2118 // 'select' for the first PHI operand. 2119 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 2120 if (In == 0) 2121 Result = In0; // Initialize with the first incoming value. 2122 else { 2123 // Select between the current value and the previous incoming edge 2124 // based on the incoming mask. 2125 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 2126 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 2127 } 2128 } 2129 State.set(this, Result, OnlyFirstLaneUsed); 2130 } 2131 2132 InstructionCost VPBlendRecipe::computeCost(ElementCount VF, 2133 VPCostContext &Ctx) const { 2134 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2135 2136 // Handle cases where only the first lane is used the same way as the legacy 2137 // cost model. 2138 if (vputils::onlyFirstLaneUsed(this)) 2139 return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); 2140 2141 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 2142 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); 2143 return (getNumIncomingValues() - 1) * 2144 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, 2145 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2146 } 2147 2148 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2149 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 2150 VPSlotTracker &SlotTracker) const { 2151 O << Indent << "BLEND "; 2152 printAsOperand(O, SlotTracker); 2153 O << " ="; 2154 if (getNumIncomingValues() == 1) { 2155 // Not a User of any mask: not really blending, this is a 2156 // single-predecessor phi. 2157 O << " "; 2158 getIncomingValue(0)->printAsOperand(O, SlotTracker); 2159 } else { 2160 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 2161 O << " "; 2162 getIncomingValue(I)->printAsOperand(O, SlotTracker); 2163 if (I == 0) 2164 continue; 2165 O << "/"; 2166 getMask(I)->printAsOperand(O, SlotTracker); 2167 } 2168 } 2169 } 2170 #endif 2171 2172 void VPReductionRecipe::execute(VPTransformState &State) { 2173 assert(!State.Lane && "Reduction being replicated."); 2174 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2175 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2176 // Propagate the fast-math flags carried by the underlying instruction. 2177 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 2178 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2179 Value *NewVecOp = State.get(getVecOp()); 2180 if (VPValue *Cond = getCondOp()) { 2181 Value *NewCond = State.get(Cond, State.VF.isScalar()); 2182 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 2183 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 2184 2185 Value *Start; 2186 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 2187 Start = RdxDesc.getRecurrenceStartValue(); 2188 else 2189 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 2190 RdxDesc.getFastMathFlags()); 2191 if (State.VF.isVector()) 2192 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 2193 2194 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 2195 NewVecOp = Select; 2196 } 2197 Value *NewRed; 2198 Value *NextInChain; 2199 if (IsOrdered) { 2200 if (State.VF.isVector()) 2201 NewRed = 2202 createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); 2203 else 2204 NewRed = State.Builder.CreateBinOp( 2205 (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp); 2206 PrevInChain = NewRed; 2207 NextInChain = NewRed; 2208 } else { 2209 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2210 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 2211 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2212 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 2213 NewRed, PrevInChain); 2214 else 2215 NextInChain = State.Builder.CreateBinOp( 2216 (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain); 2217 } 2218 State.set(this, NextInChain, /*IsScalar*/ true); 2219 } 2220 2221 void VPReductionEVLRecipe::execute(VPTransformState &State) { 2222 assert(!State.Lane && "Reduction being replicated."); 2223 2224 auto &Builder = State.Builder; 2225 // Propagate the fast-math flags carried by the underlying instruction. 2226 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 2227 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2228 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2229 2230 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2231 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 2232 Value *VecOp = State.get(getVecOp()); 2233 Value *EVL = State.get(getEVL(), VPLane(0)); 2234 2235 VectorBuilder VBuilder(Builder); 2236 VBuilder.setEVL(EVL); 2237 Value *Mask; 2238 // TODO: move the all-true mask generation into VectorBuilder. 2239 if (VPValue *CondOp = getCondOp()) 2240 Mask = State.get(CondOp); 2241 else 2242 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2243 VBuilder.setMask(Mask); 2244 2245 Value *NewRed; 2246 if (isOrdered()) { 2247 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 2248 } else { 2249 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 2250 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2251 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2252 else 2253 NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(), 2254 NewRed, Prev); 2255 } 2256 State.set(this, NewRed, /*IsScalar*/ true); 2257 } 2258 2259 InstructionCost VPReductionRecipe::computeCost(ElementCount VF, 2260 VPCostContext &Ctx) const { 2261 RecurKind RdxKind = RdxDesc.getRecurrenceKind(); 2262 Type *ElementTy = Ctx.Types.inferScalarType(this); 2263 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); 2264 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2265 unsigned Opcode = RdxDesc.getOpcode(); 2266 2267 // TODO: Support any-of and in-loop reductions. 2268 assert( 2269 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || 2270 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2271 "Any-of reduction not implemented in VPlan-based cost model currently."); 2272 assert( 2273 (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() || 2274 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2275 "In-loop reduction not implemented in VPlan-based cost model currently."); 2276 2277 assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() && 2278 "Inferred type and recurrence type mismatch."); 2279 2280 // Cost = Reduction cost + BinOp cost 2281 InstructionCost Cost = 2282 Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind); 2283 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { 2284 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 2285 return Cost + Ctx.TTI.getMinMaxReductionCost( 2286 Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind); 2287 } 2288 2289 return Cost + Ctx.TTI.getArithmeticReductionCost( 2290 Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind); 2291 } 2292 2293 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2294 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2295 VPSlotTracker &SlotTracker) const { 2296 O << Indent << "REDUCE "; 2297 printAsOperand(O, SlotTracker); 2298 O << " = "; 2299 getChainOp()->printAsOperand(O, SlotTracker); 2300 O << " +"; 2301 if (isa<FPMathOperator>(getUnderlyingInstr())) 2302 O << getUnderlyingInstr()->getFastMathFlags(); 2303 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2304 getVecOp()->printAsOperand(O, SlotTracker); 2305 if (isConditional()) { 2306 O << ", "; 2307 getCondOp()->printAsOperand(O, SlotTracker); 2308 } 2309 O << ")"; 2310 if (RdxDesc.IntermediateStore) 2311 O << " (with final reduction value stored in invariant address sank " 2312 "outside of loop)"; 2313 } 2314 2315 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2316 VPSlotTracker &SlotTracker) const { 2317 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2318 O << Indent << "REDUCE "; 2319 printAsOperand(O, SlotTracker); 2320 O << " = "; 2321 getChainOp()->printAsOperand(O, SlotTracker); 2322 O << " +"; 2323 if (isa<FPMathOperator>(getUnderlyingInstr())) 2324 O << getUnderlyingInstr()->getFastMathFlags(); 2325 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2326 getVecOp()->printAsOperand(O, SlotTracker); 2327 O << ", "; 2328 getEVL()->printAsOperand(O, SlotTracker); 2329 if (isConditional()) { 2330 O << ", "; 2331 getCondOp()->printAsOperand(O, SlotTracker); 2332 } 2333 O << ")"; 2334 if (RdxDesc.IntermediateStore) 2335 O << " (with final reduction value stored in invariant address sank " 2336 "outside of loop)"; 2337 } 2338 #endif 2339 2340 bool VPReplicateRecipe::shouldPack() const { 2341 // Find if the recipe is used by a widened recipe via an intervening 2342 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 2343 return any_of(users(), [](const VPUser *U) { 2344 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2345 return any_of(PredR->users(), [PredR](const VPUser *U) { 2346 return !U->usesScalars(PredR); 2347 }); 2348 return false; 2349 }); 2350 } 2351 2352 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, 2353 VPCostContext &Ctx) const { 2354 Instruction *UI = cast<Instruction>(getUnderlyingValue()); 2355 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan 2356 // transform, avoid computing their cost multiple times for now. 2357 Ctx.SkipCostComputation.insert(UI); 2358 return Ctx.getLegacyCost(UI, VF); 2359 } 2360 2361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2362 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2363 VPSlotTracker &SlotTracker) const { 2364 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2365 2366 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2367 printAsOperand(O, SlotTracker); 2368 O << " = "; 2369 } 2370 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2371 O << "call"; 2372 printFlags(O); 2373 O << "@" << CB->getCalledFunction()->getName() << "("; 2374 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2375 O, [&O, &SlotTracker](VPValue *Op) { 2376 Op->printAsOperand(O, SlotTracker); 2377 }); 2378 O << ")"; 2379 } else { 2380 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2381 printFlags(O); 2382 printOperands(O, SlotTracker); 2383 } 2384 2385 if (shouldPack()) 2386 O << " (S->V)"; 2387 } 2388 #endif 2389 2390 Value *VPScalarCastRecipe ::generate(VPTransformState &State) { 2391 assert(vputils::onlyFirstLaneUsed(this) && 2392 "Codegen only implemented for first lane."); 2393 switch (Opcode) { 2394 case Instruction::SExt: 2395 case Instruction::ZExt: 2396 case Instruction::Trunc: { 2397 // Note: SExt/ZExt not used yet. 2398 Value *Op = State.get(getOperand(0), VPLane(0)); 2399 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2400 } 2401 default: 2402 llvm_unreachable("opcode not implemented yet"); 2403 } 2404 } 2405 2406 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2407 State.set(this, generate(State), VPLane(0)); 2408 } 2409 2410 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2411 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2412 VPSlotTracker &SlotTracker) const { 2413 O << Indent << "SCALAR-CAST "; 2414 printAsOperand(O, SlotTracker); 2415 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2416 printOperands(O, SlotTracker); 2417 O << " to " << *ResultTy; 2418 } 2419 #endif 2420 2421 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2422 assert(State.Lane && "Branch on Mask works only on single instance."); 2423 2424 unsigned Lane = State.Lane->getKnownLane(); 2425 2426 Value *ConditionBit = nullptr; 2427 VPValue *BlockInMask = getMask(); 2428 if (BlockInMask) { 2429 ConditionBit = State.get(BlockInMask); 2430 if (ConditionBit->getType()->isVectorTy()) 2431 ConditionBit = State.Builder.CreateExtractElement( 2432 ConditionBit, State.Builder.getInt32(Lane)); 2433 } else // Block in mask is all-one. 2434 ConditionBit = State.Builder.getTrue(); 2435 2436 // Replace the temporary unreachable terminator with a new conditional branch, 2437 // whose two destinations will be set later when they are created. 2438 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2439 assert(isa<UnreachableInst>(CurrentTerminator) && 2440 "Expected to replace unreachable terminator with conditional branch."); 2441 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2442 CondBr->setSuccessor(0, nullptr); 2443 ReplaceInstWithInst(CurrentTerminator, CondBr); 2444 } 2445 2446 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, 2447 VPCostContext &Ctx) const { 2448 // The legacy cost model doesn't assign costs to branches for individual 2449 // replicate regions. Match the current behavior in the VPlan cost model for 2450 // now. 2451 return 0; 2452 } 2453 2454 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2455 State.setDebugLocFrom(getDebugLoc()); 2456 assert(State.Lane && "Predicated instruction PHI works per instance."); 2457 Instruction *ScalarPredInst = 2458 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 2459 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2460 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2461 assert(PredicatingBB && "Predicated block has no single predecessor."); 2462 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2463 "operand must be VPReplicateRecipe"); 2464 2465 // By current pack/unpack logic we need to generate only a single phi node: if 2466 // a vector value for the predicated instruction exists at this point it means 2467 // the instruction has vector users only, and a phi for the vector value is 2468 // needed. In this case the recipe of the predicated instruction is marked to 2469 // also do that packing, thereby "hoisting" the insert-element sequence. 2470 // Otherwise, a phi node for the scalar value is needed. 2471 if (State.hasVectorValue(getOperand(0))) { 2472 Value *VectorValue = State.get(getOperand(0)); 2473 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2474 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2475 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2476 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2477 if (State.hasVectorValue(this)) 2478 State.reset(this, VPhi); 2479 else 2480 State.set(this, VPhi); 2481 // NOTE: Currently we need to update the value of the operand, so the next 2482 // predicated iteration inserts its generated value in the correct vector. 2483 State.reset(getOperand(0), VPhi); 2484 } else { 2485 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) 2486 return; 2487 2488 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2489 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2490 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2491 PredicatingBB); 2492 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2493 if (State.hasScalarValue(this, *State.Lane)) 2494 State.reset(this, Phi, *State.Lane); 2495 else 2496 State.set(this, Phi, *State.Lane); 2497 // NOTE: Currently we need to update the value of the operand, so the next 2498 // predicated iteration inserts its generated value in the correct vector. 2499 State.reset(getOperand(0), Phi, *State.Lane); 2500 } 2501 } 2502 2503 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2504 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2505 VPSlotTracker &SlotTracker) const { 2506 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2507 printAsOperand(O, SlotTracker); 2508 O << " = "; 2509 printOperands(O, SlotTracker); 2510 } 2511 #endif 2512 2513 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2514 VPCostContext &Ctx) const { 2515 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2516 const Align Alignment = 2517 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2518 unsigned AS = 2519 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2520 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2521 2522 if (!Consecutive) { 2523 // TODO: Using the original IR may not be accurate. 2524 // Currently, ARM will use the underlying IR to calculate gather/scatter 2525 // instruction cost. 2526 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2527 assert(!Reverse && 2528 "Inconsecutive memory access should not have the order."); 2529 return Ctx.TTI.getAddressComputationCost(Ty) + 2530 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2531 IsMasked, Alignment, CostKind, 2532 &Ingredient); 2533 } 2534 2535 InstructionCost Cost = 0; 2536 if (IsMasked) { 2537 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2538 AS, CostKind); 2539 } else { 2540 TTI::OperandValueInfo OpInfo = 2541 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2542 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2543 CostKind, OpInfo, &Ingredient); 2544 } 2545 if (!Reverse) 2546 return Cost; 2547 2548 return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2549 cast<VectorType>(Ty), {}, CostKind, 0); 2550 } 2551 2552 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2553 auto *LI = cast<LoadInst>(&Ingredient); 2554 2555 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2556 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2557 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2558 bool CreateGather = !isConsecutive(); 2559 2560 auto &Builder = State.Builder; 2561 State.setDebugLocFrom(getDebugLoc()); 2562 Value *Mask = nullptr; 2563 if (auto *VPMask = getMask()) { 2564 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2565 // of a null all-one mask is a null mask. 2566 Mask = State.get(VPMask); 2567 if (isReverse()) 2568 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2569 } 2570 2571 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); 2572 Value *NewLI; 2573 if (CreateGather) { 2574 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2575 "wide.masked.gather"); 2576 } else if (Mask) { 2577 NewLI = 2578 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2579 PoisonValue::get(DataTy), "wide.masked.load"); 2580 } else { 2581 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2582 } 2583 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2584 State.addMetadata(NewLI, LI); 2585 if (Reverse) 2586 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2587 State.set(this, NewLI); 2588 } 2589 2590 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2591 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2592 VPSlotTracker &SlotTracker) const { 2593 O << Indent << "WIDEN "; 2594 printAsOperand(O, SlotTracker); 2595 O << " = load "; 2596 printOperands(O, SlotTracker); 2597 } 2598 #endif 2599 2600 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2601 /// dependence w/o affecting the result. 2602 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2603 Value *EVL, const Twine &Name) { 2604 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2605 Value *AllTrueMask = 2606 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2607 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2608 {Operand, AllTrueMask, EVL}, nullptr, Name); 2609 } 2610 2611 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2612 auto *LI = cast<LoadInst>(&Ingredient); 2613 2614 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2615 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2616 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2617 bool CreateGather = !isConsecutive(); 2618 2619 auto &Builder = State.Builder; 2620 State.setDebugLocFrom(getDebugLoc()); 2621 CallInst *NewLI; 2622 Value *EVL = State.get(getEVL(), VPLane(0)); 2623 Value *Addr = State.get(getAddr(), !CreateGather); 2624 Value *Mask = nullptr; 2625 if (VPValue *VPMask = getMask()) { 2626 Mask = State.get(VPMask); 2627 if (isReverse()) 2628 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2629 } else { 2630 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2631 } 2632 2633 if (CreateGather) { 2634 NewLI = 2635 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2636 nullptr, "wide.masked.gather"); 2637 } else { 2638 VectorBuilder VBuilder(Builder); 2639 VBuilder.setEVL(EVL).setMask(Mask); 2640 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2641 Instruction::Load, DataTy, Addr, "vp.op.load")); 2642 } 2643 NewLI->addParamAttr( 2644 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2645 State.addMetadata(NewLI, LI); 2646 Instruction *Res = NewLI; 2647 if (isReverse()) 2648 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2649 State.set(this, Res); 2650 } 2651 2652 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, 2653 VPCostContext &Ctx) const { 2654 if (!Consecutive || IsMasked) 2655 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2656 2657 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2658 // here because the EVL recipes using EVL to replace the tail mask. But in the 2659 // legacy model, it will always calculate the cost of mask. 2660 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2661 // don't need to compare to the legacy cost model. 2662 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2663 const Align Alignment = 2664 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2665 unsigned AS = 2666 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2667 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2668 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2669 Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); 2670 if (!Reverse) 2671 return Cost; 2672 2673 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2674 cast<VectorType>(Ty), {}, CostKind, 0); 2675 } 2676 2677 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2678 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2679 VPSlotTracker &SlotTracker) const { 2680 O << Indent << "WIDEN "; 2681 printAsOperand(O, SlotTracker); 2682 O << " = vp.load "; 2683 printOperands(O, SlotTracker); 2684 } 2685 #endif 2686 2687 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2688 auto *SI = cast<StoreInst>(&Ingredient); 2689 2690 VPValue *StoredVPValue = getStoredValue(); 2691 bool CreateScatter = !isConsecutive(); 2692 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2693 2694 auto &Builder = State.Builder; 2695 State.setDebugLocFrom(getDebugLoc()); 2696 2697 Value *Mask = nullptr; 2698 if (auto *VPMask = getMask()) { 2699 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2700 // of a null all-one mask is a null mask. 2701 Mask = State.get(VPMask); 2702 if (isReverse()) 2703 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2704 } 2705 2706 Value *StoredVal = State.get(StoredVPValue); 2707 if (isReverse()) { 2708 // If we store to reverse consecutive memory locations, then we need 2709 // to reverse the order of elements in the stored value. 2710 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2711 // We don't want to update the value in the map as it might be used in 2712 // another expression. So don't call resetVectorValue(StoredVal). 2713 } 2714 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); 2715 Instruction *NewSI = nullptr; 2716 if (CreateScatter) 2717 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2718 else if (Mask) 2719 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2720 else 2721 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2722 State.addMetadata(NewSI, SI); 2723 } 2724 2725 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2726 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2727 VPSlotTracker &SlotTracker) const { 2728 O << Indent << "WIDEN store "; 2729 printOperands(O, SlotTracker); 2730 } 2731 #endif 2732 2733 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2734 auto *SI = cast<StoreInst>(&Ingredient); 2735 2736 VPValue *StoredValue = getStoredValue(); 2737 bool CreateScatter = !isConsecutive(); 2738 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2739 2740 auto &Builder = State.Builder; 2741 State.setDebugLocFrom(getDebugLoc()); 2742 2743 CallInst *NewSI = nullptr; 2744 Value *StoredVal = State.get(StoredValue); 2745 Value *EVL = State.get(getEVL(), VPLane(0)); 2746 if (isReverse()) 2747 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2748 Value *Mask = nullptr; 2749 if (VPValue *VPMask = getMask()) { 2750 Mask = State.get(VPMask); 2751 if (isReverse()) 2752 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2753 } else { 2754 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2755 } 2756 Value *Addr = State.get(getAddr(), !CreateScatter); 2757 if (CreateScatter) { 2758 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2759 Intrinsic::vp_scatter, 2760 {StoredVal, Addr, Mask, EVL}); 2761 } else { 2762 VectorBuilder VBuilder(Builder); 2763 VBuilder.setEVL(EVL).setMask(Mask); 2764 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2765 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2766 {StoredVal, Addr})); 2767 } 2768 NewSI->addParamAttr( 2769 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2770 State.addMetadata(NewSI, SI); 2771 } 2772 2773 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, 2774 VPCostContext &Ctx) const { 2775 if (!Consecutive || IsMasked) 2776 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2777 2778 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2779 // here because the EVL recipes using EVL to replace the tail mask. But in the 2780 // legacy model, it will always calculate the cost of mask. 2781 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2782 // don't need to compare to the legacy cost model. 2783 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2784 const Align Alignment = 2785 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2786 unsigned AS = 2787 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2788 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2789 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2790 Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); 2791 if (!Reverse) 2792 return Cost; 2793 2794 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2795 cast<VectorType>(Ty), {}, CostKind, 0); 2796 } 2797 2798 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2799 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2800 VPSlotTracker &SlotTracker) const { 2801 O << Indent << "WIDEN vp.store "; 2802 printOperands(O, SlotTracker); 2803 } 2804 #endif 2805 2806 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2807 VectorType *DstVTy, const DataLayout &DL) { 2808 // Verify that V is a vector type with same number of elements as DstVTy. 2809 auto VF = DstVTy->getElementCount(); 2810 auto *SrcVecTy = cast<VectorType>(V->getType()); 2811 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2812 Type *SrcElemTy = SrcVecTy->getElementType(); 2813 Type *DstElemTy = DstVTy->getElementType(); 2814 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2815 "Vector elements must have same size"); 2816 2817 // Do a direct cast if element types are castable. 2818 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2819 return Builder.CreateBitOrPointerCast(V, DstVTy); 2820 } 2821 // V cannot be directly casted to desired vector type. 2822 // May happen when V is a floating point vector but DstVTy is a vector of 2823 // pointers or vice-versa. Handle this using a two-step bitcast using an 2824 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2825 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2826 "Only one type should be a pointer type"); 2827 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2828 "Only one type should be a floating point type"); 2829 Type *IntTy = 2830 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2831 auto *VecIntTy = VectorType::get(IntTy, VF); 2832 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2833 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2834 } 2835 2836 /// Return a vector containing interleaved elements from multiple 2837 /// smaller input vectors. 2838 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2839 const Twine &Name) { 2840 unsigned Factor = Vals.size(); 2841 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2842 2843 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2844 #ifndef NDEBUG 2845 for (Value *Val : Vals) 2846 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2847 #endif 2848 2849 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2850 // must use intrinsics to interleave. 2851 if (VecTy->isScalableTy()) { 2852 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2853 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2854 Vals, 2855 /*FMFSource=*/nullptr, Name); 2856 } 2857 2858 // Fixed length. Start by concatenating all vectors into a wide vector. 2859 Value *WideVec = concatenateVectors(Builder, Vals); 2860 2861 // Interleave the elements into the wide vector. 2862 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2863 return Builder.CreateShuffleVector( 2864 WideVec, createInterleaveMask(NumElts, Factor), Name); 2865 } 2866 2867 // Try to vectorize the interleave group that \p Instr belongs to. 2868 // 2869 // E.g. Translate following interleaved load group (factor = 3): 2870 // for (i = 0; i < N; i+=3) { 2871 // R = Pic[i]; // Member of index 0 2872 // G = Pic[i+1]; // Member of index 1 2873 // B = Pic[i+2]; // Member of index 2 2874 // ... // do something to R, G, B 2875 // } 2876 // To: 2877 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2878 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2879 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2880 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2881 // 2882 // Or translate following interleaved store group (factor = 3): 2883 // for (i = 0; i < N; i+=3) { 2884 // ... do something to R, G, B 2885 // Pic[i] = R; // Member of index 0 2886 // Pic[i+1] = G; // Member of index 1 2887 // Pic[i+2] = B; // Member of index 2 2888 // } 2889 // To: 2890 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2891 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2892 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2893 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2894 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2895 void VPInterleaveRecipe::execute(VPTransformState &State) { 2896 assert(!State.Lane && "Interleave group being replicated."); 2897 const InterleaveGroup<Instruction> *Group = IG; 2898 Instruction *Instr = Group->getInsertPos(); 2899 2900 // Prepare for the vector type of the interleaved load/store. 2901 Type *ScalarTy = getLoadStoreType(Instr); 2902 unsigned InterleaveFactor = Group->getFactor(); 2903 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2904 2905 // TODO: extend the masked interleaved-group support to reversed access. 2906 VPValue *BlockInMask = getMask(); 2907 assert((!BlockInMask || !Group->isReverse()) && 2908 "Reversed masked interleave-group not supported."); 2909 2910 VPValue *Addr = getAddr(); 2911 Value *ResAddr = State.get(Addr, VPLane(0)); 2912 if (auto *I = dyn_cast<Instruction>(ResAddr)) 2913 State.setDebugLocFrom(I->getDebugLoc()); 2914 2915 // If the group is reverse, adjust the index to refer to the last vector lane 2916 // instead of the first. We adjust the index from the first vector lane, 2917 // rather than directly getting the pointer for lane VF - 1, because the 2918 // pointer operand of the interleaved access is supposed to be uniform. 2919 if (Group->isReverse()) { 2920 Value *RuntimeVF = 2921 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2922 Value *Index = 2923 State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2924 Index = State.Builder.CreateMul(Index, 2925 State.Builder.getInt32(Group->getFactor())); 2926 Index = State.Builder.CreateNeg(Index); 2927 2928 bool InBounds = false; 2929 if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) 2930 InBounds = Gep->isInBounds(); 2931 ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); 2932 } 2933 2934 State.setDebugLocFrom(Instr->getDebugLoc()); 2935 Value *PoisonVec = PoisonValue::get(VecTy); 2936 2937 auto CreateGroupMask = [&BlockInMask, &State, 2938 &InterleaveFactor](Value *MaskForGaps) -> Value * { 2939 if (State.VF.isScalable()) { 2940 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2941 assert(InterleaveFactor == 2 && 2942 "Unsupported deinterleave factor for scalable vectors"); 2943 auto *ResBlockInMask = State.get(BlockInMask); 2944 SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; 2945 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2946 State.VF.getKnownMinValue() * 2, true); 2947 return State.Builder.CreateIntrinsic( 2948 MaskTy, Intrinsic::vector_interleave2, Ops, 2949 /*FMFSource=*/nullptr, "interleaved.mask"); 2950 } 2951 2952 if (!BlockInMask) 2953 return MaskForGaps; 2954 2955 Value *ResBlockInMask = State.get(BlockInMask); 2956 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2957 ResBlockInMask, 2958 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2959 "interleaved.mask"); 2960 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2961 ShuffledMask, MaskForGaps) 2962 : ShuffledMask; 2963 }; 2964 2965 const DataLayout &DL = Instr->getDataLayout(); 2966 // Vectorize the interleaved load group. 2967 if (isa<LoadInst>(Instr)) { 2968 Value *MaskForGaps = nullptr; 2969 if (NeedsMaskForGaps) { 2970 MaskForGaps = createBitMaskForGaps(State.Builder, 2971 State.VF.getKnownMinValue(), *Group); 2972 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2973 } 2974 2975 Instruction *NewLoad; 2976 if (BlockInMask || MaskForGaps) { 2977 Value *GroupMask = CreateGroupMask(MaskForGaps); 2978 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 2979 Group->getAlign(), GroupMask, 2980 PoisonVec, "wide.masked.vec"); 2981 } else 2982 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 2983 Group->getAlign(), "wide.vec"); 2984 Group->addMetadata(NewLoad); 2985 2986 ArrayRef<VPValue *> VPDefs = definedValues(); 2987 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2988 if (VecTy->isScalableTy()) { 2989 assert(InterleaveFactor == 2 && 2990 "Unsupported deinterleave factor for scalable vectors"); 2991 2992 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2993 // so must use intrinsics to deinterleave. 2994 Value *DI = State.Builder.CreateIntrinsic( 2995 Intrinsic::vector_deinterleave2, VecTy, NewLoad, 2996 /*FMFSource=*/nullptr, "strided.vec"); 2997 unsigned J = 0; 2998 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2999 Instruction *Member = Group->getMember(I); 3000 3001 if (!Member) 3002 continue; 3003 3004 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 3005 // If this member has different type, cast the result type. 3006 if (Member->getType() != ScalarTy) { 3007 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3008 StridedVec = 3009 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3010 } 3011 3012 if (Group->isReverse()) 3013 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3014 3015 State.set(VPDefs[J], StridedVec); 3016 ++J; 3017 } 3018 3019 return; 3020 } 3021 3022 // For each member in the group, shuffle out the appropriate data from the 3023 // wide loads. 3024 unsigned J = 0; 3025 for (unsigned I = 0; I < InterleaveFactor; ++I) { 3026 Instruction *Member = Group->getMember(I); 3027 3028 // Skip the gaps in the group. 3029 if (!Member) 3030 continue; 3031 3032 auto StrideMask = 3033 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 3034 Value *StridedVec = 3035 State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); 3036 3037 // If this member has different type, cast the result type. 3038 if (Member->getType() != ScalarTy) { 3039 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 3040 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3041 StridedVec = 3042 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3043 } 3044 3045 if (Group->isReverse()) 3046 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3047 3048 State.set(VPDefs[J], StridedVec); 3049 ++J; 3050 } 3051 return; 3052 } 3053 3054 // The sub vector type for current instruction. 3055 auto *SubVT = VectorType::get(ScalarTy, State.VF); 3056 3057 // Vectorize the interleaved store group. 3058 Value *MaskForGaps = 3059 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 3060 assert((!MaskForGaps || !State.VF.isScalable()) && 3061 "masking gaps for scalable vectors is not yet supported."); 3062 ArrayRef<VPValue *> StoredValues = getStoredValues(); 3063 // Collect the stored vector from each member. 3064 SmallVector<Value *, 4> StoredVecs; 3065 unsigned StoredIdx = 0; 3066 for (unsigned i = 0; i < InterleaveFactor; i++) { 3067 assert((Group->getMember(i) || MaskForGaps) && 3068 "Fail to get a member from an interleaved store group"); 3069 Instruction *Member = Group->getMember(i); 3070 3071 // Skip the gaps in the group. 3072 if (!Member) { 3073 Value *Undef = PoisonValue::get(SubVT); 3074 StoredVecs.push_back(Undef); 3075 continue; 3076 } 3077 3078 Value *StoredVec = State.get(StoredValues[StoredIdx]); 3079 ++StoredIdx; 3080 3081 if (Group->isReverse()) 3082 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 3083 3084 // If this member has different type, cast it to a unified type. 3085 3086 if (StoredVec->getType() != SubVT) 3087 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 3088 3089 StoredVecs.push_back(StoredVec); 3090 } 3091 3092 // Interleave all the smaller vectors into one wider vector. 3093 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 3094 Instruction *NewStoreInstr; 3095 if (BlockInMask || MaskForGaps) { 3096 Value *GroupMask = CreateGroupMask(MaskForGaps); 3097 NewStoreInstr = State.Builder.CreateMaskedStore( 3098 IVec, ResAddr, Group->getAlign(), GroupMask); 3099 } else 3100 NewStoreInstr = 3101 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign()); 3102 3103 Group->addMetadata(NewStoreInstr); 3104 } 3105 3106 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3107 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 3108 VPSlotTracker &SlotTracker) const { 3109 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 3110 IG->getInsertPos()->printAsOperand(O, false); 3111 O << ", "; 3112 getAddr()->printAsOperand(O, SlotTracker); 3113 VPValue *Mask = getMask(); 3114 if (Mask) { 3115 O << ", "; 3116 Mask->printAsOperand(O, SlotTracker); 3117 } 3118 3119 unsigned OpIdx = 0; 3120 for (unsigned i = 0; i < IG->getFactor(); ++i) { 3121 if (!IG->getMember(i)) 3122 continue; 3123 if (getNumStoreOperands() > 0) { 3124 O << "\n" << Indent << " store "; 3125 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 3126 O << " to index " << i; 3127 } else { 3128 O << "\n" << Indent << " "; 3129 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 3130 O << " = load from index " << i; 3131 } 3132 ++OpIdx; 3133 } 3134 } 3135 #endif 3136 3137 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, 3138 VPCostContext &Ctx) const { 3139 Instruction *InsertPos = getInsertPos(); 3140 // Find the VPValue index of the interleave group. We need to skip gaps. 3141 unsigned InsertPosIdx = 0; 3142 for (unsigned Idx = 0; IG->getFactor(); ++Idx) 3143 if (auto *Member = IG->getMember(Idx)) { 3144 if (Member == InsertPos) 3145 break; 3146 InsertPosIdx++; 3147 } 3148 Type *ValTy = Ctx.Types.inferScalarType( 3149 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx) 3150 : getStoredValues()[InsertPosIdx]); 3151 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 3152 unsigned AS = getLoadStoreAddressSpace(InsertPos); 3153 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3154 3155 unsigned InterleaveFactor = IG->getFactor(); 3156 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 3157 3158 // Holds the indices of existing members in the interleaved group. 3159 SmallVector<unsigned, 4> Indices; 3160 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 3161 if (IG->getMember(IF)) 3162 Indices.push_back(IF); 3163 3164 // Calculate the cost of the whole interleaved group. 3165 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( 3166 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, 3167 IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps); 3168 3169 if (!IG->isReverse()) 3170 return Cost; 3171 3172 return Cost + IG->getNumMembers() * 3173 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 3174 VectorTy, std::nullopt, CostKind, 0); 3175 } 3176 3177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3178 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3179 VPSlotTracker &SlotTracker) const { 3180 O << Indent << "EMIT "; 3181 printAsOperand(O, SlotTracker); 3182 O << " = CANONICAL-INDUCTION "; 3183 printOperands(O, SlotTracker); 3184 } 3185 #endif 3186 3187 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 3188 return IsScalarAfterVectorization && 3189 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 3190 } 3191 3192 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 3193 assert(getInductionDescriptor().getKind() == 3194 InductionDescriptor::IK_PtrInduction && 3195 "Not a pointer induction according to InductionDescriptor!"); 3196 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 3197 "Unexpected type."); 3198 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 3199 "Recipe should have been replaced"); 3200 3201 unsigned CurrentPart = getUnrollPart(*this); 3202 3203 // Build a pointer phi 3204 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 3205 Type *ScStValueType = ScalarStartValue->getType(); 3206 3207 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3208 PHINode *NewPointerPhi = nullptr; 3209 if (CurrentPart == 0) { 3210 auto *IVR = cast<VPHeaderPHIRecipe>(&getParent() 3211 ->getPlan() 3212 ->getVectorLoopRegion() 3213 ->getEntryBasicBlock() 3214 ->front()); 3215 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true)); 3216 NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 3217 CanonicalIV->getIterator()); 3218 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 3219 NewPointerPhi->setDebugLoc(getDebugLoc()); 3220 } else { 3221 // The recipe has been unrolled. In that case, fetch the single pointer phi 3222 // shared among all unrolled parts of the recipe. 3223 auto *GEP = 3224 cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); 3225 NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); 3226 } 3227 3228 // A pointer induction, performed by using a gep 3229 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 3230 Value *ScalarStepValue = State.get(getStepValue(), VPLane(0)); 3231 Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue()); 3232 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 3233 // Add induction update using an incorrect block temporarily. The phi node 3234 // will be fixed after VPlan execution. Note that at this point the latch 3235 // block cannot be used, as it does not exist yet. 3236 // TODO: Model increment value in VPlan, by turning the recipe into a 3237 // multi-def and a subclass of VPHeaderPHIRecipe. 3238 if (CurrentPart == 0) { 3239 // The recipe represents the first part of the pointer induction. Create the 3240 // GEP to increment the phi across all unrolled parts. 3241 unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1; 3242 Value *NumUnrolledElems = 3243 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF)); 3244 3245 Value *InductionGEP = GetElementPtrInst::Create( 3246 State.Builder.getInt8Ty(), NewPointerPhi, 3247 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 3248 InductionLoc); 3249 3250 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 3251 } 3252 3253 // Create actual address geps that use the pointer phi as base and a 3254 // vectorized version of the step value (<step*0, ..., step*N>) as offset. 3255 Type *VecPhiType = VectorType::get(PhiType, State.VF); 3256 Value *StartOffsetScalar = State.Builder.CreateMul( 3257 RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); 3258 Value *StartOffset = 3259 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 3260 // Create a vector of consecutive numbers from zero to VF. 3261 StartOffset = State.Builder.CreateAdd( 3262 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 3263 3264 assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && 3265 "scalar step must be the same across all parts"); 3266 Value *GEP = State.Builder.CreateGEP( 3267 State.Builder.getInt8Ty(), NewPointerPhi, 3268 State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( 3269 State.VF, ScalarStepValue)), 3270 "vector.gep"); 3271 State.set(this, GEP); 3272 } 3273 3274 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3275 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 3276 VPSlotTracker &SlotTracker) const { 3277 assert((getNumOperands() == 2 || getNumOperands() == 4) && 3278 "unexpected number of operands"); 3279 O << Indent << "EMIT "; 3280 printAsOperand(O, SlotTracker); 3281 O << " = WIDEN-POINTER-INDUCTION "; 3282 getStartValue()->printAsOperand(O, SlotTracker); 3283 O << ", "; 3284 getStepValue()->printAsOperand(O, SlotTracker); 3285 if (getNumOperands() == 4) { 3286 O << ", "; 3287 getOperand(2)->printAsOperand(O, SlotTracker); 3288 O << ", "; 3289 getOperand(3)->printAsOperand(O, SlotTracker); 3290 } 3291 } 3292 #endif 3293 3294 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 3295 assert(!State.Lane && "cannot be used in per-lane"); 3296 if (State.ExpandedSCEVs.contains(Expr)) { 3297 // SCEV Expr has already been expanded, result must already be set. At the 3298 // moment we have to execute the entry block twice (once before skeleton 3299 // creation to get expanded SCEVs used by the skeleton and once during 3300 // regular VPlan execution). 3301 State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]); 3302 assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] && 3303 "Results must match"); 3304 return; 3305 } 3306 3307 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 3308 SCEVExpander Exp(SE, DL, "induction"); 3309 3310 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 3311 &*State.Builder.GetInsertPoint()); 3312 State.ExpandedSCEVs[Expr] = Res; 3313 State.set(this, Res, VPLane(0)); 3314 } 3315 3316 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3317 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 3318 VPSlotTracker &SlotTracker) const { 3319 O << Indent << "EMIT "; 3320 printAsOperand(O, SlotTracker); 3321 O << " = EXPAND SCEV " << *Expr; 3322 } 3323 #endif 3324 3325 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 3326 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); 3327 Type *STy = CanonicalIV->getType(); 3328 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 3329 ElementCount VF = State.VF; 3330 Value *VStart = VF.isScalar() 3331 ? CanonicalIV 3332 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 3333 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); 3334 if (VF.isVector()) { 3335 VStep = Builder.CreateVectorSplat(VF, VStep); 3336 VStep = 3337 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 3338 } 3339 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 3340 State.set(this, CanonicalVectorIV); 3341 } 3342 3343 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3344 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 3345 VPSlotTracker &SlotTracker) const { 3346 O << Indent << "EMIT "; 3347 printAsOperand(O, SlotTracker); 3348 O << " = WIDEN-CANONICAL-INDUCTION "; 3349 printOperands(O, SlotTracker); 3350 } 3351 #endif 3352 3353 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 3354 auto &Builder = State.Builder; 3355 // Create a vector from the initial value. 3356 auto *VectorInit = getStartValue()->getLiveInIRValue(); 3357 3358 Type *VecTy = State.VF.isScalar() 3359 ? VectorInit->getType() 3360 : VectorType::get(VectorInit->getType(), State.VF); 3361 3362 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3363 if (State.VF.isVector()) { 3364 auto *IdxTy = Builder.getInt32Ty(); 3365 auto *One = ConstantInt::get(IdxTy, 1); 3366 IRBuilder<>::InsertPointGuard Guard(Builder); 3367 Builder.SetInsertPoint(VectorPH->getTerminator()); 3368 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 3369 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3370 VectorInit = Builder.CreateInsertElement( 3371 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 3372 } 3373 3374 // Create a phi node for the new recurrence. 3375 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur"); 3376 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 3377 Phi->addIncoming(VectorInit, VectorPH); 3378 State.set(this, Phi); 3379 } 3380 3381 InstructionCost 3382 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, 3383 VPCostContext &Ctx) const { 3384 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3385 if (VF.isScalar()) 3386 return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); 3387 3388 if (VF.isScalable() && VF.getKnownMinValue() == 1) 3389 return InstructionCost::getInvalid(); 3390 3391 SmallVector<int> Mask(VF.getKnownMinValue()); 3392 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 3393 Type *VectorTy = 3394 toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); 3395 3396 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 3397 cast<VectorType>(VectorTy), Mask, CostKind, 3398 VF.getKnownMinValue() - 1); 3399 } 3400 3401 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3402 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 3403 VPSlotTracker &SlotTracker) const { 3404 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 3405 printAsOperand(O, SlotTracker); 3406 O << " = phi "; 3407 printOperands(O, SlotTracker); 3408 } 3409 #endif 3410 3411 void VPReductionPHIRecipe::execute(VPTransformState &State) { 3412 auto &Builder = State.Builder; 3413 3414 // If this phi is fed by a scaled reduction then it should output a 3415 // vector with fewer elements than the VF. 3416 ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); 3417 3418 // Reductions do not have to start at zero. They can start with 3419 // any loop invariant values. 3420 VPValue *StartVPV = getStartValue(); 3421 Value *StartV = StartVPV->getLiveInIRValue(); 3422 3423 // In order to support recurrences we need to be able to vectorize Phi nodes. 3424 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3425 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3426 // this value when we vectorize all of the instructions that use the PHI. 3427 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 3428 Type *VecTy = 3429 ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); 3430 3431 BasicBlock *HeaderBB = State.CFG.PrevBB; 3432 assert(State.CurrentVectorLoop->getHeader() == HeaderBB && 3433 "recipe must be in the vector loop header"); 3434 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); 3435 Phi->insertBefore(HeaderBB->getFirstInsertionPt()); 3436 State.set(this, Phi, IsInLoop); 3437 3438 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3439 3440 Value *Iden = nullptr; 3441 RecurKind RK = RdxDesc.getRecurrenceKind(); 3442 unsigned CurrentPart = getUnrollPart(*this); 3443 3444 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 3445 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 3446 // MinMax and AnyOf reductions have the start value as their identity. 3447 if (ScalarPHI) { 3448 Iden = StartV; 3449 } else { 3450 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 3451 Builder.SetInsertPoint(VectorPH->getTerminator()); 3452 StartV = Iden = State.get(StartVPV); 3453 } 3454 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { 3455 // [I|F]FindLastIV will use a sentinel value to initialize the reduction 3456 // phi or the resume value from the main vector loop when vectorizing the 3457 // epilogue loop. In the exit block, ComputeReductionResult will generate 3458 // checks to verify if the reduction result is the sentinel value. If the 3459 // result is the sentinel value, it will be corrected back to the start 3460 // value. 3461 // TODO: The sentinel value is not always necessary. When the start value is 3462 // a constant, and smaller than the start value of the induction variable, 3463 // the start value can be directly used to initialize the reduction phi. 3464 Iden = StartV; 3465 if (!ScalarPHI) { 3466 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 3467 Builder.SetInsertPoint(VectorPH->getTerminator()); 3468 StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden); 3469 } 3470 } else { 3471 Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), 3472 RdxDesc.getFastMathFlags()); 3473 3474 if (!ScalarPHI) { 3475 if (CurrentPart == 0) { 3476 // Create start and identity vector values for the reduction in the 3477 // preheader. 3478 // TODO: Introduce recipes in VPlan preheader to create initial values. 3479 Iden = Builder.CreateVectorSplat(VF, Iden); 3480 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 3481 Builder.SetInsertPoint(VectorPH->getTerminator()); 3482 Constant *Zero = Builder.getInt32(0); 3483 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 3484 } else { 3485 Iden = Builder.CreateVectorSplat(VF, Iden); 3486 } 3487 } 3488 } 3489 3490 Phi = cast<PHINode>(State.get(this, IsInLoop)); 3491 Value *StartVal = (CurrentPart == 0) ? StartV : Iden; 3492 Phi->addIncoming(StartVal, VectorPH); 3493 } 3494 3495 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3496 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3497 VPSlotTracker &SlotTracker) const { 3498 O << Indent << "WIDEN-REDUCTION-PHI "; 3499 3500 printAsOperand(O, SlotTracker); 3501 O << " = phi "; 3502 printOperands(O, SlotTracker); 3503 if (VFScaleFactor != 1) 3504 O << " (VF scaled by 1/" << VFScaleFactor << ")"; 3505 } 3506 #endif 3507 3508 void VPWidenPHIRecipe::execute(VPTransformState &State) { 3509 assert(EnableVPlanNativePath && 3510 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 3511 3512 Value *Op0 = State.get(getOperand(0)); 3513 Type *VecTy = Op0->getType(); 3514 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); 3515 State.set(this, VecPhi); 3516 } 3517 3518 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3519 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3520 VPSlotTracker &SlotTracker) const { 3521 O << Indent << "WIDEN-PHI "; 3522 3523 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); 3524 // Unless all incoming values are modeled in VPlan print the original PHI 3525 // directly. 3526 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming 3527 // values as VPValues. 3528 if (getNumOperands() != OriginalPhi->getNumOperands()) { 3529 O << VPlanIngredient(OriginalPhi); 3530 return; 3531 } 3532 3533 printAsOperand(O, SlotTracker); 3534 O << " = phi "; 3535 printOperands(O, SlotTracker); 3536 } 3537 #endif 3538 3539 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and 3540 // remove VPActiveLaneMaskPHIRecipe. 3541 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { 3542 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3543 Value *StartMask = State.get(getOperand(0)); 3544 PHINode *Phi = 3545 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); 3546 Phi->addIncoming(StartMask, VectorPH); 3547 Phi->setDebugLoc(getDebugLoc()); 3548 State.set(this, Phi); 3549 } 3550 3551 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3552 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3553 VPSlotTracker &SlotTracker) const { 3554 O << Indent << "ACTIVE-LANE-MASK-PHI "; 3555 3556 printAsOperand(O, SlotTracker); 3557 O << " = phi "; 3558 printOperands(O, SlotTracker); 3559 } 3560 #endif 3561 3562 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3563 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3564 VPSlotTracker &SlotTracker) const { 3565 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; 3566 3567 printAsOperand(O, SlotTracker); 3568 O << " = phi "; 3569 printOperands(O, SlotTracker); 3570 } 3571 #endif 3572 3573 void VPScalarPHIRecipe::execute(VPTransformState &State) { 3574 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3575 Value *Start = State.get(getStartValue(), VPLane(0)); 3576 PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name); 3577 Phi->addIncoming(Start, VectorPH); 3578 Phi->setDebugLoc(getDebugLoc()); 3579 State.set(this, Phi, /*IsScalar=*/true); 3580 } 3581 3582 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3583 void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3584 VPSlotTracker &SlotTracker) const { 3585 O << Indent << "SCALAR-PHI "; 3586 printAsOperand(O, SlotTracker); 3587 O << " = phi "; 3588 printOperands(O, SlotTracker); 3589 } 3590 #endif 3591