//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
  case VPReverseVectorPointerSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}
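
// Illustrative usage sketch (hypothetical recipes, not part of the original
// source): a freshly constructed recipe must not yet have a parent before it
// is linked into a VPBasicBlock, e.g.
//   NewR->insertAfter(ExistingR);  // NewR joins ExistingR's VPBasicBlock
//   NewR->moveBefore(*SomeVPBB, SomeVPBB->begin());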
VPBasicBlock"); 225 getParent()->getRecipeList().remove(getIterator()); 226 Parent = nullptr; 227 } 228 229 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 230 assert(getParent() && "Recipe not in any VPBasicBlock"); 231 return getParent()->getRecipeList().erase(getIterator()); 232 } 233 234 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 235 removeFromParent(); 236 insertAfter(InsertPos); 237 } 238 239 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 240 iplist<VPRecipeBase>::iterator I) { 241 removeFromParent(); 242 insertBefore(BB, I); 243 } 244 245 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 246 // Get the underlying instruction for the recipe, if there is one. It is used 247 // to 248 // * decide if cost computation should be skipped for this recipe, 249 // * apply forced target instruction cost. 250 Instruction *UI = nullptr; 251 if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) 252 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 253 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) 254 UI = IG->getInsertPos(); 255 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) 256 UI = &WidenMem->getIngredient(); 257 258 InstructionCost RecipeCost; 259 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { 260 RecipeCost = 0; 261 } else { 262 RecipeCost = computeCost(VF, Ctx); 263 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && 264 RecipeCost.isValid()) 265 RecipeCost = InstructionCost(ForceTargetInstructionCost); 266 } 267 268 LLVM_DEBUG({ 269 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 270 dump(); 271 }); 272 return RecipeCost; 273 } 274 275 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 276 VPCostContext &Ctx) const { 277 llvm_unreachable("subclasses should implement computeCost"); 278 } 279 280 InstructionCost 281 VPPartialReductionRecipe::computeCost(ElementCount VF, 282 VPCostContext &Ctx) const { 283 std::optional<unsigned> Opcode = std::nullopt; 284 VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); 285 if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR)) 286 Opcode = std::make_optional(WidenR->getOpcode()); 287 288 VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); 289 VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); 290 291 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); 292 auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) 293 : BinOpR->getOperand(0)); 294 auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) 295 : BinOpR->getOperand(1)); 296 297 auto GetExtendKind = [](VPRecipeBase *R) { 298 // The extend could come from outside the plan. 
    if (!R)
      return TargetTransformInfo::PR_None;
    auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
    if (!WidenCastR)
      return TargetTransformInfo::PR_None;
    if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
      return TargetTransformInfo::PR_ZeroExtend;
    if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
      return TargetTransformInfo::PR_SignExtend;
    return TargetTransformInfo::PR_None;
  };

  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
                                         PhiType, VF, GetExtendKind(ExtAR),
                                         GetExtendKind(ExtBR), Opcode);
}

void VPPartialReductionRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;

  assert(getOpcode() == Instruction::Add &&
         "Unhandled partial reduction opcode");

  Value *BinOpVal = State.get(getOperand(0));
  Value *PhiVal = State.get(getOperand(1));
  assert(PhiVal && BinOpVal && "Phi and Mul must be set");

  Type *RetTy = PhiVal->getType();

  CallInst *V = Builder.CreateIntrinsic(
      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
      {PhiVal, BinOpVal}, nullptr, "partial.reduce");

  State.set(this, V);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "PARTIAL-REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
  printOperands(O, SlotTracker);
}
#endif

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPDef::dump(); }
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPLane &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = State.Builder.CreateIntrinsic(
        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, State.Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPLane(0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
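    // Illustrative sketch of the emitted IR (names assumed, not taken from
    // the original source):
    //   %cond = icmp eq i64 %index.next, %trip.count
    //   br i1 %cond, label <unset for now>, label %vector.body
    // The forward (exit) successor is only filled in once the middle/exit
    // block has been created.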
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    Type *PhiTy = OrigPhi->getType();
    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RdxDesc.getOpcode();
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
          ReducedPartRdx =
              createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
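    // Illustrative sketch (assumed types and names, not from the original
    // source): for an integer add reduction unrolled with UF=2, the loop
    // above combines the parts and createReduction() below emits e.g.
    //   %bin.rdx = add <4 x i32> %rdx.part.1, %rdx.part.0
    //   %res     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)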
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
         RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::ResumePhi: {
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      if (OtherPred == VPlanPred)
        continue;
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }
  case VPInstruction::AnyOf: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateOrReduce(A);
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::AnyOf:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractFromEnd:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
    return false;
  default:
    return true;
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
         "Only PHINodes can have extra operands");
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPLane(Lane));
    auto *Phi = cast<PHINode>(&I);
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's
  // scope, hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(
        enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
          Op.value()->printAsOperand(O, SlotTracker);
          O << " from ";
          getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
        });
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  State.setDebugLocFrom(getDebugLoc());

  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(arg_operands())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
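    // Illustrative example (hypothetical variant signature, not from the
    // original source): a vector mapping such as
    //   <4 x float> @vec_foo(<4 x float> %x, ptr %p)
    // keeps the pointer parameter scalar, so only lane 0 of the corresponding
    // VPValue is passed below.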
    if (!VFTy->getParamType(I.index())->isVectorTy())
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
    Args.push_back(Arg);
  }

  assert(Variant != nullptr && "Can't create vector function.");

  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
  setFlags(V);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                  Variant->getFunctionType()->params(),
                                  Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << " @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  O << " (using library function";
  if (Variant->hasName())
    O << ": " << Variant->getName();
  O << ")";
}
#endif

void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  State.setDebugLocFrom(getDebugLoc());

  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
    TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
                                           State.TTI))
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
                                               State.TTI))
      TysForDecl.push_back(Arg->getType());
    Args.push_back(Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

  setFlags(V);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      // Push all the VP Intrinsic's ops into the Arguments even if it is
      // nullptr. Some VP Intrinsic's cost will assert the number of
      // parameters. Mainly appears in the following two scenarios:
      //   1. EVL Op is nullptr
      //   2. The Argument of the VP Intrinsic is also the VP Intrinsic
      if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
        Arguments.push_back(V);
        continue;
      }
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}

StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(VectorIntrinsicID);
}

bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  // Vector predication intrinsics only demand the first lane of the last
  // operand (the EVL operand).
  return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
         Op == getOperand(getNumOperands() - 1);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
#endif

void VPHistogramRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(getOperand(0));
  Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
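  // Illustrative sketch of the emitted call (assumed types, not from the
  // original source):
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 1, <4 x i1> splat (i1 true))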
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
                                {VTy, IncAmt->getType()},
                                {Address, IncAmt, Mask});
}

InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
  VPValue *IncAmt = getOperand(1);
  Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
  VectorType *VTy = VectorType::get(IncTy, VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
  if (IncAmt->isLiveIn()) {
    ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());

    if (CI && CI->getZExtValue() == 1)
      MulCost = TTI::TCC_Free;
  }

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(AddressTy, VF);
  Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  printFlags(O);
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
" (condition is loop invariant)" : ""); 1248 } 1249 #endif 1250 1251 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1252 State.setDebugLocFrom(getDebugLoc()); 1253 1254 // The condition can be loop invariant but still defined inside the 1255 // loop. This means that we can't just use the original 'cond' value. 1256 // We have to take the 'vectorized' value and pick the first lane. 1257 // Instcombine will make this a no-op. 1258 auto *InvarCond = 1259 isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; 1260 1261 Value *Cond = InvarCond ? InvarCond : State.get(getCond()); 1262 Value *Op0 = State.get(getOperand(1)); 1263 Value *Op1 = State.get(getOperand(2)); 1264 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1265 State.set(this, Sel); 1266 if (isa<FPMathOperator>(Sel)) 1267 setFlags(cast<Instruction>(Sel)); 1268 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1269 } 1270 1271 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, 1272 VPCostContext &Ctx) const { 1273 SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); 1274 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); 1275 Type *ScalarTy = Ctx.Types.inferScalarType(this); 1276 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1277 1278 VPValue *Op0, *Op1; 1279 using namespace llvm::VPlanPatternMatch; 1280 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && 1281 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || 1282 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { 1283 // select x, y, false --> x & y 1284 // select x, true, y --> x | y 1285 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); 1286 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); 1287 1288 SmallVector<const Value *, 2> Operands; 1289 if (all_of(operands(), 1290 [](VPValue *Op) { return Op->getUnderlyingValue(); })) 1291 Operands.append(SI->op_begin(), SI->op_end()); 1292 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); 1293 return Ctx.TTI.getArithmeticInstrCost( 1294 IsLogicalOr ? 
        Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
  }

  Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
  if (!ScalarCond)
    CondTy = VectorType::get(CondTy, VF);

  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
    Pred = Cmp->getPredicate();
  return Ctx.TTI.getCmpSelInstrCost(
      Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
      {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.isInBounds())
      O << " inbounds";
    else if (GEPFlags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (GEPFlags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(State.get(VPOp));

    Value *V = Builder.CreateNAryOp(Opcode, Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V))
      setFlags(VecOp);

    // Use this vector value for all users of the original instruction.
    State.set(this, V);
    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));

    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      C = Builder.CreateFCmpFMF(
          getPredicate(), A, B,
          dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    State.set(this, C);
    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                          Ctx.CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      Ctx.CostKind,
                                      {TTI::OK_AnyValue, TTI::OP_None},
                                      {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());

  assert(State.get(getOperand(0))->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept FMF flags.
  // TODO: Enable other flags when support is added.
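  // Illustrative sketch (assumed types/names, not from the original source):
  // for a widened fadd this creates something like
  //   %vp.op = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(
  //                <vscale x 4 x float> %a, <vscale x 4 x float> %b,
  //                <vscale x 4 x i1> splat (i1 true), i32 %evl)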
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp." << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  if (auto *CastOp = dyn_cast<Instruction>(Cast))
    setFlags(CastOp);
}

InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
  // the legacy cost model, including truncates/extends when evaluating a
  // reduction in a smaller type.
  if (!getUnderlyingValue())
    return 0;
  // Computes the CastContextHint from a recipe that may access memory.
  auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (isa<VPInterleaveRecipe>(R))
      return TTI::CastContextHint::Interleave;
    if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
      return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                             : TTI::CastContextHint::Normal;
    const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
    if (WidenMemoryRecipe == nullptr)
      return TTI::CastContextHint::None;
    if (!WidenMemoryRecipe->isConsecutive())
      return TTI::CastContextHint::GatherScatter;
    if (WidenMemoryRecipe->isReverse())
      return TTI::CastContextHint::Reversed;
    if (WidenMemoryRecipe->isMasked())
      return TTI::CastContextHint::Masked;
    return TTI::CastContextHint::Normal;
  };

  VPValue *Operand = getOperand(0);
  TTI::CastContextHint CCH = TTI::CastContextHint::None;
  // For Trunc/FPTrunc, get the context from the only user.
  if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
      !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
    if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
      CCH = ComputeCCH(StoreRecipe);
  }
  // For Z/Sext, get the context from the operand.
1631 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 1632 Opcode == Instruction::FPExt) { 1633 if (Operand->isLiveIn()) 1634 CCH = TTI::CastContextHint::Normal; 1635 else if (Operand->getDefiningRecipe()) 1636 CCH = ComputeCCH(Operand->getDefiningRecipe()); 1637 } 1638 1639 auto *SrcTy = 1640 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); 1641 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); 1642 // Arm TTI will use the underlying instruction to determine the cost. 1643 return Ctx.TTI.getCastInstrCost( 1644 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, 1645 dyn_cast_if_present<Instruction>(getUnderlyingValue())); 1646 } 1647 1648 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1649 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1650 VPSlotTracker &SlotTracker) const { 1651 O << Indent << "WIDEN-CAST "; 1652 printAsOperand(O, SlotTracker); 1653 O << " = " << Instruction::getOpcodeName(Opcode); 1654 printFlags(O); 1655 printOperands(O, SlotTracker); 1656 O << " to " << *getResultType(); 1657 } 1658 #endif 1659 1660 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, 1661 VPCostContext &Ctx) const { 1662 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 1663 } 1664 1665 /// This function adds 1666 /// (0 * Step, 1 * Step, 2 * Step, ...) 1667 /// to each vector element of Val. 1668 /// \p Opcode is relevant for FP induction variable. 1669 static Value *getStepVector(Value *Val, Value *Step, 1670 Instruction::BinaryOps BinOp, ElementCount VF, 1671 IRBuilderBase &Builder) { 1672 assert(VF.isVector() && "only vector VFs are supported"); 1673 1674 // Create and check the types. 1675 auto *ValVTy = cast<VectorType>(Val->getType()); 1676 ElementCount VLen = ValVTy->getElementCount(); 1677 1678 Type *STy = Val->getType()->getScalarType(); 1679 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1680 "Induction Step must be an integer or FP"); 1681 assert(Step->getType() == STy && "Step has wrong type"); 1682 1683 SmallVector<Constant *, 8> Indices; 1684 1685 // Create a vector of consecutive numbers from zero to VF. 1686 VectorType *InitVecValVTy = ValVTy; 1687 if (STy->isFloatingPointTy()) { 1688 Type *InitVecValSTy = 1689 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 1690 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 1691 } 1692 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 1693 1694 if (STy->isIntegerTy()) { 1695 Step = Builder.CreateVectorSplat(VLen, Step); 1696 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1697 // FIXME: The newly created binary instructions should contain nsw/nuw 1698 // flags, which can be found from the original scalar operations. 1699 Step = Builder.CreateMul(InitVec, Step); 1700 return Builder.CreateAdd(Val, Step, "induction"); 1701 } 1702 1703 // Floating point induction. 1704 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1705 "Binary Opcode should be specified for FP induction"); 1706 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 1707 1708 Step = Builder.CreateVectorSplat(VLen, Step); 1709 Value *MulOp = Builder.CreateFMul(InitVec, Step); 1710 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1711 } 1712 1713 /// A helper function that returns an integer or floating-point constant with 1714 /// value C. 1715 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 1716 return Ty->isIntegerTy() ? 
ConstantInt::getSigned(Ty, C) 1717 : ConstantFP::get(Ty, C); 1718 } 1719 1720 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1721 assert(!State.Lane && "Int or FP induction being replicated."); 1722 1723 Value *Start = getStartValue()->getLiveInIRValue(); 1724 const InductionDescriptor &ID = getInductionDescriptor(); 1725 TruncInst *Trunc = getTruncInst(); 1726 IRBuilderBase &Builder = State.Builder; 1727 assert(getPHINode()->getType() == ID.getStartValue()->getType() && 1728 "Types must match"); 1729 assert(State.VF.isVector() && "must have vector VF"); 1730 1731 // The value from the original loop to which we are mapping the new induction 1732 // variable. 1733 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode(); 1734 1735 // Fast-math-flags propagate from the original induction instruction. 1736 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1737 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1738 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1739 1740 // Now do the actual transformations, and start with fetching the step value. 1741 Value *Step = State.get(getStepValue(), VPLane(0)); 1742 1743 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1744 "Expected either an induction phi-node or a truncate of it!"); 1745 1746 // Construct the initial value of the vector IV in the vector loop preheader 1747 auto CurrIP = Builder.saveIP(); 1748 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1749 Builder.SetInsertPoint(VectorPH->getTerminator()); 1750 if (isa<TruncInst>(EntryVal)) { 1751 assert(Start->getType()->isIntegerTy() && 1752 "Truncation requires an integer type"); 1753 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1754 Step = Builder.CreateTrunc(Step, TruncType); 1755 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1756 } 1757 1758 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1759 Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(), 1760 State.VF, State.Builder); 1761 1762 // We create vector phi nodes for both integer and floating-point induction 1763 // variables. Here, we determine the kind of arithmetic we will perform. 1764 Instruction::BinaryOps AddOp; 1765 Instruction::BinaryOps MulOp; 1766 if (Step->getType()->isIntegerTy()) { 1767 AddOp = Instruction::Add; 1768 MulOp = Instruction::Mul; 1769 } else { 1770 AddOp = ID.getInductionOpcode(); 1771 MulOp = Instruction::FMul; 1772 } 1773 1774 Value *SplatVF; 1775 if (VPValue *SplatVFOperand = getSplatVFValue()) { 1776 // The recipe has been unrolled. In that case, fetch the splat value for the 1777 // induction increment. 1778 SplatVF = State.get(SplatVFOperand); 1779 } else { 1780 // Multiply the vectorization factor by the step using integer or 1781 // floating-point arithmetic as appropriate. 1782 Type *StepType = Step->getType(); 1783 Value *RuntimeVF = State.get(getVFValue(), VPLane(0)); 1784 if (Step->getType()->isFloatingPointTy()) 1785 RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); 1786 else 1787 RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); 1788 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1789 1790 // Create a vector splat to use in the induction update. 1791 SplatVF = Builder.CreateVectorSplat(State.VF, Mul); 1792 } 1793 1794 Builder.restoreIP(CurrIP); 1795 1796 // We may need to add the step a number of times, depending on the unroll 1797 // factor. The last of those goes into the PHI. 
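// SteppedStart above is <Start, Start + Step, ..., Start + (VF-1) * Step>; it
// seeds the vector phi created below, whose update is the SplatVF increment.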
1798 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1799 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1800 VecInd->setDebugLoc(getDebugLoc()); 1801 State.set(this, VecInd); 1802 1803 Instruction *LastInduction = cast<Instruction>( 1804 Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); 1805 if (isa<TruncInst>(EntryVal)) 1806 State.addMetadata(LastInduction, EntryVal); 1807 LastInduction->setDebugLoc(getDebugLoc()); 1808 1809 VecInd->addIncoming(SteppedStart, VectorPH); 1810 // Add induction update using an incorrect block temporarily. The phi node 1811 // will be fixed after VPlan execution. Note that at this point the latch 1812 // block cannot be used, as it does not exist yet. 1813 // TODO: Model increment value in VPlan, by turning the recipe into a 1814 // multi-def and a subclass of VPHeaderPHIRecipe. 1815 VecInd->addIncoming(LastInduction, VectorPH); 1816 } 1817 1818 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1819 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1820 VPSlotTracker &SlotTracker) const { 1821 O << Indent; 1822 printAsOperand(O, SlotTracker); 1823 O << " = WIDEN-INDUCTION "; 1824 printOperands(O, SlotTracker); 1825 1826 if (auto *TI = getTruncInst()) 1827 O << " (truncated to " << *TI->getType() << ")"; 1828 } 1829 #endif 1830 1831 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1832 // The step may be defined by a recipe in the preheader (e.g. if it requires 1833 // SCEV expansion), but for the canonical induction the step is required to be 1834 // 1, which is represented as live-in. 1835 if (getStepValue()->getDefiningRecipe()) 1836 return false; 1837 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1838 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1839 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1840 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1841 getScalarType() == CanIV->getScalarType(); 1842 } 1843 1844 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1845 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1846 VPSlotTracker &SlotTracker) const { 1847 O << Indent; 1848 printAsOperand(O, SlotTracker); 1849 O << " = DERIVED-IV "; 1850 getStartValue()->printAsOperand(O, SlotTracker); 1851 O << " + "; 1852 getOperand(1)->printAsOperand(O, SlotTracker); 1853 O << " * "; 1854 getStepValue()->printAsOperand(O, SlotTracker); 1855 } 1856 #endif 1857 1858 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1859 // Fast-math-flags propagate from the original induction instruction. 1860 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1861 if (hasFastMathFlags()) 1862 State.Builder.setFastMathFlags(getFastMathFlags()); 1863 1864 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1865 /// variable on which to base the steps, \p Step is the size of the step. 1866 1867 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 1868 Value *Step = State.get(getStepValue(), VPLane(0)); 1869 IRBuilderBase &Builder = State.Builder; 1870 1871 // Ensure step has the same type as that of scalar IV. 1872 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1873 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1874 1875 // We build scalar steps for both integer and floating-point induction 1876 // variables. Here, we determine the kind of arithmetic we will perform. 
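// Integer IVs use add/mul; FP IVs use the induction's own fadd/fsub opcode
// together with fmul.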
1877 Instruction::BinaryOps AddOp; 1878 Instruction::BinaryOps MulOp; 1879 if (BaseIVTy->isIntegerTy()) { 1880 AddOp = Instruction::Add; 1881 MulOp = Instruction::Mul; 1882 } else { 1883 AddOp = InductionOpcode; 1884 MulOp = Instruction::FMul; 1885 } 1886 1887 // Determine the number of scalars we need to generate for each unroll 1888 // iteration. 1889 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1890 // Compute the scalar steps and save the results in State. 1891 Type *IntStepTy = 1892 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1893 Type *VecIVTy = nullptr; 1894 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1895 if (!FirstLaneOnly && State.VF.isScalable()) { 1896 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1897 UnitStepVec = 1898 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1899 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1900 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1901 } 1902 1903 unsigned StartLane = 0; 1904 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1905 if (State.Lane) { 1906 StartLane = State.Lane->getKnownLane(); 1907 EndLane = StartLane + 1; 1908 } 1909 Value *StartIdx0 = 1910 createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); 1911 1912 if (!FirstLaneOnly && State.VF.isScalable()) { 1913 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1914 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1915 if (BaseIVTy->isFloatingPointTy()) 1916 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1917 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1918 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1919 State.set(this, Add); 1920 // It's useful to record the lane values too for the known minimum number 1921 // of elements so we do those below. This improves the code quality when 1922 // trying to extract the first element, for example. 1923 } 1924 1925 if (BaseIVTy->isFloatingPointTy()) 1926 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1927 1928 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1929 Value *StartIdx = Builder.CreateBinOp( 1930 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1931 // The step returned by `createStepForVF` is a runtime-evaluated value 1932 // when VF is scalable. Otherwise, it should be folded into a Constant. 1933 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1934 "Expected StartIdx to be folded to a constant when VF is not " 1935 "scalable"); 1936 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1937 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1938 State.set(this, Add, VPLane(Lane)); 1939 } 1940 } 1941 1942 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1943 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1944 VPSlotTracker &SlotTracker) const { 1945 O << Indent; 1946 printAsOperand(O, SlotTracker); 1947 O << " = SCALAR-STEPS "; 1948 printOperands(O, SlotTracker); 1949 } 1950 #endif 1951 1952 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1953 assert(State.VF.isVector() && "not widening"); 1954 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1955 // Construct a vector GEP by widening the operands of the scalar GEP as 1956 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1957 // results in a vector of pointers when at least one operand of the GEP 1958 // is vector-typed. 
Thus, to keep the representation compact, we only use 1959 // vector-typed operands for loop-varying values. 1960 1961 if (areAllOperandsInvariant()) { 1962 // If we are vectorizing, but the GEP has only loop-invariant operands, 1963 // the GEP we build (by only using vector-typed operands for 1964 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1965 // produce a vector of pointers, we need to either arbitrarily pick an 1966 // operand to broadcast, or broadcast a clone of the original GEP. 1967 // Here, we broadcast a clone of the original. 1968 // 1969 // TODO: If at some point we decide to scalarize instructions having 1970 // loop-invariant operands, this special case will no longer be 1971 // required. We would add the scalarization decision to 1972 // collectLoopScalars() and teach getVectorValue() to broadcast 1973 // the lane-zero scalar value. 1974 SmallVector<Value *> Ops; 1975 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1976 Ops.push_back(State.get(getOperand(I), VPLane(0))); 1977 1978 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1979 ArrayRef(Ops).drop_front(), "", 1980 getGEPNoWrapFlags()); 1981 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1982 State.set(this, Splat); 1983 State.addMetadata(Splat, GEP); 1984 } else { 1985 // If the GEP has at least one loop-varying operand, we are sure to 1986 // produce a vector of pointers unless VF is scalar. 1987 // The pointer operand of the new GEP. If it's loop-invariant, we 1988 // won't broadcast it. 1989 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 1990 : State.get(getOperand(0)); 1991 1992 // Collect all the indices for the new GEP. If any index is 1993 // loop-invariant, we won't broadcast it. 1994 SmallVector<Value *, 4> Indices; 1995 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1996 VPValue *Operand = getOperand(I); 1997 if (isIndexLoopInvariant(I - 1)) 1998 Indices.push_back(State.get(Operand, VPLane(0))); 1999 else 2000 Indices.push_back(State.get(Operand)); 2001 } 2002 2003 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 2004 // but it should be a vector, otherwise. 2005 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 2006 Indices, "", getGEPNoWrapFlags()); 2007 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 2008 "NewGEP is not a pointer vector"); 2009 State.set(this, NewGEP); 2010 State.addMetadata(NewGEP, GEP); 2011 } 2012 } 2013 2014 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2015 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 2016 VPSlotTracker &SlotTracker) const { 2017 O << Indent << "WIDEN-GEP "; 2018 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 2019 for (size_t I = 0; I < getNumOperands() - 1; ++I) 2020 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 2021 2022 O << " "; 2023 printAsOperand(O, SlotTracker); 2024 O << " = getelementptr"; 2025 printFlags(O); 2026 printOperands(O, SlotTracker); 2027 } 2028 #endif 2029 2030 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, 2031 unsigned CurrentPart, IRBuilderBase &Builder) { 2032 // Use i32 for the gep index type when the value is constant, 2033 // or query DataLayout for a more suitable index type otherwise. 2034 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 2035 return IsScalable && (IsReverse || CurrentPart > 0) 2036 ? 
DL.getIndexType(Builder.getPtrTy(0)) 2037 : Builder.getInt32Ty(); 2038 } 2039 2040 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) { 2041 auto &Builder = State.Builder; 2042 State.setDebugLocFrom(getDebugLoc()); 2043 unsigned CurrentPart = getUnrollPart(*this); 2044 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, 2045 CurrentPart, Builder); 2046 2047 // The wide store needs to start at the last vector element. 2048 Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); 2049 if (IndexTy != RunTimeVF->getType()) 2050 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); 2051 // NumElt = -CurrentPart * RunTimeVF 2052 Value *NumElt = Builder.CreateMul( 2053 ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); 2054 // LastLane = 1 - RunTimeVF 2055 Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 2056 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2057 Value *ResultPtr = 2058 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); 2059 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", 2060 getGEPNoWrapFlags()); 2061 2062 State.set(this, ResultPtr, /*IsScalar*/ true); 2063 } 2064 2065 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2066 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2067 VPSlotTracker &SlotTracker) const { 2068 O << Indent; 2069 printAsOperand(O, SlotTracker); 2070 O << " = reverse-vector-pointer"; 2071 printFlags(O); 2072 printOperands(O, SlotTracker); 2073 } 2074 #endif 2075 2076 void VPVectorPointerRecipe::execute(VPTransformState &State) { 2077 auto &Builder = State.Builder; 2078 State.setDebugLocFrom(getDebugLoc()); 2079 unsigned CurrentPart = getUnrollPart(*this); 2080 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, 2081 CurrentPart, Builder); 2082 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2083 2084 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 2085 Value *ResultPtr = 2086 Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); 2087 2088 State.set(this, ResultPtr, /*IsScalar*/ true); 2089 } 2090 2091 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2092 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2093 VPSlotTracker &SlotTracker) const { 2094 O << Indent; 2095 printAsOperand(O, SlotTracker); 2096 O << " = vector-pointer "; 2097 2098 printOperands(O, SlotTracker); 2099 } 2100 #endif 2101 2102 void VPBlendRecipe::execute(VPTransformState &State) { 2103 assert(isNormalized() && "Expected blend to be normalized!"); 2104 State.setDebugLocFrom(getDebugLoc()); 2105 // We know that all PHIs in non-header blocks are converted into 2106 // selects, so we don't have to worry about the insertion order and we 2107 // can just use the builder. 2108 // At this point we generate the predication tree. There may be 2109 // duplications since this is a simple recursive scan, but future 2110 // optimizations will clean it up. 2111 2112 unsigned NumIncoming = getNumIncomingValues(); 2113 2114 // Generate a sequence of selects of the form: 2115 // SELECT(Mask3, In3, 2116 // SELECT(Mask2, In2, 2117 // SELECT(Mask1, In1, 2118 // In0))) 2119 // Note that Mask0 is never used: lanes for which no path reaches this phi and 2120 // are essentially undef are taken from In0. 
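// If only the first lane of the blend is demanded, the selects below are
// generated on first-lane scalar values rather than on full vectors.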
2121 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 2122 Value *Result = nullptr; 2123 for (unsigned In = 0; In < NumIncoming; ++In) { 2124 // We might have single edge PHIs (blocks) - use an identity 2125 // 'select' for the first PHI operand. 2126 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 2127 if (In == 0) 2128 Result = In0; // Initialize with the first incoming value. 2129 else { 2130 // Select between the current value and the previous incoming edge 2131 // based on the incoming mask. 2132 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 2133 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 2134 } 2135 } 2136 State.set(this, Result, OnlyFirstLaneUsed); 2137 } 2138 2139 InstructionCost VPBlendRecipe::computeCost(ElementCount VF, 2140 VPCostContext &Ctx) const { 2141 // Handle cases where only the first lane is used the same way as the legacy 2142 // cost model. 2143 if (vputils::onlyFirstLaneUsed(this)) 2144 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 2145 2146 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 2147 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); 2148 return (getNumIncomingValues() - 1) * 2149 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, 2150 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); 2151 } 2152 2153 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2154 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 2155 VPSlotTracker &SlotTracker) const { 2156 O << Indent << "BLEND "; 2157 printAsOperand(O, SlotTracker); 2158 O << " ="; 2159 if (getNumIncomingValues() == 1) { 2160 // Not a User of any mask: not really blending, this is a 2161 // single-predecessor phi. 2162 O << " "; 2163 getIncomingValue(0)->printAsOperand(O, SlotTracker); 2164 } else { 2165 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 2166 O << " "; 2167 getIncomingValue(I)->printAsOperand(O, SlotTracker); 2168 if (I == 0) 2169 continue; 2170 O << "/"; 2171 getMask(I)->printAsOperand(O, SlotTracker); 2172 } 2173 } 2174 } 2175 #endif 2176 2177 void VPReductionRecipe::execute(VPTransformState &State) { 2178 assert(!State.Lane && "Reduction being replicated."); 2179 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2180 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2181 // Propagate the fast-math flags carried by the underlying instruction. 2182 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 2183 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2184 State.setDebugLocFrom(getDebugLoc()); 2185 Value *NewVecOp = State.get(getVecOp()); 2186 if (VPValue *Cond = getCondOp()) { 2187 Value *NewCond = State.get(Cond, State.VF.isScalar()); 2188 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 2189 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 2190 2191 Value *Start; 2192 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 2193 Start = RdxDesc.getRecurrenceStartValue(); 2194 else 2195 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 2196 RdxDesc.getFastMathFlags()); 2197 if (State.VF.isVector()) 2198 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 2199 2200 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 2201 NewVecOp = Select; 2202 } 2203 Value *NewRed; 2204 Value *NextInChain; 2205 if (IsOrdered) { 2206 if (State.VF.isVector()) 2207 NewRed = 2208 createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); 2209 else 2210 NewRed = State.Builder.CreateBinOp( 2211 (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp); 2212 PrevInChain = NewRed; 2213 NextInChain = NewRed; 2214 } else { 2215 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2216 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 2217 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2218 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 2219 NewRed, PrevInChain); 2220 else 2221 NextInChain = State.Builder.CreateBinOp( 2222 (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain); 2223 } 2224 State.set(this, NextInChain, /*IsScalar*/ true); 2225 } 2226 2227 void VPReductionEVLRecipe::execute(VPTransformState &State) { 2228 assert(!State.Lane && "Reduction being replicated."); 2229 2230 auto &Builder = State.Builder; 2231 // Propagate the fast-math flags carried by the underlying instruction. 2232 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 2233 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2234 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2235 2236 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2237 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 2238 Value *VecOp = State.get(getVecOp()); 2239 Value *EVL = State.get(getEVL(), VPLane(0)); 2240 2241 VectorBuilder VBuilder(Builder); 2242 VBuilder.setEVL(EVL); 2243 Value *Mask; 2244 // TODO: move the all-true mask generation into VectorBuilder. 2245 if (VPValue *CondOp = getCondOp()) 2246 Mask = State.get(CondOp); 2247 else 2248 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2249 VBuilder.setMask(Mask); 2250 2251 Value *NewRed; 2252 if (isOrdered()) { 2253 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 2254 } else { 2255 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 2256 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2257 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2258 else 2259 NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(), 2260 NewRed, Prev); 2261 } 2262 State.set(this, NewRed, /*IsScalar*/ true); 2263 } 2264 2265 InstructionCost VPReductionRecipe::computeCost(ElementCount VF, 2266 VPCostContext &Ctx) const { 2267 RecurKind RdxKind = RdxDesc.getRecurrenceKind(); 2268 Type *ElementTy = Ctx.Types.inferScalarType(this); 2269 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); 2270 unsigned Opcode = RdxDesc.getOpcode(); 2271 2272 // TODO: Support any-of and in-loop reductions. 
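// Until then, the asserts below ensure such reductions do not reach this
// code path unless costs are being forced via ForceTargetInstructionCost.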
2273 assert( 2274 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || 2275 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2276 "Any-of reduction not implemented in VPlan-based cost model currently."); 2277 assert( 2278 (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() || 2279 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2280 "In-loop reduction not implemented in VPlan-based cost model currently."); 2281 2282 assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() && 2283 "Inferred type and recurrence type mismatch."); 2284 2285 // Cost = Reduction cost + BinOp cost 2286 InstructionCost Cost = 2287 Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind); 2288 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { 2289 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 2290 return Cost + Ctx.TTI.getMinMaxReductionCost( 2291 Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind); 2292 } 2293 2294 return Cost + Ctx.TTI.getArithmeticReductionCost( 2295 Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind); 2296 } 2297 2298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2299 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2300 VPSlotTracker &SlotTracker) const { 2301 O << Indent << "REDUCE "; 2302 printAsOperand(O, SlotTracker); 2303 O << " = "; 2304 getChainOp()->printAsOperand(O, SlotTracker); 2305 O << " +"; 2306 if (isa<FPMathOperator>(getUnderlyingInstr())) 2307 O << getUnderlyingInstr()->getFastMathFlags(); 2308 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2309 getVecOp()->printAsOperand(O, SlotTracker); 2310 if (isConditional()) { 2311 O << ", "; 2312 getCondOp()->printAsOperand(O, SlotTracker); 2313 } 2314 O << ")"; 2315 if (RdxDesc.IntermediateStore) 2316 O << " (with final reduction value stored in invariant address sank " 2317 "outside of loop)"; 2318 } 2319 2320 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2321 VPSlotTracker &SlotTracker) const { 2322 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2323 O << Indent << "REDUCE "; 2324 printAsOperand(O, SlotTracker); 2325 O << " = "; 2326 getChainOp()->printAsOperand(O, SlotTracker); 2327 O << " +"; 2328 if (isa<FPMathOperator>(getUnderlyingInstr())) 2329 O << getUnderlyingInstr()->getFastMathFlags(); 2330 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2331 getVecOp()->printAsOperand(O, SlotTracker); 2332 O << ", "; 2333 getEVL()->printAsOperand(O, SlotTracker); 2334 if (isConditional()) { 2335 O << ", "; 2336 getCondOp()->printAsOperand(O, SlotTracker); 2337 } 2338 O << ")"; 2339 if (RdxDesc.IntermediateStore) 2340 O << " (with final reduction value stored in invariant address sank " 2341 "outside of loop)"; 2342 } 2343 #endif 2344 2345 bool VPReplicateRecipe::shouldPack() const { 2346 // Find if the recipe is used by a widened recipe via an intervening 2347 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 
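// That is, packing is needed whenever some user of the merge phi consumes the
// value as a vector rather than as per-lane scalars.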
2348 return any_of(users(), [](const VPUser *U) { 2349 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2350 return any_of(PredR->users(), [PredR](const VPUser *U) { 2351 return !U->usesScalars(PredR); 2352 }); 2353 return false; 2354 }); 2355 } 2356 2357 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, 2358 VPCostContext &Ctx) const { 2359 Instruction *UI = cast<Instruction>(getUnderlyingValue()); 2360 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan 2361 // transform, avoid computing their cost multiple times for now. 2362 Ctx.SkipCostComputation.insert(UI); 2363 return Ctx.getLegacyCost(UI, VF); 2364 } 2365 2366 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2367 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2368 VPSlotTracker &SlotTracker) const { 2369 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2370 2371 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2372 printAsOperand(O, SlotTracker); 2373 O << " = "; 2374 } 2375 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2376 O << "call"; 2377 printFlags(O); 2378 O << "@" << CB->getCalledFunction()->getName() << "("; 2379 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2380 O, [&O, &SlotTracker](VPValue *Op) { 2381 Op->printAsOperand(O, SlotTracker); 2382 }); 2383 O << ")"; 2384 } else { 2385 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2386 printFlags(O); 2387 printOperands(O, SlotTracker); 2388 } 2389 2390 if (shouldPack()) 2391 O << " (S->V)"; 2392 } 2393 #endif 2394 2395 Value *VPScalarCastRecipe ::generate(VPTransformState &State) { 2396 State.setDebugLocFrom(getDebugLoc()); 2397 assert(vputils::onlyFirstLaneUsed(this) && 2398 "Codegen only implemented for first lane."); 2399 switch (Opcode) { 2400 case Instruction::SExt: 2401 case Instruction::ZExt: 2402 case Instruction::Trunc: { 2403 // Note: SExt/ZExt not used yet. 2404 Value *Op = State.get(getOperand(0), VPLane(0)); 2405 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2406 } 2407 default: 2408 llvm_unreachable("opcode not implemented yet"); 2409 } 2410 } 2411 2412 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2413 State.set(this, generate(State), VPLane(0)); 2414 } 2415 2416 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2417 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2418 VPSlotTracker &SlotTracker) const { 2419 O << Indent << "SCALAR-CAST "; 2420 printAsOperand(O, SlotTracker); 2421 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2422 printOperands(O, SlotTracker); 2423 O << " to " << *ResultTy; 2424 } 2425 #endif 2426 2427 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2428 assert(State.Lane && "Branch on Mask works only on single instance."); 2429 2430 unsigned Lane = State.Lane->getKnownLane(); 2431 2432 Value *ConditionBit = nullptr; 2433 VPValue *BlockInMask = getMask(); 2434 if (BlockInMask) { 2435 ConditionBit = State.get(BlockInMask); 2436 if (ConditionBit->getType()->isVectorTy()) 2437 ConditionBit = State.Builder.CreateExtractElement( 2438 ConditionBit, State.Builder.getInt32(Lane)); 2439 } else // Block in mask is all-one. 2440 ConditionBit = State.Builder.getTrue(); 2441 2442 // Replace the temporary unreachable terminator with a new conditional branch, 2443 // whose two destinations will be set later when they are created. 
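// (The successor blocks of the replicate region are not materialized yet, so
// the branch is left with null successors for now.)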
2444 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2445 assert(isa<UnreachableInst>(CurrentTerminator) && 2446 "Expected to replace unreachable terminator with conditional branch."); 2447 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2448 CondBr->setSuccessor(0, nullptr); 2449 ReplaceInstWithInst(CurrentTerminator, CondBr); 2450 } 2451 2452 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, 2453 VPCostContext &Ctx) const { 2454 // The legacy cost model doesn't assign costs to branches for individual 2455 // replicate regions. Match the current behavior in the VPlan cost model for 2456 // now. 2457 return 0; 2458 } 2459 2460 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2461 State.setDebugLocFrom(getDebugLoc()); 2462 assert(State.Lane && "Predicated instruction PHI works per instance."); 2463 Instruction *ScalarPredInst = 2464 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 2465 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2466 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2467 assert(PredicatingBB && "Predicated block has no single predecessor."); 2468 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2469 "operand must be VPReplicateRecipe"); 2470 2471 // By current pack/unpack logic we need to generate only a single phi node: if 2472 // a vector value for the predicated instruction exists at this point it means 2473 // the instruction has vector users only, and a phi for the vector value is 2474 // needed. In this case the recipe of the predicated instruction is marked to 2475 // also do that packing, thereby "hoisting" the insert-element sequence. 2476 // Otherwise, a phi node for the scalar value is needed. 2477 if (State.hasVectorValue(getOperand(0))) { 2478 Value *VectorValue = State.get(getOperand(0)); 2479 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2480 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2481 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2482 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2483 if (State.hasVectorValue(this)) 2484 State.reset(this, VPhi); 2485 else 2486 State.set(this, VPhi); 2487 // NOTE: Currently we need to update the value of the operand, so the next 2488 // predicated iteration inserts its generated value in the correct vector. 2489 State.reset(getOperand(0), VPhi); 2490 } else { 2491 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) 2492 return; 2493 2494 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2495 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2496 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2497 PredicatingBB); 2498 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2499 if (State.hasScalarValue(this, *State.Lane)) 2500 State.reset(this, Phi, *State.Lane); 2501 else 2502 State.set(this, Phi, *State.Lane); 2503 // NOTE: Currently we need to update the value of the operand, so the next 2504 // predicated iteration inserts its generated value in the correct vector. 
2505 State.reset(getOperand(0), Phi, *State.Lane); 2506 } 2507 } 2508 2509 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2510 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2511 VPSlotTracker &SlotTracker) const { 2512 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2513 printAsOperand(O, SlotTracker); 2514 O << " = "; 2515 printOperands(O, SlotTracker); 2516 } 2517 #endif 2518 2519 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2520 VPCostContext &Ctx) const { 2521 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2522 const Align Alignment = 2523 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2524 unsigned AS = 2525 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2526 2527 if (!Consecutive) { 2528 // TODO: Using the original IR may not be accurate. 2529 // Currently, ARM will use the underlying IR to calculate gather/scatter 2530 // instruction cost. 2531 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2532 assert(!Reverse && 2533 "Inconsecutive memory access should not have the order."); 2534 return Ctx.TTI.getAddressComputationCost(Ty) + 2535 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2536 IsMasked, Alignment, Ctx.CostKind, 2537 &Ingredient); 2538 } 2539 2540 InstructionCost Cost = 0; 2541 if (IsMasked) { 2542 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2543 AS, Ctx.CostKind); 2544 } else { 2545 TTI::OperandValueInfo OpInfo = 2546 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2547 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2548 Ctx.CostKind, OpInfo, &Ingredient); 2549 } 2550 if (!Reverse) 2551 return Cost; 2552 2553 return Cost += 2554 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2555 cast<VectorType>(Ty), {}, Ctx.CostKind, 0); 2556 } 2557 2558 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2559 auto *LI = cast<LoadInst>(&Ingredient); 2560 2561 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2562 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2563 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2564 bool CreateGather = !isConsecutive(); 2565 2566 auto &Builder = State.Builder; 2567 State.setDebugLocFrom(getDebugLoc()); 2568 Value *Mask = nullptr; 2569 if (auto *VPMask = getMask()) { 2570 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2571 // of a null all-one mask is a null mask. 2572 Mask = State.get(VPMask); 2573 if (isReverse()) 2574 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2575 } 2576 2577 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); 2578 Value *NewLI; 2579 if (CreateGather) { 2580 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2581 "wide.masked.gather"); 2582 } else if (Mask) { 2583 NewLI = 2584 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2585 PoisonValue::get(DataTy), "wide.masked.load"); 2586 } else { 2587 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2588 } 2589 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
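// That is, metadata is attached to the wide load itself, while the value
// recorded for this recipe below is the reversed shuffle for reverse accesses.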
2590 State.addMetadata(NewLI, LI); 2591 if (Reverse) 2592 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2593 State.set(this, NewLI); 2594 } 2595 2596 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2597 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2598 VPSlotTracker &SlotTracker) const { 2599 O << Indent << "WIDEN "; 2600 printAsOperand(O, SlotTracker); 2601 O << " = load "; 2602 printOperands(O, SlotTracker); 2603 } 2604 #endif 2605 2606 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2607 /// dependence w/o affecting the result. 2608 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2609 Value *EVL, const Twine &Name) { 2610 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2611 Value *AllTrueMask = 2612 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2613 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2614 {Operand, AllTrueMask, EVL}, nullptr, Name); 2615 } 2616 2617 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2618 auto *LI = cast<LoadInst>(&Ingredient); 2619 2620 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2621 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2622 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2623 bool CreateGather = !isConsecutive(); 2624 2625 auto &Builder = State.Builder; 2626 State.setDebugLocFrom(getDebugLoc()); 2627 CallInst *NewLI; 2628 Value *EVL = State.get(getEVL(), VPLane(0)); 2629 Value *Addr = State.get(getAddr(), !CreateGather); 2630 Value *Mask = nullptr; 2631 if (VPValue *VPMask = getMask()) { 2632 Mask = State.get(VPMask); 2633 if (isReverse()) 2634 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2635 } else { 2636 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2637 } 2638 2639 if (CreateGather) { 2640 NewLI = 2641 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2642 nullptr, "wide.masked.gather"); 2643 } else { 2644 VectorBuilder VBuilder(Builder); 2645 VBuilder.setEVL(EVL).setMask(Mask); 2646 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2647 Instruction::Load, DataTy, Addr, "vp.op.load")); 2648 } 2649 NewLI->addParamAttr( 2650 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2651 State.addMetadata(NewLI, LI); 2652 Instruction *Res = NewLI; 2653 if (isReverse()) 2654 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2655 State.set(this, Res); 2656 } 2657 2658 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, 2659 VPCostContext &Ctx) const { 2660 if (!Consecutive || IsMasked) 2661 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2662 2663 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2664 // here because the EVL recipes using EVL to replace the tail mask. But in the 2665 // legacy model, it will always calculate the cost of mask. 2666 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2667 // don't need to compare to the legacy cost model. 
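// (An EVL-based load is tail folded via its EVL operand rather than a mask,
// but the legacy model costs it as a masked access, so do the same here.)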
2668 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2669 const Align Alignment = 2670 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2671 unsigned AS = 2672 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2673 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2674 Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); 2675 if (!Reverse) 2676 return Cost; 2677 2678 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2679 cast<VectorType>(Ty), {}, Ctx.CostKind, 2680 0); 2681 } 2682 2683 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2684 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2685 VPSlotTracker &SlotTracker) const { 2686 O << Indent << "WIDEN "; 2687 printAsOperand(O, SlotTracker); 2688 O << " = vp.load "; 2689 printOperands(O, SlotTracker); 2690 } 2691 #endif 2692 2693 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2694 auto *SI = cast<StoreInst>(&Ingredient); 2695 2696 VPValue *StoredVPValue = getStoredValue(); 2697 bool CreateScatter = !isConsecutive(); 2698 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2699 2700 auto &Builder = State.Builder; 2701 State.setDebugLocFrom(getDebugLoc()); 2702 2703 Value *Mask = nullptr; 2704 if (auto *VPMask = getMask()) { 2705 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2706 // of a null all-one mask is a null mask. 2707 Mask = State.get(VPMask); 2708 if (isReverse()) 2709 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2710 } 2711 2712 Value *StoredVal = State.get(StoredVPValue); 2713 if (isReverse()) { 2714 // If we store to reverse consecutive memory locations, then we need 2715 // to reverse the order of elements in the stored value. 2716 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2717 // We don't want to update the value in the map as it might be used in 2718 // another expression. So don't call resetVectorValue(StoredVal). 
2719 } 2720 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); 2721 Instruction *NewSI = nullptr; 2722 if (CreateScatter) 2723 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2724 else if (Mask) 2725 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2726 else 2727 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2728 State.addMetadata(NewSI, SI); 2729 } 2730 2731 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2732 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2733 VPSlotTracker &SlotTracker) const { 2734 O << Indent << "WIDEN store "; 2735 printOperands(O, SlotTracker); 2736 } 2737 #endif 2738 2739 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2740 auto *SI = cast<StoreInst>(&Ingredient); 2741 2742 VPValue *StoredValue = getStoredValue(); 2743 bool CreateScatter = !isConsecutive(); 2744 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2745 2746 auto &Builder = State.Builder; 2747 State.setDebugLocFrom(getDebugLoc()); 2748 2749 CallInst *NewSI = nullptr; 2750 Value *StoredVal = State.get(StoredValue); 2751 Value *EVL = State.get(getEVL(), VPLane(0)); 2752 if (isReverse()) 2753 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2754 Value *Mask = nullptr; 2755 if (VPValue *VPMask = getMask()) { 2756 Mask = State.get(VPMask); 2757 if (isReverse()) 2758 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2759 } else { 2760 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2761 } 2762 Value *Addr = State.get(getAddr(), !CreateScatter); 2763 if (CreateScatter) { 2764 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2765 Intrinsic::vp_scatter, 2766 {StoredVal, Addr, Mask, EVL}); 2767 } else { 2768 VectorBuilder VBuilder(Builder); 2769 VBuilder.setEVL(EVL).setMask(Mask); 2770 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2771 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2772 {StoredVal, Addr})); 2773 } 2774 NewSI->addParamAttr( 2775 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2776 State.addMetadata(NewSI, SI); 2777 } 2778 2779 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, 2780 VPCostContext &Ctx) const { 2781 if (!Consecutive || IsMasked) 2782 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2783 2784 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2785 // here because the EVL recipes using EVL to replace the tail mask. But in the 2786 // legacy model, it will always calculate the cost of mask. 2787 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2788 // don't need to compare to the legacy cost model. 
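// (As for EVL loads above, the store is tail folded via its EVL operand
// rather than a mask, but is costed as a masked store to match the legacy
// model.)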
2789 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2790 const Align Alignment = 2791 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2792 unsigned AS = 2793 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2794 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2795 Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); 2796 if (!Reverse) 2797 return Cost; 2798 2799 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2800 cast<VectorType>(Ty), {}, Ctx.CostKind, 2801 0); 2802 } 2803 2804 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2805 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2806 VPSlotTracker &SlotTracker) const { 2807 O << Indent << "WIDEN vp.store "; 2808 printOperands(O, SlotTracker); 2809 } 2810 #endif 2811 2812 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2813 VectorType *DstVTy, const DataLayout &DL) { 2814 // Verify that V is a vector type with same number of elements as DstVTy. 2815 auto VF = DstVTy->getElementCount(); 2816 auto *SrcVecTy = cast<VectorType>(V->getType()); 2817 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2818 Type *SrcElemTy = SrcVecTy->getElementType(); 2819 Type *DstElemTy = DstVTy->getElementType(); 2820 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2821 "Vector elements must have same size"); 2822 2823 // Do a direct cast if element types are castable. 2824 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2825 return Builder.CreateBitOrPointerCast(V, DstVTy); 2826 } 2827 // V cannot be directly casted to desired vector type. 2828 // May happen when V is a floating point vector but DstVTy is a vector of 2829 // pointers or vice-versa. Handle this using a two-step bitcast using an 2830 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2831 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2832 "Only one type should be a pointer type"); 2833 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2834 "Only one type should be a floating point type"); 2835 Type *IntTy = 2836 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2837 auto *VecIntTy = VectorType::get(IntTy, VF); 2838 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2839 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2840 } 2841 2842 /// Return a vector containing interleaved elements from multiple 2843 /// smaller input vectors. 2844 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2845 const Twine &Name) { 2846 unsigned Factor = Vals.size(); 2847 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2848 2849 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2850 #ifndef NDEBUG 2851 for (Value *Val : Vals) 2852 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2853 #endif 2854 2855 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2856 // must use intrinsics to interleave. 2857 if (VecTy->isScalableTy()) { 2858 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2859 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2860 Vals, 2861 /*FMFSource=*/nullptr, Name); 2862 } 2863 2864 // Fixed length. Start by concatenating all vectors into a wide vector. 
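// E.g. with factor 2 and 4 elements per input, <a0 a1 a2 a3 b0 b1 b2 b3> is
// shuffled below into <a0 b0 a1 b1 a2 b2 a3 b3>.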
2865 Value *WideVec = concatenateVectors(Builder, Vals); 2866 2867 // Interleave the elements into the wide vector. 2868 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2869 return Builder.CreateShuffleVector( 2870 WideVec, createInterleaveMask(NumElts, Factor), Name); 2871 } 2872 2873 // Try to vectorize the interleave group that \p Instr belongs to. 2874 // 2875 // E.g. Translate following interleaved load group (factor = 3): 2876 // for (i = 0; i < N; i+=3) { 2877 // R = Pic[i]; // Member of index 0 2878 // G = Pic[i+1]; // Member of index 1 2879 // B = Pic[i+2]; // Member of index 2 2880 // ... // do something to R, G, B 2881 // } 2882 // To: 2883 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2884 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2885 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2886 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2887 // 2888 // Or translate following interleaved store group (factor = 3): 2889 // for (i = 0; i < N; i+=3) { 2890 // ... do something to R, G, B 2891 // Pic[i] = R; // Member of index 0 2892 // Pic[i+1] = G; // Member of index 1 2893 // Pic[i+2] = B; // Member of index 2 2894 // } 2895 // To: 2896 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2897 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2898 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2899 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2900 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2901 void VPInterleaveRecipe::execute(VPTransformState &State) { 2902 assert(!State.Lane && "Interleave group being replicated."); 2903 const InterleaveGroup<Instruction> *Group = IG; 2904 Instruction *Instr = Group->getInsertPos(); 2905 2906 // Prepare for the vector type of the interleaved load/store. 2907 Type *ScalarTy = getLoadStoreType(Instr); 2908 unsigned InterleaveFactor = Group->getFactor(); 2909 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2910 2911 // TODO: extend the masked interleaved-group support to reversed access. 2912 VPValue *BlockInMask = getMask(); 2913 assert((!BlockInMask || !Group->isReverse()) && 2914 "Reversed masked interleave-group not supported."); 2915 2916 VPValue *Addr = getAddr(); 2917 Value *ResAddr = State.get(Addr, VPLane(0)); 2918 if (auto *I = dyn_cast<Instruction>(ResAddr)) 2919 State.setDebugLocFrom(I->getDebugLoc()); 2920 2921 // If the group is reverse, adjust the index to refer to the last vector lane 2922 // instead of the first. We adjust the index from the first vector lane, 2923 // rather than directly getting the pointer for lane VF - 1, because the 2924 // pointer operand of the interleaved access is supposed to be uniform. 
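// Concretely, the base pointer is moved back by (VF - 1) * Factor elements
// below, so the wide access still starts at the lowest address.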
2925 if (Group->isReverse()) { 2926 Value *RuntimeVF = 2927 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2928 Value *Index = 2929 State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2930 Index = State.Builder.CreateMul(Index, 2931 State.Builder.getInt32(Group->getFactor())); 2932 Index = State.Builder.CreateNeg(Index); 2933 2934 bool InBounds = false; 2935 if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) 2936 InBounds = Gep->isInBounds(); 2937 ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); 2938 } 2939 2940 State.setDebugLocFrom(Instr->getDebugLoc()); 2941 Value *PoisonVec = PoisonValue::get(VecTy); 2942 2943 auto CreateGroupMask = [&BlockInMask, &State, 2944 &InterleaveFactor](Value *MaskForGaps) -> Value * { 2945 if (State.VF.isScalable()) { 2946 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2947 assert(InterleaveFactor == 2 && 2948 "Unsupported deinterleave factor for scalable vectors"); 2949 auto *ResBlockInMask = State.get(BlockInMask); 2950 SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; 2951 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2952 State.VF.getKnownMinValue() * 2, true); 2953 return State.Builder.CreateIntrinsic( 2954 MaskTy, Intrinsic::vector_interleave2, Ops, 2955 /*FMFSource=*/nullptr, "interleaved.mask"); 2956 } 2957 2958 if (!BlockInMask) 2959 return MaskForGaps; 2960 2961 Value *ResBlockInMask = State.get(BlockInMask); 2962 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2963 ResBlockInMask, 2964 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2965 "interleaved.mask"); 2966 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2967 ShuffledMask, MaskForGaps) 2968 : ShuffledMask; 2969 }; 2970 2971 const DataLayout &DL = Instr->getDataLayout(); 2972 // Vectorize the interleaved load group. 2973 if (isa<LoadInst>(Instr)) { 2974 Value *MaskForGaps = nullptr; 2975 if (NeedsMaskForGaps) { 2976 MaskForGaps = createBitMaskForGaps(State.Builder, 2977 State.VF.getKnownMinValue(), *Group); 2978 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2979 } 2980 2981 Instruction *NewLoad; 2982 if (BlockInMask || MaskForGaps) { 2983 Value *GroupMask = CreateGroupMask(MaskForGaps); 2984 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 2985 Group->getAlign(), GroupMask, 2986 PoisonVec, "wide.masked.vec"); 2987 } else 2988 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 2989 Group->getAlign(), "wide.vec"); 2990 Group->addMetadata(NewLoad); 2991 2992 ArrayRef<VPValue *> VPDefs = definedValues(); 2993 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2994 if (VecTy->isScalableTy()) { 2995 assert(InterleaveFactor == 2 && 2996 "Unsupported deinterleave factor for scalable vectors"); 2997 2998 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2999 // so must use intrinsics to deinterleave. 3000 Value *DI = State.Builder.CreateIntrinsic( 3001 Intrinsic::vector_deinterleave2, VecTy, NewLoad, 3002 /*FMFSource=*/nullptr, "strided.vec"); 3003 unsigned J = 0; 3004 for (unsigned I = 0; I < InterleaveFactor; ++I) { 3005 Instruction *Member = Group->getMember(I); 3006 3007 if (!Member) 3008 continue; 3009 3010 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 3011 // If this member has different type, cast the result type. 
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
          StridedVec =
              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec =
              State.Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec);
        ++J;
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
      Value *StridedVec =
          State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");

      // If this member has a different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
        StridedVec =
            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

      State.set(VPDefs[J], StridedVec);
      ++J;
    }
    return;
  }

  // The sub vector type for the current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(SubVT);
      StoredVecs.push_back(Undef);
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

    // If this member has a different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
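  // E.g. (illustrative) for a fixed VF of 4 and factor 3, interleaveVectors
  // emits the shuffle mask <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, matching
  // the store example in the comment above.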
  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        IVec, ResAddr, Group->getAlign(), GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << "  store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << "  ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                : getStoredValues()[InsertPosIdx]);
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(InsertPos);

  unsigned InterleaveFactor = IG->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (IG->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
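  // Note (assumption for illustration): the interleaved memory-op cost is
  // expected to cover the wide load/store together with the (de)interleaving
  // shuffles; reversed groups additionally pay one reverse shuffle per
  // member, added below.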
  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);

  if (!IG->isReverse())
    return Cost;

  return Cost + IG->getNumMembers() *
                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                           VectorTy, std::nullopt, Ctx.CostKind,
                                           0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
         "Unexpected type.");
  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
         "Recipe should have been replaced");

  unsigned CurrentPart = getUnrollPart(*this);

  // Build a pointer phi.
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  PHINode *NewPointerPhi = nullptr;
  if (CurrentPart == 0) {
    auto *IVR = cast<VPHeaderPHIRecipe>(&getParent()
                                             ->getPlan()
                                             ->getVectorLoopRegion()
                                             ->getEntryBasicBlock()
                                             ->front());
    PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
    NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
                                    CanonicalIV->getIterator());
    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
    NewPointerPhi->setDebugLoc(getDebugLoc());
  } else {
    // The recipe has been unrolled. In that case, fetch the single pointer phi
    // shared among all unrolled parts of the recipe.
    auto *GEP =
        cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
  }

  // A pointer induction, performed by using a gep.
  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
  Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
  Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  if (CurrentPart == 0) {
    // The recipe represents the first part of the pointer induction. Create
    // the GEP to increment the phi across all unrolled parts.
    unsigned UF = getParent()->getPlan()->getUF();
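    // Illustrative: the single phi is advanced once for all parts, so each
    // vector-loop iteration moves it by ScalarStep * VF * UF units of the
    // i8-typed GEP below (e.g. with fixed VF = 4 and UF = 2, 8 * ScalarStep).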
    Value *NumUnrolledElems =
        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));

    Value *InductionGEP = GetElementPtrInst::Create(
        State.Builder.getInt8Ty(), NewPointerPhi,
        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);

    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
  }

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
  Type *VecPhiType = VectorType::get(PhiType, State.VF);
  Value *StartOffsetScalar = State.Builder.CreateMul(
      RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
  Value *StartOffset =
      State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
  // Create a vector of consecutive numbers from zero to VF - 1.
  StartOffset = State.Builder.CreateAdd(
      StartOffset, State.Builder.CreateStepVector(VecPhiType));

  assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
         "scalar step must be the same across all parts");
  Value *GEP = State.Builder.CreateGEP(
      State.Builder.getInt8Ty(), NewPointerPhi,
      State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
                                               State.VF, ScalarStepValue)),
      "vector.gep");
  State.set(this, GEP);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 2 || getNumOperands() == 4) &&
         "unexpected number of operands");
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 4) {
    O << ", ";
    getOperand(2)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
  }
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "cannot be used in per-lane");
  if (State.ExpandedSCEVs.contains(Expr)) {
    // SCEV Expr has already been expanded, result must already be set. At the
    // moment we have to execute the entry block twice (once before skeleton
    // creation to get expanded SCEVs used by the skeleton and once during
    // regular VPlan execution).
    State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
    assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
           "Results must match");
    return;
  }

  const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  State.ExpandedSCEVs[Expr] = Res;
  State.set(this, Res, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(VF, VStep);
    VStep =
        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
  State.set(this, CanonicalVectorIV);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
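  // Illustrative: for a fixed VF of 4 the incoming value from the preheader
  // is <poison, poison, poison, %init>; only the last lane matters, since the
  // first-order recurrence splice in the loop reads that lane as the value
  // carried in from the previous iteration.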
  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(VectorInit, VectorPH);
  State.set(this, Phi);
}

InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  if (VF.isScalable() && VF.getKnownMinValue() == 1)
    return InstructionCost::getInvalid();

  SmallVector<int> Mask(VF.getKnownMinValue());
  std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
  Type *VectorTy =
      toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);

  return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                cast<VectorType>(VectorTy), Mask, Ctx.CostKind,
                                VF.getKnownMinValue() - 1);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // If this phi is fed by a scaled reduction then it should output a
  // vector with fewer elements than the VF.
  ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);

  // Reductions do not have to start at zero. They can start with
  // any loop-invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, IsInLoop);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  unsigned CurrentPart = getUnrollPart(*this);

  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = State.get(StartVPV);
    }
  } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
    // [I|F]FindLastIV will use a sentinel value to initialize the reduction
    // phi or the resume value from the main vector loop when vectorizing the
    // epilogue loop.
    // In the exit block, ComputeReductionResult will generate checks to
    // verify if the reduction result is the sentinel value. If the result is
    // the sentinel value, it will be corrected back to the start value.
    // TODO: The sentinel value is not always necessary. When the start value
    // is a constant, and smaller than the start value of the induction
    // variable, the start value can be directly used to initialize the
    // reduction phi.
    Iden = StartV;
    if (!ScalarPHI) {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      if (CurrentPart == 0) {
        // Create start and identity vector values for the reduction in the
        // preheader.
        // TODO: Introduce recipes in VPlan preheader to create initial values.
        Iden = Builder.CreateVectorSplat(VF, Iden);
        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
        Builder.SetInsertPoint(VectorPH->getTerminator());
        Constant *Zero = Builder.getInt32(0);
        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
      } else {
        Iden = Builder.CreateVectorSplat(VF, Iden);
      }
    }
  }

  Phi = cast<PHINode>(State.get(this, IsInLoop));
  Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
  Phi->addIncoming(StartVal, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
  if (VFScaleFactor != 1)
    O << " (VF scaled by 1/" << VFScaleFactor << ")";
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  State.setDebugLocFrom(getDebugLoc());
  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
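// Illustrative note (assumption): the start mask operand is normally produced
// in the preheader, e.g. by an llvm.get.active.lane.mask call, and the phi's
// backedge value is only filled in after VPlan execution, once the latch
// block exists.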
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPScalarPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *Start = State.get(getStartValue(), VPLane(0));
  PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
  Phi->addIncoming(Start, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif