1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "LoopVectorizationPlanner.h" 15 #include "VPlan.h" 16 #include "VPlanAnalysis.h" 17 #include "VPlanPatternMatch.h" 18 #include "VPlanUtils.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Twine.h" 22 #include "llvm/Analysis/IVDescriptors.h" 23 #include "llvm/IR/BasicBlock.h" 24 #include "llvm/IR/IRBuilder.h" 25 #include "llvm/IR/Instruction.h" 26 #include "llvm/IR/Instructions.h" 27 #include "llvm/IR/Intrinsics.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/IR/Value.h" 30 #include "llvm/IR/VectorBuilder.h" 31 #include "llvm/Support/Casting.h" 32 #include "llvm/Support/CommandLine.h" 33 #include "llvm/Support/Debug.h" 34 #include "llvm/Support/raw_ostream.h" 35 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 36 #include "llvm/Transforms/Utils/LoopUtils.h" 37 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 38 #include <cassert> 39 40 using namespace llvm; 41 42 using VectorParts = SmallVector<Value *, 2>; 43 44 namespace llvm { 45 extern cl::opt<bool> EnableVPlanNativePath; 46 } 47 extern cl::opt<unsigned> ForceTargetInstructionCost; 48 49 #define LV_NAME "loop-vectorize" 50 #define DEBUG_TYPE LV_NAME 51 52 bool VPRecipeBase::mayWriteToMemory() const { 53 switch (getVPDefID()) { 54 case VPInstructionSC: 55 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); 56 case VPInterleaveSC: 57 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 58 case VPWidenStoreEVLSC: 59 case VPWidenStoreSC: 60 return true; 61 case VPReplicateSC: 62 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 63 ->mayWriteToMemory(); 64 case VPWidenCallSC: 65 return !cast<VPWidenCallRecipe>(this) 66 ->getCalledScalarFunction() 67 ->onlyReadsMemory(); 68 case VPWidenIntrinsicSC: 69 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory(); 70 case VPBranchOnMaskSC: 71 case VPScalarIVStepsSC: 72 case VPPredInstPHISC: 73 return false; 74 case VPBlendSC: 75 case VPReductionEVLSC: 76 case VPReductionSC: 77 case VPVectorPointerSC: 78 case VPWidenCanonicalIVSC: 79 case VPWidenCastSC: 80 case VPWidenGEPSC: 81 case VPWidenIntOrFpInductionSC: 82 case VPWidenLoadEVLSC: 83 case VPWidenLoadSC: 84 case VPWidenPHISC: 85 case VPWidenSC: 86 case VPWidenEVLSC: 87 case VPWidenSelectSC: { 88 const Instruction *I = 89 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 90 (void)I; 91 assert((!I || !I->mayWriteToMemory()) && 92 "underlying instruction may write to memory"); 93 return false; 94 } 95 default: 96 return true; 97 } 98 } 99 100 bool VPRecipeBase::mayReadFromMemory() const { 101 switch (getVPDefID()) { 102 case VPInstructionSC: 103 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); 104 case VPWidenLoadEVLSC: 105 case VPWidenLoadSC: 106 return true; 107 case VPReplicateSC: 108 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 109 ->mayReadFromMemory(); 110 case VPWidenCallSC: 111 return 
!cast<VPWidenCallRecipe>(this)
               ->getCalledScalarFunction()
               ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
  case VPReverseVectorPointerSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
assert(getParent() && "Recipe not in any VPBasicBlock"); 226 getParent()->getRecipeList().remove(getIterator()); 227 Parent = nullptr; 228 } 229 230 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 231 assert(getParent() && "Recipe not in any VPBasicBlock"); 232 return getParent()->getRecipeList().erase(getIterator()); 233 } 234 235 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 236 removeFromParent(); 237 insertAfter(InsertPos); 238 } 239 240 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 241 iplist<VPRecipeBase>::iterator I) { 242 removeFromParent(); 243 insertBefore(BB, I); 244 } 245 246 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 247 // Get the underlying instruction for the recipe, if there is one. It is used 248 // to 249 // * decide if cost computation should be skipped for this recipe, 250 // * apply forced target instruction cost. 251 Instruction *UI = nullptr; 252 if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) 253 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 254 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) 255 UI = IG->getInsertPos(); 256 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) 257 UI = &WidenMem->getIngredient(); 258 259 InstructionCost RecipeCost; 260 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { 261 RecipeCost = 0; 262 } else { 263 RecipeCost = computeCost(VF, Ctx); 264 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && 265 RecipeCost.isValid()) 266 RecipeCost = InstructionCost(ForceTargetInstructionCost); 267 } 268 269 LLVM_DEBUG({ 270 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 271 dump(); 272 }); 273 return RecipeCost; 274 } 275 276 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 277 VPCostContext &Ctx) const { 278 llvm_unreachable("subclasses should implement computeCost"); 279 } 280 281 InstructionCost 282 VPPartialReductionRecipe::computeCost(ElementCount VF, 283 VPCostContext &Ctx) const { 284 std::optional<unsigned> Opcode = std::nullopt; 285 VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); 286 if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR)) 287 Opcode = std::make_optional(WidenR->getOpcode()); 288 289 VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); 290 VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); 291 292 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); 293 auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) 294 : BinOpR->getOperand(0)); 295 auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) 296 : BinOpR->getOperand(1)); 297 298 auto GetExtendKind = [](VPRecipeBase *R) { 299 // The extend could come from outside the plan. 
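    // In that case the operand has no defining recipe, so conservatively
    // report that no extension is folded into the partial reduction.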
300 if (!R) 301 return TargetTransformInfo::PR_None; 302 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R); 303 if (!WidenCastR) 304 return TargetTransformInfo::PR_None; 305 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) 306 return TargetTransformInfo::PR_ZeroExtend; 307 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) 308 return TargetTransformInfo::PR_SignExtend; 309 return TargetTransformInfo::PR_None; 310 }; 311 312 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, 313 PhiType, VF, GetExtendKind(ExtAR), 314 GetExtendKind(ExtBR), Opcode); 315 } 316 317 void VPPartialReductionRecipe::execute(VPTransformState &State) { 318 State.setDebugLocFrom(getDebugLoc()); 319 auto &Builder = State.Builder; 320 321 assert(getOpcode() == Instruction::Add && 322 "Unhandled partial reduction opcode"); 323 324 Value *BinOpVal = State.get(getOperand(0)); 325 Value *PhiVal = State.get(getOperand(1)); 326 assert(PhiVal && BinOpVal && "Phi and Mul must be set"); 327 328 Type *RetTy = PhiVal->getType(); 329 330 CallInst *V = Builder.CreateIntrinsic( 331 RetTy, Intrinsic::experimental_vector_partial_reduce_add, 332 {PhiVal, BinOpVal}, nullptr, "partial.reduce"); 333 334 State.set(this, V); 335 } 336 337 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 338 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, 339 VPSlotTracker &SlotTracker) const { 340 O << Indent << "PARTIAL-REDUCE "; 341 printAsOperand(O, SlotTracker); 342 O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; 343 printOperands(O, SlotTracker); 344 } 345 #endif 346 347 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 348 assert(OpType == OperationType::FPMathOp && 349 "recipe doesn't have fast math flags"); 350 FastMathFlags Res; 351 Res.setAllowReassoc(FMFs.AllowReassoc); 352 Res.setNoNaNs(FMFs.NoNaNs); 353 Res.setNoInfs(FMFs.NoInfs); 354 Res.setNoSignedZeros(FMFs.NoSignedZeros); 355 Res.setAllowReciprocal(FMFs.AllowReciprocal); 356 Res.setAllowContract(FMFs.AllowContract); 357 Res.setApproxFunc(FMFs.ApproxFunc); 358 return Res; 359 } 360 361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 362 void VPSingleDefRecipe::dump() const { VPDef::dump(); } 363 #endif 364 365 template <unsigned PartOpIdx> 366 VPValue * 367 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { 368 if (U.getNumOperands() == PartOpIdx + 1) 369 return U.getOperand(PartOpIdx); 370 return nullptr; 371 } 372 373 template <unsigned PartOpIdx> 374 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { 375 if (auto *UnrollPartOp = getUnrollPartOperand(U)) 376 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); 377 return 0; 378 } 379 380 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 381 VPValue *A, VPValue *B, DebugLoc DL, 382 const Twine &Name) 383 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 384 Pred, DL), 385 Opcode(Opcode), Name(Name.str()) { 386 assert(Opcode == Instruction::ICmp && 387 "only ICmp predicates supported at the moment"); 388 } 389 390 VPInstruction::VPInstruction(unsigned Opcode, 391 std::initializer_list<VPValue *> Operands, 392 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 393 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 394 Opcode(Opcode), Name(Name.str()) { 395 // Make sure the VPInstruction is a floating-point operation. 
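  // Only opcodes accepted by isFPMathOp() below (FAdd, FSub, FMul, FDiv, FRem,
  // FNeg, FCmp and Select) may carry fast-math flags.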
396 assert(isFPMathOp() && "this op can't take fast-math flags"); 397 } 398 399 bool VPInstruction::doesGeneratePerAllLanes() const { 400 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 401 } 402 403 bool VPInstruction::canGenerateScalarForFirstLane() const { 404 if (Instruction::isBinaryOp(getOpcode())) 405 return true; 406 if (isSingleScalar() || isVectorToScalar()) 407 return true; 408 switch (Opcode) { 409 case Instruction::ICmp: 410 case Instruction::Select: 411 case VPInstruction::BranchOnCond: 412 case VPInstruction::BranchOnCount: 413 case VPInstruction::CalculateTripCountMinusVF: 414 case VPInstruction::CanonicalIVIncrementForPart: 415 case VPInstruction::PtrAdd: 416 case VPInstruction::ExplicitVectorLength: 417 case VPInstruction::AnyOf: 418 return true; 419 default: 420 return false; 421 } 422 } 423 424 Value *VPInstruction::generatePerLane(VPTransformState &State, 425 const VPLane &Lane) { 426 IRBuilderBase &Builder = State.Builder; 427 428 assert(getOpcode() == VPInstruction::PtrAdd && 429 "only PtrAdd opcodes are supported for now"); 430 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 431 State.get(getOperand(1), Lane), Name); 432 } 433 434 Value *VPInstruction::generate(VPTransformState &State) { 435 IRBuilderBase &Builder = State.Builder; 436 437 if (Instruction::isBinaryOp(getOpcode())) { 438 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 439 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 440 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 441 auto *Res = 442 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 443 if (auto *I = dyn_cast<Instruction>(Res)) 444 setFlags(I); 445 return Res; 446 } 447 448 switch (getOpcode()) { 449 case VPInstruction::Not: { 450 Value *A = State.get(getOperand(0)); 451 return Builder.CreateNot(A, Name); 452 } 453 case Instruction::ICmp: { 454 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 455 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 456 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 457 return Builder.CreateCmp(getPredicate(), A, B, Name); 458 } 459 case Instruction::Select: { 460 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 461 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); 462 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); 463 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); 464 return Builder.CreateSelect(Cond, Op1, Op2, Name); 465 } 466 case VPInstruction::ActiveLaneMask: { 467 // Get first lane of vector induction variable. 468 Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); 469 // Get the original loop tripcount. 470 Value *ScalarTC = State.get(getOperand(1), VPLane(0)); 471 472 // If this part of the active lane mask is scalar, generate the CMP directly 473 // to avoid unnecessary extracts. 474 if (State.VF.isScalar()) 475 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 476 Name); 477 478 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 479 auto *PredTy = VectorType::get(Int1Ty, State.VF); 480 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 481 {PredTy, ScalarTC->getType()}, 482 {VIVElem0, ScalarTC}, nullptr, Name); 483 } 484 case VPInstruction::FirstOrderRecurrenceSplice: { 485 // Generate code to combine the previous and current values in vector v3. 
486 // 487 // vector.ph: 488 // v_init = vector(..., ..., ..., a[-1]) 489 // br vector.body 490 // 491 // vector.body 492 // i = phi [0, vector.ph], [i+4, vector.body] 493 // v1 = phi [v_init, vector.ph], [v2, vector.body] 494 // v2 = a[i, i+1, i+2, i+3]; 495 // v3 = vector(v1(3), v2(0, 1, 2)) 496 497 auto *V1 = State.get(getOperand(0)); 498 if (!V1->getType()->isVectorTy()) 499 return V1; 500 Value *V2 = State.get(getOperand(1)); 501 return Builder.CreateVectorSplice(V1, V2, -1, Name); 502 } 503 case VPInstruction::CalculateTripCountMinusVF: { 504 unsigned UF = getParent()->getPlan()->getUF(); 505 Value *ScalarTC = State.get(getOperand(0), VPLane(0)); 506 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); 507 Value *Sub = Builder.CreateSub(ScalarTC, Step); 508 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 509 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 510 return Builder.CreateSelect(Cmp, Sub, Zero); 511 } 512 case VPInstruction::ExplicitVectorLength: { 513 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 514 // be outside of the main loop. 515 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); 516 // Compute EVL 517 assert(AVL->getType()->isIntegerTy() && 518 "Requested vector length should be an integer."); 519 520 assert(State.VF.isScalable() && "Expected scalable vector factor."); 521 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 522 523 Value *EVL = State.Builder.CreateIntrinsic( 524 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 525 {AVL, VFArg, State.Builder.getTrue()}); 526 return EVL; 527 } 528 case VPInstruction::CanonicalIVIncrementForPart: { 529 unsigned Part = getUnrollPart(*this); 530 auto *IV = State.get(getOperand(0), VPLane(0)); 531 assert(Part != 0 && "Must have a positive part"); 532 // The canonical IV is incremented by the vectorization factor (num of 533 // SIMD elements) times the unroll part. 534 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 535 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 536 hasNoSignedWrap()); 537 } 538 case VPInstruction::BranchOnCond: { 539 Value *Cond = State.get(getOperand(0), VPLane(0)); 540 // Replace the temporary unreachable terminator with a new conditional 541 // branch, hooking it up to backward destination for exiting blocks now and 542 // to forward destination(s) later when they are created. 543 BranchInst *CondBr = 544 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 545 CondBr->setSuccessor(0, nullptr); 546 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 547 548 if (!getParent()->isExiting()) 549 return CondBr; 550 551 VPRegionBlock *ParentRegion = getParent()->getParent(); 552 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 553 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 554 return CondBr; 555 } 556 case VPInstruction::BranchOnCount: { 557 // First create the compare. 558 Value *IV = State.get(getOperand(0), /*IsScalar*/ true); 559 Value *TC = State.get(getOperand(1), /*IsScalar*/ true); 560 Value *Cond = Builder.CreateICmpEQ(IV, TC); 561 562 // Now create the branch. 
563 auto *Plan = getParent()->getPlan(); 564 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 565 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 566 567 // Replace the temporary unreachable terminator with a new conditional 568 // branch, hooking it up to backward destination (the header) now and to the 569 // forward destination (the exit/middle block) later when it is created. 570 // Note that CreateCondBr expects a valid BB as first argument, so we need 571 // to set it to nullptr later. 572 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 573 State.CFG.VPBB2IRBB[Header]); 574 CondBr->setSuccessor(0, nullptr); 575 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 576 return CondBr; 577 } 578 case VPInstruction::ComputeReductionResult: { 579 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 580 // and will be removed by breaking up the recipe further. 581 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 582 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 583 // Get its reduction variable descriptor. 584 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 585 586 RecurKind RK = RdxDesc.getRecurrenceKind(); 587 588 Type *PhiTy = OrigPhi->getType(); 589 // The recipe's operands are the reduction phi, followed by one operand for 590 // each part of the reduction. 591 unsigned UF = getNumOperands() - 1; 592 VectorParts RdxParts(UF); 593 for (unsigned Part = 0; Part < UF; ++Part) 594 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); 595 596 // If the vector reduction can be performed in a smaller type, we truncate 597 // then extend the loop exit value to enable InstCombine to evaluate the 598 // entire expression in the smaller type. 599 // TODO: Handle this in truncateToMinBW. 600 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 601 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 602 for (unsigned Part = 0; Part < UF; ++Part) 603 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 604 } 605 // Reduce all of the unrolled parts into a single vector. 606 Value *ReducedPartRdx = RdxParts[0]; 607 unsigned Op = RdxDesc.getOpcode(); 608 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 609 Op = Instruction::Or; 610 611 if (PhiR->isOrdered()) { 612 ReducedPartRdx = RdxParts[UF - 1]; 613 } else { 614 // Floating-point operations should have some FMF to enable the reduction. 615 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 616 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 617 for (unsigned Part = 1; Part < UF; ++Part) { 618 Value *RdxPart = RdxParts[Part]; 619 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 620 ReducedPartRdx = Builder.CreateBinOp( 621 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 622 else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) 623 ReducedPartRdx = 624 createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart); 625 else 626 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 627 } 628 } 629 630 // Create the reduction after the loop. Note that inloop reductions create 631 // the target reduction in the loop using a Reduction recipe. 
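    // Note that AnyOf and FindLastIV reductions take this path even for scalar
    // VFs, since they still need a final computation (e.g. a select against the
    // start value) to produce the reduction result.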
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
         RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::ResumePhi: {
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      if (OtherPred == VPlanPred)
        continue;
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }
  case VPInstruction::AnyOf: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateOrReduce(A);
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not an FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::AnyOf:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractFromEnd:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
    return false;
  default:
    return true;
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
795 return vputils::onlyFirstLaneUsed(this); 796 case VPInstruction::ActiveLaneMask: 797 case VPInstruction::ExplicitVectorLength: 798 case VPInstruction::CalculateTripCountMinusVF: 799 case VPInstruction::CanonicalIVIncrementForPart: 800 case VPInstruction::BranchOnCount: 801 case VPInstruction::BranchOnCond: 802 case VPInstruction::ResumePhi: 803 return true; 804 }; 805 llvm_unreachable("switch should return"); 806 } 807 808 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 809 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 810 if (Instruction::isBinaryOp(getOpcode())) 811 return vputils::onlyFirstPartUsed(this); 812 813 switch (getOpcode()) { 814 default: 815 return false; 816 case Instruction::ICmp: 817 case Instruction::Select: 818 return vputils::onlyFirstPartUsed(this); 819 case VPInstruction::BranchOnCount: 820 case VPInstruction::BranchOnCond: 821 case VPInstruction::CanonicalIVIncrementForPart: 822 return true; 823 }; 824 llvm_unreachable("switch should return"); 825 } 826 827 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 828 void VPInstruction::dump() const { 829 VPSlotTracker SlotTracker(getParent()->getPlan()); 830 print(dbgs(), "", SlotTracker); 831 } 832 833 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 834 VPSlotTracker &SlotTracker) const { 835 O << Indent << "EMIT "; 836 837 if (hasResult()) { 838 printAsOperand(O, SlotTracker); 839 O << " = "; 840 } 841 842 switch (getOpcode()) { 843 case VPInstruction::Not: 844 O << "not"; 845 break; 846 case VPInstruction::SLPLoad: 847 O << "combined load"; 848 break; 849 case VPInstruction::SLPStore: 850 O << "combined store"; 851 break; 852 case VPInstruction::ActiveLaneMask: 853 O << "active lane mask"; 854 break; 855 case VPInstruction::ResumePhi: 856 O << "resume-phi"; 857 break; 858 case VPInstruction::ExplicitVectorLength: 859 O << "EXPLICIT-VECTOR-LENGTH"; 860 break; 861 case VPInstruction::FirstOrderRecurrenceSplice: 862 O << "first-order splice"; 863 break; 864 case VPInstruction::BranchOnCond: 865 O << "branch-on-cond"; 866 break; 867 case VPInstruction::CalculateTripCountMinusVF: 868 O << "TC > VF ? TC - VF : 0"; 869 break; 870 case VPInstruction::CanonicalIVIncrementForPart: 871 O << "VF * Part +"; 872 break; 873 case VPInstruction::BranchOnCount: 874 O << "branch-on-count"; 875 break; 876 case VPInstruction::ExtractFromEnd: 877 O << "extract-from-end"; 878 break; 879 case VPInstruction::ComputeReductionResult: 880 O << "compute-reduction-result"; 881 break; 882 case VPInstruction::LogicalAnd: 883 O << "logical-and"; 884 break; 885 case VPInstruction::PtrAdd: 886 O << "ptradd"; 887 break; 888 case VPInstruction::AnyOf: 889 O << "any-of"; 890 break; 891 default: 892 O << Instruction::getOpcodeName(getOpcode()); 893 } 894 895 printFlags(O); 896 printOperands(O, SlotTracker); 897 898 if (auto DL = getDebugLoc()) { 899 O << ", !dbg "; 900 DL.print(O); 901 } 902 } 903 #endif 904 905 void VPIRInstruction::execute(VPTransformState &State) { 906 assert((isa<PHINode>(&I) || getNumOperands() == 0) && 907 "Only PHINodes can have extra operands"); 908 for (const auto &[Idx, Op] : enumerate(operands())) { 909 VPValue *ExitValue = Op; 910 auto Lane = vputils::isUniformAfterVectorization(ExitValue) 911 ? 
VPLane::getFirstLane() 912 : VPLane::getLastLaneForVF(State.VF); 913 VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; 914 auto *PredVPBB = Pred->getExitingBasicBlock(); 915 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; 916 // Set insertion point in PredBB in case an extract needs to be generated. 917 // TODO: Model extracts explicitly. 918 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); 919 Value *V = State.get(ExitValue, VPLane(Lane)); 920 auto *Phi = cast<PHINode>(&I); 921 // If there is no existing block for PredBB in the phi, add a new incoming 922 // value. Otherwise update the existing incoming value for PredBB. 923 if (Phi->getBasicBlockIndex(PredBB) == -1) 924 Phi->addIncoming(V, PredBB); 925 else 926 Phi->setIncomingValueForBlock(PredBB, V); 927 } 928 929 // Advance the insert point after the wrapped IR instruction. This allows 930 // interleaving VPIRInstructions and other recipes. 931 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); 932 } 933 934 InstructionCost VPIRInstruction::computeCost(ElementCount VF, 935 VPCostContext &Ctx) const { 936 // The recipe wraps an existing IR instruction on the border of VPlan's scope, 937 // hence it does not contribute to the cost-modeling for the VPlan. 938 return 0; 939 } 940 941 void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) { 942 assert(isa<PHINode>(getInstruction()) && 943 "can only add exiting operands to phi nodes"); 944 assert(getNumOperands() == 1 && "must have a single operand"); 945 VPValue *Exiting = getOperand(0); 946 if (!Exiting->isLiveIn()) { 947 LLVMContext &Ctx = getInstruction().getContext(); 948 auto &Plan = *getParent()->getPlan(); 949 Exiting = Builder.createNaryOp( 950 VPInstruction::ExtractFromEnd, 951 {Exiting, 952 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::get(Ctx, 32), 1))}); 953 } 954 setOperand(0, Exiting); 955 } 956 957 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 958 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, 959 VPSlotTracker &SlotTracker) const { 960 O << Indent << "IR " << I; 961 962 if (getNumOperands() != 0) { 963 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; 964 interleaveComma( 965 enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { 966 Op.value()->printAsOperand(O, SlotTracker); 967 O << " from "; 968 getParent()->getPredecessors()[Op.index()]->printAsOperand(O); 969 }); 970 O << ")"; 971 } 972 } 973 #endif 974 975 void VPWidenCallRecipe::execute(VPTransformState &State) { 976 assert(State.VF.isVector() && "not widening"); 977 State.setDebugLocFrom(getDebugLoc()); 978 979 FunctionType *VFTy = Variant->getFunctionType(); 980 // Add return type if intrinsic is overloaded on it. 981 SmallVector<Value *, 4> Args; 982 for (const auto &I : enumerate(arg_operands())) { 983 Value *Arg; 984 // Some vectorized function variants may also take a scalar argument, 985 // e.g. linear parameters for pointers. This needs to be the scalar value 986 // from the start of the respective part when interleaving. 
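    // For example, a vector variant created for "#pragma omp declare simd
    // linear(p)" keeps the pointer parameter scalar in its signature, so only
    // lane 0 of the corresponding operand is passed.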
987 if (!VFTy->getParamType(I.index())->isVectorTy()) 988 Arg = State.get(I.value(), VPLane(0)); 989 else 990 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 991 Args.push_back(Arg); 992 } 993 994 assert(Variant != nullptr && "Can't create vector function."); 995 996 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 997 SmallVector<OperandBundleDef, 1> OpBundles; 998 if (CI) 999 CI->getOperandBundlesAsDefs(OpBundles); 1000 1001 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); 1002 setFlags(V); 1003 1004 if (!V->getType()->isVoidTy()) 1005 State.set(this, V); 1006 State.addMetadata(V, CI); 1007 } 1008 1009 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, 1010 VPCostContext &Ctx) const { 1011 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), 1012 Variant->getFunctionType()->params(), 1013 Ctx.CostKind); 1014 } 1015 1016 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1017 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 1018 VPSlotTracker &SlotTracker) const { 1019 O << Indent << "WIDEN-CALL "; 1020 1021 Function *CalledFn = getCalledScalarFunction(); 1022 if (CalledFn->getReturnType()->isVoidTy()) 1023 O << "void "; 1024 else { 1025 printAsOperand(O, SlotTracker); 1026 O << " = "; 1027 } 1028 1029 O << "call"; 1030 printFlags(O); 1031 O << " @" << CalledFn->getName() << "("; 1032 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 1033 Op->printAsOperand(O, SlotTracker); 1034 }); 1035 O << ")"; 1036 1037 O << " (using library function"; 1038 if (Variant->hasName()) 1039 O << ": " << Variant->getName(); 1040 O << ")"; 1041 } 1042 #endif 1043 1044 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { 1045 assert(State.VF.isVector() && "not widening"); 1046 State.setDebugLocFrom(getDebugLoc()); 1047 1048 SmallVector<Type *, 2> TysForDecl; 1049 // Add return type if intrinsic is overloaded on it. 1050 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) 1051 TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); 1052 SmallVector<Value *, 4> Args; 1053 for (const auto &I : enumerate(operands())) { 1054 // Some intrinsics have a scalar argument - don't replace it with a 1055 // vector. 1056 Value *Arg; 1057 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), 1058 State.TTI)) 1059 Arg = State.get(I.value(), VPLane(0)); 1060 else 1061 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 1062 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), 1063 State.TTI)) 1064 TysForDecl.push_back(Arg->getType()); 1065 Args.push_back(Arg); 1066 } 1067 1068 // Use vector version of the intrinsic. 1069 Module *M = State.Builder.GetInsertBlock()->getModule(); 1070 Function *VectorF = 1071 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); 1072 assert(VectorF && 1073 "Can't retrieve vector intrinsic or vector-predication intrinsics."); 1074 1075 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 1076 SmallVector<OperandBundleDef, 1> OpBundles; 1077 if (CI) 1078 CI->getOperandBundlesAsDefs(OpBundles); 1079 1080 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 1081 1082 setFlags(V); 1083 1084 if (!V->getType()->isVoidTy()) 1085 State.set(this, V); 1086 State.addMetadata(V, CI); 1087 } 1088 1089 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, 1090 VPCostContext &Ctx) const { 1091 // Some backends analyze intrinsic arguments to determine cost. 
Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      // Push all of the VP intrinsic's operands into Arguments, even if the
      // value is nullptr. Some VP intrinsics' cost computation asserts on the
      // number of parameters. This mainly happens in two scenarios:
      // 1. The EVL operand is nullptr.
      // 2. An argument of the VP intrinsic is itself defined by a VP intrinsic.
      if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
        Arguments.push_back(V);
        continue;
      }
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}

StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(VectorIntrinsicID);
}

bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  // Vector predication intrinsics only demand the first lane of the last
  // operand (the EVL operand).
  return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
         Op == getOperand(getNumOperands() - 1);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
#endif

void VPHistogramRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(getOperand(0));
  Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
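  // The call emitted below has roughly the following form (illustrative):
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)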
1178 Value *Mask = nullptr; 1179 if (VPValue *VPMask = getMask()) 1180 Mask = State.get(VPMask); 1181 else 1182 Mask = 1183 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); 1184 1185 // If this is a subtract, we want to invert the increment amount. We may 1186 // add a separate intrinsic in future, but for now we'll try this. 1187 if (Opcode == Instruction::Sub) 1188 IncAmt = Builder.CreateNeg(IncAmt); 1189 else 1190 assert(Opcode == Instruction::Add && "only add or sub supported for now"); 1191 1192 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, 1193 {VTy, IncAmt->getType()}, 1194 {Address, IncAmt, Mask}); 1195 } 1196 1197 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, 1198 VPCostContext &Ctx) const { 1199 // FIXME: Take the gather and scatter into account as well. For now we're 1200 // generating the same cost as the fallback path, but we'll likely 1201 // need to create a new TTI method for determining the cost, including 1202 // whether we can use base + vec-of-smaller-indices or just 1203 // vec-of-pointers. 1204 assert(VF.isVector() && "Invalid VF for histogram cost"); 1205 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); 1206 VPValue *IncAmt = getOperand(1); 1207 Type *IncTy = Ctx.Types.inferScalarType(IncAmt); 1208 VectorType *VTy = VectorType::get(IncTy, VF); 1209 1210 // Assume that a non-constant update value (or a constant != 1) requires 1211 // a multiply, and add that into the cost. 1212 InstructionCost MulCost = 1213 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); 1214 if (IncAmt->isLiveIn()) { 1215 ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue()); 1216 1217 if (CI && CI->getZExtValue() == 1) 1218 MulCost = TTI::TCC_Free; 1219 } 1220 1221 // Find the cost of the histogram operation itself. 1222 Type *PtrTy = VectorType::get(AddressTy, VF); 1223 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); 1224 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 1225 Type::getVoidTy(Ctx.LLVMCtx), 1226 {PtrTy, IncTy, MaskTy}); 1227 1228 // Add the costs together with the add/sub operation. 1229 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + 1230 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); 1231 } 1232 1233 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1234 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, 1235 VPSlotTracker &SlotTracker) const { 1236 O << Indent << "WIDEN-HISTOGRAM buckets: "; 1237 getOperand(0)->printAsOperand(O, SlotTracker); 1238 1239 if (Opcode == Instruction::Sub) 1240 O << ", dec: "; 1241 else { 1242 assert(Opcode == Instruction::Add); 1243 O << ", inc: "; 1244 } 1245 getOperand(1)->printAsOperand(O, SlotTracker); 1246 1247 if (VPValue *Mask = getMask()) { 1248 O << ", mask: "; 1249 Mask->printAsOperand(O, SlotTracker); 1250 } 1251 } 1252 1253 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 1254 VPSlotTracker &SlotTracker) const { 1255 O << Indent << "WIDEN-SELECT "; 1256 printAsOperand(O, SlotTracker); 1257 O << " = select "; 1258 printFlags(O); 1259 getOperand(0)->printAsOperand(O, SlotTracker); 1260 O << ", "; 1261 getOperand(1)->printAsOperand(O, SlotTracker); 1262 O << ", "; 1263 getOperand(2)->printAsOperand(O, SlotTracker); 1264 O << (isInvariantCond() ? 
" (condition is loop invariant)" : ""); 1265 } 1266 #endif 1267 1268 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1269 State.setDebugLocFrom(getDebugLoc()); 1270 1271 // The condition can be loop invariant but still defined inside the 1272 // loop. This means that we can't just use the original 'cond' value. 1273 // We have to take the 'vectorized' value and pick the first lane. 1274 // Instcombine will make this a no-op. 1275 auto *InvarCond = 1276 isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; 1277 1278 Value *Cond = InvarCond ? InvarCond : State.get(getCond()); 1279 Value *Op0 = State.get(getOperand(1)); 1280 Value *Op1 = State.get(getOperand(2)); 1281 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1282 State.set(this, Sel); 1283 if (isa<FPMathOperator>(Sel)) 1284 setFlags(cast<Instruction>(Sel)); 1285 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1286 } 1287 1288 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, 1289 VPCostContext &Ctx) const { 1290 SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); 1291 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); 1292 Type *ScalarTy = Ctx.Types.inferScalarType(this); 1293 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1294 1295 VPValue *Op0, *Op1; 1296 using namespace llvm::VPlanPatternMatch; 1297 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && 1298 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || 1299 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { 1300 // select x, y, false --> x & y 1301 // select x, true, y --> x | y 1302 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); 1303 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); 1304 1305 SmallVector<const Value *, 2> Operands; 1306 if (all_of(operands(), 1307 [](VPValue *Op) { return Op->getUnderlyingValue(); })) 1308 Operands.append(SI->op_begin(), SI->op_end()); 1309 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); 1310 return Ctx.TTI.getArithmeticInstrCost( 1311 IsLogicalOr ? 
Instruction::Or : Instruction::And, VectorTy, 1312 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); 1313 } 1314 1315 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); 1316 if (!ScalarCond) 1317 CondTy = VectorType::get(CondTy, VF); 1318 1319 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 1320 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 1321 Pred = Cmp->getPredicate(); 1322 return Ctx.TTI.getCmpSelInstrCost( 1323 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, 1324 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); 1325 } 1326 1327 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 1328 const FastMathFlags &FMF) { 1329 AllowReassoc = FMF.allowReassoc(); 1330 NoNaNs = FMF.noNaNs(); 1331 NoInfs = FMF.noInfs(); 1332 NoSignedZeros = FMF.noSignedZeros(); 1333 AllowReciprocal = FMF.allowReciprocal(); 1334 AllowContract = FMF.allowContract(); 1335 ApproxFunc = FMF.approxFunc(); 1336 } 1337 1338 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1339 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 1340 switch (OpType) { 1341 case OperationType::Cmp: 1342 O << " " << CmpInst::getPredicateName(getPredicate()); 1343 break; 1344 case OperationType::DisjointOp: 1345 if (DisjointFlags.IsDisjoint) 1346 O << " disjoint"; 1347 break; 1348 case OperationType::PossiblyExactOp: 1349 if (ExactFlags.IsExact) 1350 O << " exact"; 1351 break; 1352 case OperationType::OverflowingBinOp: 1353 if (WrapFlags.HasNUW) 1354 O << " nuw"; 1355 if (WrapFlags.HasNSW) 1356 O << " nsw"; 1357 break; 1358 case OperationType::FPMathOp: 1359 getFastMathFlags().print(O); 1360 break; 1361 case OperationType::GEPOp: 1362 if (GEPFlags.isInBounds()) 1363 O << " inbounds"; 1364 else if (GEPFlags.hasNoUnsignedSignedWrap()) 1365 O << " nusw"; 1366 if (GEPFlags.hasNoUnsignedWrap()) 1367 O << " nuw"; 1368 break; 1369 case OperationType::NonNegOp: 1370 if (NonNegFlags.NonNeg) 1371 O << " nneg"; 1372 break; 1373 case OperationType::Other: 1374 break; 1375 } 1376 if (getNumOperands() > 0) 1377 O << " "; 1378 } 1379 #endif 1380 1381 void VPWidenRecipe::execute(VPTransformState &State) { 1382 State.setDebugLocFrom(getDebugLoc()); 1383 auto &Builder = State.Builder; 1384 switch (Opcode) { 1385 case Instruction::Call: 1386 case Instruction::Br: 1387 case Instruction::PHI: 1388 case Instruction::GetElementPtr: 1389 case Instruction::Select: 1390 llvm_unreachable("This instruction is handled by a different recipe."); 1391 case Instruction::UDiv: 1392 case Instruction::SDiv: 1393 case Instruction::SRem: 1394 case Instruction::URem: 1395 case Instruction::Add: 1396 case Instruction::FAdd: 1397 case Instruction::Sub: 1398 case Instruction::FSub: 1399 case Instruction::FNeg: 1400 case Instruction::Mul: 1401 case Instruction::FMul: 1402 case Instruction::FDiv: 1403 case Instruction::FRem: 1404 case Instruction::Shl: 1405 case Instruction::LShr: 1406 case Instruction::AShr: 1407 case Instruction::And: 1408 case Instruction::Or: 1409 case Instruction::Xor: { 1410 // Just widen unops and binops. 1411 SmallVector<Value *, 2> Ops; 1412 for (VPValue *VPOp : operands()) 1413 Ops.push_back(State.get(VPOp)); 1414 1415 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1416 1417 if (auto *VecOp = dyn_cast<Instruction>(V)) 1418 setFlags(VecOp); 1419 1420 // Use this vector value for all users of the original instruction. 
1421 State.set(this, V); 1422 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1423 break; 1424 } 1425 case Instruction::Freeze: { 1426 Value *Op = State.get(getOperand(0)); 1427 1428 Value *Freeze = Builder.CreateFreeze(Op); 1429 State.set(this, Freeze); 1430 break; 1431 } 1432 case Instruction::ICmp: 1433 case Instruction::FCmp: { 1434 // Widen compares. Generate vector compares. 1435 bool FCmp = Opcode == Instruction::FCmp; 1436 Value *A = State.get(getOperand(0)); 1437 Value *B = State.get(getOperand(1)); 1438 Value *C = nullptr; 1439 if (FCmp) { 1440 // Propagate fast math flags. 1441 C = Builder.CreateFCmpFMF( 1442 getPredicate(), A, B, 1443 dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1444 } else { 1445 C = Builder.CreateICmp(getPredicate(), A, B); 1446 } 1447 State.set(this, C); 1448 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1449 break; 1450 } 1451 default: 1452 // This instruction is not vectorized by simple widening. 1453 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1454 << Instruction::getOpcodeName(Opcode)); 1455 llvm_unreachable("Unhandled instruction!"); 1456 } // end of switch. 1457 1458 #if !defined(NDEBUG) 1459 // Verify that VPlan type inference results agree with the type of the 1460 // generated values. 1461 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == 1462 State.get(this)->getType() && 1463 "inferred type and type from generated instructions do not match"); 1464 #endif 1465 } 1466 1467 InstructionCost VPWidenRecipe::computeCost(ElementCount VF, 1468 VPCostContext &Ctx) const { 1469 switch (Opcode) { 1470 case Instruction::FNeg: { 1471 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1472 return Ctx.TTI.getArithmeticInstrCost( 1473 Opcode, VectorTy, Ctx.CostKind, 1474 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1475 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); 1476 } 1477 1478 case Instruction::UDiv: 1479 case Instruction::SDiv: 1480 case Instruction::SRem: 1481 case Instruction::URem: 1482 // More complex computation, let the legacy cost-model handle this for now. 1483 return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); 1484 case Instruction::Add: 1485 case Instruction::FAdd: 1486 case Instruction::Sub: 1487 case Instruction::FSub: 1488 case Instruction::Mul: 1489 case Instruction::FMul: 1490 case Instruction::FDiv: 1491 case Instruction::FRem: 1492 case Instruction::Shl: 1493 case Instruction::LShr: 1494 case Instruction::AShr: 1495 case Instruction::And: 1496 case Instruction::Or: 1497 case Instruction::Xor: { 1498 VPValue *RHS = getOperand(1); 1499 // Certain instructions can be cheaper to vectorize if they have a constant 1500 // second vector operand. One example of this are shifts on x86. 
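    // For example, "shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>" can often
    // be lowered to an immediate-shift form; the operand-value info computed
    // below lets TTI account for that.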
1501 TargetTransformInfo::OperandValueInfo RHSInfo = { 1502 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; 1503 if (RHS->isLiveIn()) 1504 RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue()); 1505 1506 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && 1507 getOperand(1)->isDefinedOutsideLoopRegions()) 1508 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; 1509 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1510 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1511 1512 SmallVector<const Value *, 4> Operands; 1513 if (CtxI) 1514 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); 1515 return Ctx.TTI.getArithmeticInstrCost( 1516 Opcode, VectorTy, Ctx.CostKind, 1517 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1518 RHSInfo, Operands, CtxI, &Ctx.TLI); 1519 } 1520 case Instruction::Freeze: { 1521 // This opcode is unknown. Assume that it is the same as 'mul'. 1522 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1523 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 1524 Ctx.CostKind); 1525 } 1526 case Instruction::ICmp: 1527 case Instruction::FCmp: { 1528 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1529 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 1530 return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), 1531 Ctx.CostKind, 1532 {TTI::OK_AnyValue, TTI::OP_None}, 1533 {TTI::OK_AnyValue, TTI::OP_None}, CtxI); 1534 } 1535 default: 1536 llvm_unreachable("Unsupported opcode for instruction"); 1537 } 1538 } 1539 1540 void VPWidenEVLRecipe::execute(VPTransformState &State) { 1541 unsigned Opcode = getOpcode(); 1542 // TODO: Support other opcodes 1543 if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) 1544 llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); 1545 1546 State.setDebugLocFrom(getDebugLoc()); 1547 1548 assert(State.get(getOperand(0))->getType()->isVectorTy() && 1549 "VPWidenEVLRecipe should not be used for scalars"); 1550 1551 VPValue *EVL = getEVL(); 1552 Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true); 1553 IRBuilderBase &BuilderIR = State.Builder; 1554 VectorBuilder Builder(BuilderIR); 1555 Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); 1556 1557 SmallVector<Value *, 4> Ops; 1558 for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { 1559 VPValue *VPOp = getOperand(I); 1560 Ops.push_back(State.get(VPOp)); 1561 } 1562 1563 Builder.setMask(Mask).setEVL(EVLArg); 1564 Value *VPInst = 1565 Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); 1566 // Currently vp-intrinsics only accept FMF flags. 1567 // TODO: Enable other flags when support is added. 
1568 if (isa<FPMathOperator>(VPInst)) 1569 setFlags(cast<Instruction>(VPInst)); 1570 1571 State.set(this, VPInst); 1572 State.addMetadata(VPInst, 1573 dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1574 } 1575 1576 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1577 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 1578 VPSlotTracker &SlotTracker) const { 1579 O << Indent << "WIDEN "; 1580 printAsOperand(O, SlotTracker); 1581 O << " = " << Instruction::getOpcodeName(Opcode); 1582 printFlags(O); 1583 printOperands(O, SlotTracker); 1584 } 1585 1586 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1587 VPSlotTracker &SlotTracker) const { 1588 O << Indent << "WIDEN "; 1589 printAsOperand(O, SlotTracker); 1590 O << " = vp." << Instruction::getOpcodeName(getOpcode()); 1591 printFlags(O); 1592 printOperands(O, SlotTracker); 1593 } 1594 #endif 1595 1596 void VPWidenCastRecipe::execute(VPTransformState &State) { 1597 State.setDebugLocFrom(getDebugLoc()); 1598 auto &Builder = State.Builder; 1599 /// Vectorize casts. 1600 assert(State.VF.isVector() && "Not vectorizing?"); 1601 Type *DestTy = VectorType::get(getResultType(), State.VF); 1602 VPValue *Op = getOperand(0); 1603 Value *A = State.get(Op); 1604 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 1605 State.set(this, Cast); 1606 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 1607 if (auto *CastOp = dyn_cast<Instruction>(Cast)) 1608 setFlags(CastOp); 1609 } 1610 1611 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, 1612 VPCostContext &Ctx) const { 1613 // TODO: In some cases, VPWidenCastRecipes are created but not considered in 1614 // the legacy cost model, including truncates/extends when evaluating a 1615 // reduction in a smaller type. 1616 if (!getUnderlyingValue()) 1617 return 0; 1618 // Computes the CastContextHint from a recipes that may access memory. 1619 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { 1620 if (VF.isScalar()) 1621 return TTI::CastContextHint::Normal; 1622 if (isa<VPInterleaveRecipe>(R)) 1623 return TTI::CastContextHint::Interleave; 1624 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) 1625 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked 1626 : TTI::CastContextHint::Normal; 1627 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R); 1628 if (WidenMemoryRecipe == nullptr) 1629 return TTI::CastContextHint::None; 1630 if (!WidenMemoryRecipe->isConsecutive()) 1631 return TTI::CastContextHint::GatherScatter; 1632 if (WidenMemoryRecipe->isReverse()) 1633 return TTI::CastContextHint::Reversed; 1634 if (WidenMemoryRecipe->isMasked()) 1635 return TTI::CastContextHint::Masked; 1636 return TTI::CastContextHint::Normal; 1637 }; 1638 1639 VPValue *Operand = getOperand(0); 1640 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1641 // For Trunc/FPTrunc, get the context from the only user. 1642 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && 1643 !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { 1644 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin())) 1645 CCH = ComputeCCH(StoreRecipe); 1646 } 1647 // For Z/Sext, get the context from the operand. 
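  // Illustrative examples of the hint chosen below: a zext fed by a reverse
  // consecutive widened load would be costed with CastContextHint::Reversed,
  // while one fed by a gather would use CastContextHint::GatherScatter.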
1648 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 1649 Opcode == Instruction::FPExt) { 1650 if (Operand->isLiveIn()) 1651 CCH = TTI::CastContextHint::Normal; 1652 else if (Operand->getDefiningRecipe()) 1653 CCH = ComputeCCH(Operand->getDefiningRecipe()); 1654 } 1655 1656 auto *SrcTy = 1657 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); 1658 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); 1659 // Arm TTI will use the underlying instruction to determine the cost. 1660 return Ctx.TTI.getCastInstrCost( 1661 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, 1662 dyn_cast_if_present<Instruction>(getUnderlyingValue())); 1663 } 1664 1665 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1666 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1667 VPSlotTracker &SlotTracker) const { 1668 O << Indent << "WIDEN-CAST "; 1669 printAsOperand(O, SlotTracker); 1670 O << " = " << Instruction::getOpcodeName(Opcode); 1671 printFlags(O); 1672 printOperands(O, SlotTracker); 1673 O << " to " << *getResultType(); 1674 } 1675 #endif 1676 1677 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, 1678 VPCostContext &Ctx) const { 1679 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 1680 } 1681 1682 /// This function adds 1683 /// (0 * Step, 1 * Step, 2 * Step, ...) 1684 /// to each vector element of Val. 1685 /// \p Opcode is relevant for FP induction variable. 1686 static Value *getStepVector(Value *Val, Value *Step, 1687 Instruction::BinaryOps BinOp, ElementCount VF, 1688 IRBuilderBase &Builder) { 1689 assert(VF.isVector() && "only vector VFs are supported"); 1690 1691 // Create and check the types. 1692 auto *ValVTy = cast<VectorType>(Val->getType()); 1693 ElementCount VLen = ValVTy->getElementCount(); 1694 1695 Type *STy = Val->getType()->getScalarType(); 1696 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1697 "Induction Step must be an integer or FP"); 1698 assert(Step->getType() == STy && "Step has wrong type"); 1699 1700 SmallVector<Constant *, 8> Indices; 1701 1702 // Create a vector of consecutive numbers from zero to VF. 1703 VectorType *InitVecValVTy = ValVTy; 1704 if (STy->isFloatingPointTy()) { 1705 Type *InitVecValSTy = 1706 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 1707 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 1708 } 1709 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 1710 1711 if (STy->isIntegerTy()) { 1712 Step = Builder.CreateVectorSplat(VLen, Step); 1713 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1714 // FIXME: The newly created binary instructions should contain nsw/nuw 1715 // flags, which can be found from the original scalar operations. 1716 Step = Builder.CreateMul(InitVec, Step); 1717 return Builder.CreateAdd(Val, Step, "induction"); 1718 } 1719 1720 // Floating point induction. 1721 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1722 "Binary Opcode should be specified for FP induction"); 1723 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 1724 1725 Step = Builder.CreateVectorSplat(VLen, Step); 1726 Value *MulOp = Builder.CreateFMul(InitVec, Step); 1727 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1728 } 1729 1730 /// A helper function that returns an integer or floating-point constant with 1731 /// value C. 1732 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 1733 return Ty->isIntegerTy() ? 
ConstantInt::getSigned(Ty, C) 1734 : ConstantFP::get(Ty, C); 1735 } 1736 1737 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1738 assert(!State.Lane && "Int or FP induction being replicated."); 1739 1740 Value *Start = getStartValue()->getLiveInIRValue(); 1741 const InductionDescriptor &ID = getInductionDescriptor(); 1742 TruncInst *Trunc = getTruncInst(); 1743 IRBuilderBase &Builder = State.Builder; 1744 assert(getPHINode()->getType() == ID.getStartValue()->getType() && 1745 "Types must match"); 1746 assert(State.VF.isVector() && "must have vector VF"); 1747 1748 // The value from the original loop to which we are mapping the new induction 1749 // variable. 1750 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode(); 1751 1752 // Fast-math-flags propagate from the original induction instruction. 1753 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1754 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1755 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1756 1757 // Now do the actual transformations, and start with fetching the step value. 1758 Value *Step = State.get(getStepValue(), VPLane(0)); 1759 1760 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1761 "Expected either an induction phi-node or a truncate of it!"); 1762 1763 // Construct the initial value of the vector IV in the vector loop preheader 1764 auto CurrIP = Builder.saveIP(); 1765 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1766 Builder.SetInsertPoint(VectorPH->getTerminator()); 1767 if (isa<TruncInst>(EntryVal)) { 1768 assert(Start->getType()->isIntegerTy() && 1769 "Truncation requires an integer type"); 1770 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1771 Step = Builder.CreateTrunc(Step, TruncType); 1772 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1773 } 1774 1775 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1776 Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(), 1777 State.VF, State.Builder); 1778 1779 // We create vector phi nodes for both integer and floating-point induction 1780 // variables. Here, we determine the kind of arithmetic we will perform. 1781 Instruction::BinaryOps AddOp; 1782 Instruction::BinaryOps MulOp; 1783 if (Step->getType()->isIntegerTy()) { 1784 AddOp = Instruction::Add; 1785 MulOp = Instruction::Mul; 1786 } else { 1787 AddOp = ID.getInductionOpcode(); 1788 MulOp = Instruction::FMul; 1789 } 1790 1791 Value *SplatVF; 1792 if (VPValue *SplatVFOperand = getSplatVFValue()) { 1793 // The recipe has been unrolled. In that case, fetch the splat value for the 1794 // induction increment. 1795 SplatVF = State.get(SplatVFOperand); 1796 } else { 1797 // Multiply the vectorization factor by the step using integer or 1798 // floating-point arithmetic as appropriate. 1799 Type *StepType = Step->getType(); 1800 Value *RuntimeVF = State.get(getVFValue(), VPLane(0)); 1801 if (Step->getType()->isFloatingPointTy()) 1802 RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); 1803 else 1804 RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); 1805 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1806 1807 // Create a vector splat to use in the induction update. 1808 SplatVF = Builder.CreateVectorSplat(State.VF, Mul); 1809 } 1810 1811 Builder.restoreIP(CurrIP); 1812 1813 // We may need to add the step a number of times, depending on the unroll 1814 // factor. The last of those goes into the PHI. 
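  // Editor's example (fixed VF = 4, unrolling ignored): with start 10 and step
  // 2, SteppedStart is <10, 12, 14, 16> and SplatVF is <8, 8, 8, 8>, so the
  // vec.ind phi created below begins at <10, 12, 14, 16> and advances each
  // lane by 8 per vector iteration.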
1815 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1816 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1817 VecInd->setDebugLoc(getDebugLoc()); 1818 State.set(this, VecInd); 1819 1820 Instruction *LastInduction = cast<Instruction>( 1821 Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); 1822 if (isa<TruncInst>(EntryVal)) 1823 State.addMetadata(LastInduction, EntryVal); 1824 LastInduction->setDebugLoc(getDebugLoc()); 1825 1826 VecInd->addIncoming(SteppedStart, VectorPH); 1827 // Add induction update using an incorrect block temporarily. The phi node 1828 // will be fixed after VPlan execution. Note that at this point the latch 1829 // block cannot be used, as it does not exist yet. 1830 // TODO: Model increment value in VPlan, by turning the recipe into a 1831 // multi-def and a subclass of VPHeaderPHIRecipe. 1832 VecInd->addIncoming(LastInduction, VectorPH); 1833 } 1834 1835 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1836 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1837 VPSlotTracker &SlotTracker) const { 1838 O << Indent; 1839 printAsOperand(O, SlotTracker); 1840 O << " = WIDEN-INDUCTION "; 1841 printOperands(O, SlotTracker); 1842 1843 if (auto *TI = getTruncInst()) 1844 O << " (truncated to " << *TI->getType() << ")"; 1845 } 1846 #endif 1847 1848 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1849 // The step may be defined by a recipe in the preheader (e.g. if it requires 1850 // SCEV expansion), but for the canonical induction the step is required to be 1851 // 1, which is represented as live-in. 1852 if (getStepValue()->getDefiningRecipe()) 1853 return false; 1854 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1855 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1856 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1857 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1858 getScalarType() == CanIV->getScalarType(); 1859 } 1860 1861 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1862 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1863 VPSlotTracker &SlotTracker) const { 1864 O << Indent; 1865 printAsOperand(O, SlotTracker); 1866 O << " = DERIVED-IV "; 1867 getStartValue()->printAsOperand(O, SlotTracker); 1868 O << " + "; 1869 getOperand(1)->printAsOperand(O, SlotTracker); 1870 O << " * "; 1871 getStepValue()->printAsOperand(O, SlotTracker); 1872 } 1873 #endif 1874 1875 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1876 // Fast-math-flags propagate from the original induction instruction. 1877 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1878 if (hasFastMathFlags()) 1879 State.Builder.setFastMathFlags(getFastMathFlags()); 1880 1881 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1882 /// variable on which to base the steps, \p Step is the size of the step. 1883 1884 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 1885 Value *Step = State.get(getStepValue(), VPLane(0)); 1886 IRBuilderBase &Builder = State.Builder; 1887 1888 // Ensure step has the same type as that of scalar IV. 1889 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1890 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1891 1892 // We build scalar steps for both integer and floating-point induction 1893 // variables. Here, we determine the kind of arithmetic we will perform. 
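  // Rough illustration of the values produced below (not verbatim output):
  // lane L of unroll part P receives BaseIV + (P * VF + L) * Step, e.g. with
  // VF = 4 and Step = 2, part 1 lane 2 gets BaseIV + 12; the scalable path
  // computes the same using a step vector instead of per-lane constants.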
1894 Instruction::BinaryOps AddOp; 1895 Instruction::BinaryOps MulOp; 1896 if (BaseIVTy->isIntegerTy()) { 1897 AddOp = Instruction::Add; 1898 MulOp = Instruction::Mul; 1899 } else { 1900 AddOp = InductionOpcode; 1901 MulOp = Instruction::FMul; 1902 } 1903 1904 // Determine the number of scalars we need to generate for each unroll 1905 // iteration. 1906 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1907 // Compute the scalar steps and save the results in State. 1908 Type *IntStepTy = 1909 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1910 Type *VecIVTy = nullptr; 1911 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1912 if (!FirstLaneOnly && State.VF.isScalable()) { 1913 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1914 UnitStepVec = 1915 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1916 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1917 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1918 } 1919 1920 unsigned StartLane = 0; 1921 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1922 if (State.Lane) { 1923 StartLane = State.Lane->getKnownLane(); 1924 EndLane = StartLane + 1; 1925 } 1926 Value *StartIdx0 = 1927 createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); 1928 1929 if (!FirstLaneOnly && State.VF.isScalable()) { 1930 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1931 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1932 if (BaseIVTy->isFloatingPointTy()) 1933 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1934 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1935 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1936 State.set(this, Add); 1937 // It's useful to record the lane values too for the known minimum number 1938 // of elements so we do those below. This improves the code quality when 1939 // trying to extract the first element, for example. 1940 } 1941 1942 if (BaseIVTy->isFloatingPointTy()) 1943 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1944 1945 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1946 Value *StartIdx = Builder.CreateBinOp( 1947 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1948 // The step returned by `createStepForVF` is a runtime-evaluated value 1949 // when VF is scalable. Otherwise, it should be folded into a Constant. 1950 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1951 "Expected StartIdx to be folded to a constant when VF is not " 1952 "scalable"); 1953 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1954 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1955 State.set(this, Add, VPLane(Lane)); 1956 } 1957 } 1958 1959 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1960 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1961 VPSlotTracker &SlotTracker) const { 1962 O << Indent; 1963 printAsOperand(O, SlotTracker); 1964 O << " = SCALAR-STEPS "; 1965 printOperands(O, SlotTracker); 1966 } 1967 #endif 1968 1969 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1970 assert(State.VF.isVector() && "not widening"); 1971 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1972 // Construct a vector GEP by widening the operands of the scalar GEP as 1973 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1974 // results in a vector of pointers when at least one operand of the GEP 1975 // is vector-typed. 
Thus, to keep the representation compact, we only use 1976 // vector-typed operands for loop-varying values. 1977 1978 if (areAllOperandsInvariant()) { 1979 // If we are vectorizing, but the GEP has only loop-invariant operands, 1980 // the GEP we build (by only using vector-typed operands for 1981 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1982 // produce a vector of pointers, we need to either arbitrarily pick an 1983 // operand to broadcast, or broadcast a clone of the original GEP. 1984 // Here, we broadcast a clone of the original. 1985 // 1986 // TODO: If at some point we decide to scalarize instructions having 1987 // loop-invariant operands, this special case will no longer be 1988 // required. We would add the scalarization decision to 1989 // collectLoopScalars() and teach getVectorValue() to broadcast 1990 // the lane-zero scalar value. 1991 SmallVector<Value *> Ops; 1992 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1993 Ops.push_back(State.get(getOperand(I), VPLane(0))); 1994 1995 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1996 ArrayRef(Ops).drop_front(), "", 1997 getGEPNoWrapFlags()); 1998 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1999 State.set(this, Splat); 2000 State.addMetadata(Splat, GEP); 2001 } else { 2002 // If the GEP has at least one loop-varying operand, we are sure to 2003 // produce a vector of pointers unless VF is scalar. 2004 // The pointer operand of the new GEP. If it's loop-invariant, we 2005 // won't broadcast it. 2006 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 2007 : State.get(getOperand(0)); 2008 2009 // Collect all the indices for the new GEP. If any index is 2010 // loop-invariant, we won't broadcast it. 2011 SmallVector<Value *, 4> Indices; 2012 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 2013 VPValue *Operand = getOperand(I); 2014 if (isIndexLoopInvariant(I - 1)) 2015 Indices.push_back(State.get(Operand, VPLane(0))); 2016 else 2017 Indices.push_back(State.get(Operand)); 2018 } 2019 2020 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 2021 // but it should be a vector, otherwise. 2022 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 2023 Indices, "", getGEPNoWrapFlags()); 2024 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 2025 "NewGEP is not a pointer vector"); 2026 State.set(this, NewGEP); 2027 State.addMetadata(NewGEP, GEP); 2028 } 2029 } 2030 2031 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2032 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 2033 VPSlotTracker &SlotTracker) const { 2034 O << Indent << "WIDEN-GEP "; 2035 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 2036 for (size_t I = 0; I < getNumOperands() - 1; ++I) 2037 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 2038 2039 O << " "; 2040 printAsOperand(O, SlotTracker); 2041 O << " = getelementptr"; 2042 printFlags(O); 2043 printOperands(O, SlotTracker); 2044 } 2045 #endif 2046 2047 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, 2048 unsigned CurrentPart, IRBuilderBase &Builder) { 2049 // Use i32 for the gep index type when the value is constant, 2050 // or query DataLayout for a more suitable index type otherwise. 2051 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 2052 return IsScalable && (IsReverse || CurrentPart > 0) 2053 ? 
DL.getIndexType(Builder.getPtrTy(0)) 2054 : Builder.getInt32Ty(); 2055 } 2056 2057 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) { 2058 auto &Builder = State.Builder; 2059 State.setDebugLocFrom(getDebugLoc()); 2060 unsigned CurrentPart = getUnrollPart(*this); 2061 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, 2062 CurrentPart, Builder); 2063 2064 // The wide store needs to start at the last vector element. 2065 Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); 2066 if (IndexTy != RunTimeVF->getType()) 2067 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); 2068 // NumElt = -CurrentPart * RunTimeVF 2069 Value *NumElt = Builder.CreateMul( 2070 ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); 2071 // LastLane = 1 - RunTimeVF 2072 Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 2073 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2074 Value *ResultPtr = 2075 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); 2076 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", 2077 getGEPNoWrapFlags()); 2078 2079 State.set(this, ResultPtr, /*IsScalar*/ true); 2080 } 2081 2082 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2083 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2084 VPSlotTracker &SlotTracker) const { 2085 O << Indent; 2086 printAsOperand(O, SlotTracker); 2087 O << " = reverse-vector-pointer"; 2088 printFlags(O); 2089 printOperands(O, SlotTracker); 2090 } 2091 #endif 2092 2093 void VPVectorPointerRecipe::execute(VPTransformState &State) { 2094 auto &Builder = State.Builder; 2095 State.setDebugLocFrom(getDebugLoc()); 2096 unsigned CurrentPart = getUnrollPart(*this); 2097 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, 2098 CurrentPart, Builder); 2099 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2100 2101 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 2102 Value *ResultPtr = 2103 Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); 2104 2105 State.set(this, ResultPtr, /*IsScalar*/ true); 2106 } 2107 2108 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2109 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2110 VPSlotTracker &SlotTracker) const { 2111 O << Indent; 2112 printAsOperand(O, SlotTracker); 2113 O << " = vector-pointer "; 2114 2115 printOperands(O, SlotTracker); 2116 } 2117 #endif 2118 2119 void VPBlendRecipe::execute(VPTransformState &State) { 2120 assert(isNormalized() && "Expected blend to be normalized!"); 2121 State.setDebugLocFrom(getDebugLoc()); 2122 // We know that all PHIs in non-header blocks are converted into 2123 // selects, so we don't have to worry about the insertion order and we 2124 // can just use the builder. 2125 // At this point we generate the predication tree. There may be 2126 // duplications since this is a simple recursive scan, but future 2127 // optimizations will clean it up. 2128 2129 unsigned NumIncoming = getNumIncomingValues(); 2130 2131 // Generate a sequence of selects of the form: 2132 // SELECT(Mask3, In3, 2133 // SELECT(Mask2, In2, 2134 // SELECT(Mask1, In1, 2135 // In0))) 2136 // Note that Mask0 is never used: lanes for which no path reaches this phi and 2137 // are essentially undef are taken from In0. 
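  // Editor's sketch (illustrative IR, names approximate): with three incoming
  // values the loop below emits two selects, e.g.
  //   %predphi  = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
  //   %predphi3 = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %predphi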
2138 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 2139 Value *Result = nullptr; 2140 for (unsigned In = 0; In < NumIncoming; ++In) { 2141 // We might have single edge PHIs (blocks) - use an identity 2142 // 'select' for the first PHI operand. 2143 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 2144 if (In == 0) 2145 Result = In0; // Initialize with the first incoming value. 2146 else { 2147 // Select between the current value and the previous incoming edge 2148 // based on the incoming mask. 2149 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 2150 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 2151 } 2152 } 2153 State.set(this, Result, OnlyFirstLaneUsed); 2154 } 2155 2156 InstructionCost VPBlendRecipe::computeCost(ElementCount VF, 2157 VPCostContext &Ctx) const { 2158 // Handle cases where only the first lane is used the same way as the legacy 2159 // cost model. 2160 if (vputils::onlyFirstLaneUsed(this)) 2161 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 2162 2163 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 2164 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); 2165 return (getNumIncomingValues() - 1) * 2166 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, 2167 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); 2168 } 2169 2170 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2171 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 2172 VPSlotTracker &SlotTracker) const { 2173 O << Indent << "BLEND "; 2174 printAsOperand(O, SlotTracker); 2175 O << " ="; 2176 if (getNumIncomingValues() == 1) { 2177 // Not a User of any mask: not really blending, this is a 2178 // single-predecessor phi. 2179 O << " "; 2180 getIncomingValue(0)->printAsOperand(O, SlotTracker); 2181 } else { 2182 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 2183 O << " "; 2184 getIncomingValue(I)->printAsOperand(O, SlotTracker); 2185 if (I == 0) 2186 continue; 2187 O << "/"; 2188 getMask(I)->printAsOperand(O, SlotTracker); 2189 } 2190 } 2191 } 2192 #endif 2193 2194 void VPReductionRecipe::execute(VPTransformState &State) { 2195 assert(!State.Lane && "Reduction being replicated."); 2196 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2197 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2198 // Propagate the fast-math flags carried by the underlying instruction. 2199 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 2200 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2201 State.setDebugLocFrom(getDebugLoc()); 2202 Value *NewVecOp = State.get(getVecOp()); 2203 if (VPValue *Cond = getCondOp()) { 2204 Value *NewCond = State.get(Cond, State.VF.isScalar()); 2205 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 2206 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 2207 2208 Value *Start; 2209 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 2210 Start = RdxDesc.getRecurrenceStartValue(); 2211 else 2212 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 2213 RdxDesc.getFastMathFlags()); 2214 if (State.VF.isVector()) 2215 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 2216 2217 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 2218 NewVecOp = Select; 2219 } 2220 Value *NewRed; 2221 Value *NextInChain; 2222 if (IsOrdered) { 2223 if (State.VF.isVector()) 2224 NewRed = 2225 createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); 2226 else 2227 NewRed = State.Builder.CreateBinOp( 2228 (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp); 2229 PrevInChain = NewRed; 2230 NextInChain = NewRed; 2231 } else { 2232 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2233 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 2234 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2235 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 2236 NewRed, PrevInChain); 2237 else 2238 NextInChain = State.Builder.CreateBinOp( 2239 (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain); 2240 } 2241 State.set(this, NextInChain, /*IsScalar*/ true); 2242 } 2243 2244 void VPReductionEVLRecipe::execute(VPTransformState &State) { 2245 assert(!State.Lane && "Reduction being replicated."); 2246 2247 auto &Builder = State.Builder; 2248 // Propagate the fast-math flags carried by the underlying instruction. 2249 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 2250 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2251 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2252 2253 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2254 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 2255 Value *VecOp = State.get(getVecOp()); 2256 Value *EVL = State.get(getEVL(), VPLane(0)); 2257 2258 VectorBuilder VBuilder(Builder); 2259 VBuilder.setEVL(EVL); 2260 Value *Mask; 2261 // TODO: move the all-true mask generation into VectorBuilder. 2262 if (VPValue *CondOp = getCondOp()) 2263 Mask = State.get(CondOp); 2264 else 2265 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2266 VBuilder.setMask(Mask); 2267 2268 Value *NewRed; 2269 if (isOrdered()) { 2270 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 2271 } else { 2272 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 2273 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2274 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2275 else 2276 NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(), 2277 NewRed, Prev); 2278 } 2279 State.set(this, NewRed, /*IsScalar*/ true); 2280 } 2281 2282 InstructionCost VPReductionRecipe::computeCost(ElementCount VF, 2283 VPCostContext &Ctx) const { 2284 RecurKind RdxKind = RdxDesc.getRecurrenceKind(); 2285 Type *ElementTy = Ctx.Types.inferScalarType(this); 2286 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); 2287 unsigned Opcode = RdxDesc.getOpcode(); 2288 2289 // TODO: Support any-of and in-loop reductions. 
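  // As a hedged example of the formula used below: an i32 add reduction at
  // VF = 4 is charged one scalar add (the chain update) plus the target's
  // cost of reducing a <4 x i32> vector; min/max recurrences substitute the
  // min/max reduction cost for the latter.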
2290 assert( 2291 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || 2292 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2293 "Any-of reduction not implemented in VPlan-based cost model currently."); 2294 assert( 2295 (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() || 2296 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2297 "In-loop reduction not implemented in VPlan-based cost model currently."); 2298 2299 assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() && 2300 "Inferred type and recurrence type mismatch."); 2301 2302 // Cost = Reduction cost + BinOp cost 2303 InstructionCost Cost = 2304 Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind); 2305 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { 2306 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 2307 return Cost + Ctx.TTI.getMinMaxReductionCost( 2308 Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind); 2309 } 2310 2311 return Cost + Ctx.TTI.getArithmeticReductionCost( 2312 Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind); 2313 } 2314 2315 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2316 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2317 VPSlotTracker &SlotTracker) const { 2318 O << Indent << "REDUCE "; 2319 printAsOperand(O, SlotTracker); 2320 O << " = "; 2321 getChainOp()->printAsOperand(O, SlotTracker); 2322 O << " +"; 2323 if (isa<FPMathOperator>(getUnderlyingInstr())) 2324 O << getUnderlyingInstr()->getFastMathFlags(); 2325 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2326 getVecOp()->printAsOperand(O, SlotTracker); 2327 if (isConditional()) { 2328 O << ", "; 2329 getCondOp()->printAsOperand(O, SlotTracker); 2330 } 2331 O << ")"; 2332 if (RdxDesc.IntermediateStore) 2333 O << " (with final reduction value stored in invariant address sank " 2334 "outside of loop)"; 2335 } 2336 2337 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2338 VPSlotTracker &SlotTracker) const { 2339 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2340 O << Indent << "REDUCE "; 2341 printAsOperand(O, SlotTracker); 2342 O << " = "; 2343 getChainOp()->printAsOperand(O, SlotTracker); 2344 O << " +"; 2345 if (isa<FPMathOperator>(getUnderlyingInstr())) 2346 O << getUnderlyingInstr()->getFastMathFlags(); 2347 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2348 getVecOp()->printAsOperand(O, SlotTracker); 2349 O << ", "; 2350 getEVL()->printAsOperand(O, SlotTracker); 2351 if (isConditional()) { 2352 O << ", "; 2353 getCondOp()->printAsOperand(O, SlotTracker); 2354 } 2355 O << ")"; 2356 if (RdxDesc.IntermediateStore) 2357 O << " (with final reduction value stored in invariant address sank " 2358 "outside of loop)"; 2359 } 2360 #endif 2361 2362 bool VPReplicateRecipe::shouldPack() const { 2363 // Find if the recipe is used by a widened recipe via an intervening 2364 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 
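  // Sketch of the pattern being matched (VPlan notation, illustrative only):
  //   REPLICATE ir<%x> = load ...              ; this recipe, predicated
  //   PHI-PREDICATED-INSTRUCTION vp<%y> = ir<%x>
  //   WIDEN ir<%z> = add vp<%y>, ...           ; vector user => pack lanes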
2365 return any_of(users(), [](const VPUser *U) { 2366 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2367 return any_of(PredR->users(), [PredR](const VPUser *U) { 2368 return !U->usesScalars(PredR); 2369 }); 2370 return false; 2371 }); 2372 } 2373 2374 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, 2375 VPCostContext &Ctx) const { 2376 Instruction *UI = cast<Instruction>(getUnderlyingValue()); 2377 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan 2378 // transform, avoid computing their cost multiple times for now. 2379 Ctx.SkipCostComputation.insert(UI); 2380 return Ctx.getLegacyCost(UI, VF); 2381 } 2382 2383 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2384 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2385 VPSlotTracker &SlotTracker) const { 2386 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2387 2388 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2389 printAsOperand(O, SlotTracker); 2390 O << " = "; 2391 } 2392 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2393 O << "call"; 2394 printFlags(O); 2395 O << "@" << CB->getCalledFunction()->getName() << "("; 2396 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2397 O, [&O, &SlotTracker](VPValue *Op) { 2398 Op->printAsOperand(O, SlotTracker); 2399 }); 2400 O << ")"; 2401 } else { 2402 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2403 printFlags(O); 2404 printOperands(O, SlotTracker); 2405 } 2406 2407 if (shouldPack()) 2408 O << " (S->V)"; 2409 } 2410 #endif 2411 2412 Value *VPScalarCastRecipe ::generate(VPTransformState &State) { 2413 State.setDebugLocFrom(getDebugLoc()); 2414 assert(vputils::onlyFirstLaneUsed(this) && 2415 "Codegen only implemented for first lane."); 2416 switch (Opcode) { 2417 case Instruction::SExt: 2418 case Instruction::ZExt: 2419 case Instruction::Trunc: { 2420 // Note: SExt/ZExt not used yet. 2421 Value *Op = State.get(getOperand(0), VPLane(0)); 2422 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2423 } 2424 default: 2425 llvm_unreachable("opcode not implemented yet"); 2426 } 2427 } 2428 2429 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2430 State.set(this, generate(State), VPLane(0)); 2431 } 2432 2433 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2434 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2435 VPSlotTracker &SlotTracker) const { 2436 O << Indent << "SCALAR-CAST "; 2437 printAsOperand(O, SlotTracker); 2438 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2439 printOperands(O, SlotTracker); 2440 O << " to " << *ResultTy; 2441 } 2442 #endif 2443 2444 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2445 assert(State.Lane && "Branch on Mask works only on single instance."); 2446 2447 2448 Value *ConditionBit = nullptr; 2449 VPValue *BlockInMask = getMask(); 2450 if (BlockInMask) 2451 ConditionBit = State.get(BlockInMask, *State.Lane); 2452 else // Block in mask is all-one. 2453 ConditionBit = State.Builder.getTrue(); 2454 2455 // Replace the temporary unreachable terminator with a new conditional branch, 2456 // whose two destinations will be set later when they are created. 
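  // Rough picture (assumed): ConditionBit is the current lane's mask bit,
  // typically materialized as an extract of the widened mask at that lane, and
  // the branch created below guards the single predicated instance for this
  // lane.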
2457 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2458 assert(isa<UnreachableInst>(CurrentTerminator) && 2459 "Expected to replace unreachable terminator with conditional branch."); 2460 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2461 CondBr->setSuccessor(0, nullptr); 2462 ReplaceInstWithInst(CurrentTerminator, CondBr); 2463 } 2464 2465 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, 2466 VPCostContext &Ctx) const { 2467 // The legacy cost model doesn't assign costs to branches for individual 2468 // replicate regions. Match the current behavior in the VPlan cost model for 2469 // now. 2470 return 0; 2471 } 2472 2473 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2474 State.setDebugLocFrom(getDebugLoc()); 2475 assert(State.Lane && "Predicated instruction PHI works per instance."); 2476 Instruction *ScalarPredInst = 2477 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 2478 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2479 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2480 assert(PredicatingBB && "Predicated block has no single predecessor."); 2481 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2482 "operand must be VPReplicateRecipe"); 2483 2484 // By current pack/unpack logic we need to generate only a single phi node: if 2485 // a vector value for the predicated instruction exists at this point it means 2486 // the instruction has vector users only, and a phi for the vector value is 2487 // needed. In this case the recipe of the predicated instruction is marked to 2488 // also do that packing, thereby "hoisting" the insert-element sequence. 2489 // Otherwise, a phi node for the scalar value is needed. 2490 if (State.hasVectorValue(getOperand(0))) { 2491 Value *VectorValue = State.get(getOperand(0)); 2492 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2493 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2494 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2495 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2496 if (State.hasVectorValue(this)) 2497 State.reset(this, VPhi); 2498 else 2499 State.set(this, VPhi); 2500 // NOTE: Currently we need to update the value of the operand, so the next 2501 // predicated iteration inserts its generated value in the correct vector. 2502 State.reset(getOperand(0), VPhi); 2503 } else { 2504 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) 2505 return; 2506 2507 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2508 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2509 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2510 PredicatingBB); 2511 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2512 if (State.hasScalarValue(this, *State.Lane)) 2513 State.reset(this, Phi, *State.Lane); 2514 else 2515 State.set(this, Phi, *State.Lane); 2516 // NOTE: Currently we need to update the value of the operand, so the next 2517 // predicated iteration inserts its generated value in the correct vector. 
2518 State.reset(getOperand(0), Phi, *State.Lane); 2519 } 2520 } 2521 2522 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2523 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2524 VPSlotTracker &SlotTracker) const { 2525 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2526 printAsOperand(O, SlotTracker); 2527 O << " = "; 2528 printOperands(O, SlotTracker); 2529 } 2530 #endif 2531 2532 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2533 VPCostContext &Ctx) const { 2534 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2535 const Align Alignment = 2536 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2537 unsigned AS = 2538 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2539 2540 if (!Consecutive) { 2541 // TODO: Using the original IR may not be accurate. 2542 // Currently, ARM will use the underlying IR to calculate gather/scatter 2543 // instruction cost. 2544 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2545 assert(!Reverse && 2546 "Inconsecutive memory access should not have the order."); 2547 return Ctx.TTI.getAddressComputationCost(Ty) + 2548 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2549 IsMasked, Alignment, Ctx.CostKind, 2550 &Ingredient); 2551 } 2552 2553 InstructionCost Cost = 0; 2554 if (IsMasked) { 2555 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2556 AS, Ctx.CostKind); 2557 } else { 2558 TTI::OperandValueInfo OpInfo = 2559 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2560 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2561 Ctx.CostKind, OpInfo, &Ingredient); 2562 } 2563 if (!Reverse) 2564 return Cost; 2565 2566 return Cost += 2567 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2568 cast<VectorType>(Ty), {}, Ctx.CostKind, 0); 2569 } 2570 2571 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2572 auto *LI = cast<LoadInst>(&Ingredient); 2573 2574 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2575 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2576 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2577 bool CreateGather = !isConsecutive(); 2578 2579 auto &Builder = State.Builder; 2580 State.setDebugLocFrom(getDebugLoc()); 2581 Value *Mask = nullptr; 2582 if (auto *VPMask = getMask()) { 2583 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2584 // of a null all-one mask is a null mask. 2585 Mask = State.get(VPMask); 2586 if (isReverse()) 2587 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2588 } 2589 2590 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); 2591 Value *NewLI; 2592 if (CreateGather) { 2593 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2594 "wide.masked.gather"); 2595 } else if (Mask) { 2596 NewLI = 2597 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2598 PoisonValue::get(DataTy), "wide.masked.load"); 2599 } else { 2600 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2601 } 2602 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
2603 State.addMetadata(NewLI, LI); 2604 if (Reverse) 2605 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2606 State.set(this, NewLI); 2607 } 2608 2609 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2610 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2611 VPSlotTracker &SlotTracker) const { 2612 O << Indent << "WIDEN "; 2613 printAsOperand(O, SlotTracker); 2614 O << " = load "; 2615 printOperands(O, SlotTracker); 2616 } 2617 #endif 2618 2619 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2620 /// dependence w/o affecting the result. 2621 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2622 Value *EVL, const Twine &Name) { 2623 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2624 Value *AllTrueMask = 2625 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2626 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2627 {Operand, AllTrueMask, EVL}, nullptr, Name); 2628 } 2629 2630 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2631 auto *LI = cast<LoadInst>(&Ingredient); 2632 2633 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2634 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2635 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2636 bool CreateGather = !isConsecutive(); 2637 2638 auto &Builder = State.Builder; 2639 State.setDebugLocFrom(getDebugLoc()); 2640 CallInst *NewLI; 2641 Value *EVL = State.get(getEVL(), VPLane(0)); 2642 Value *Addr = State.get(getAddr(), !CreateGather); 2643 Value *Mask = nullptr; 2644 if (VPValue *VPMask = getMask()) { 2645 Mask = State.get(VPMask); 2646 if (isReverse()) 2647 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2648 } else { 2649 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2650 } 2651 2652 if (CreateGather) { 2653 NewLI = 2654 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2655 nullptr, "wide.masked.gather"); 2656 } else { 2657 VectorBuilder VBuilder(Builder); 2658 VBuilder.setEVL(EVL).setMask(Mask); 2659 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2660 Instruction::Load, DataTy, Addr, "vp.op.load")); 2661 } 2662 NewLI->addParamAttr( 2663 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2664 State.addMetadata(NewLI, LI); 2665 Instruction *Res = NewLI; 2666 if (isReverse()) 2667 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2668 State.set(this, Res); 2669 } 2670 2671 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, 2672 VPCostContext &Ctx) const { 2673 if (!Consecutive || IsMasked) 2674 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2675 2676 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2677 // here because the EVL recipes using EVL to replace the tail mask. But in the 2678 // legacy model, it will always calculate the cost of mask. 2679 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2680 // don't need to compare to the legacy cost model. 
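  // Parity example (assumption for costing only): a consecutive, unmasked
  // vp.load of <vscale x 4 x i32> is still charged as a masked load of that
  // type here, keeping the result comparable with the legacy tail-folded cost.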
2681 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2682 const Align Alignment = 2683 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2684 unsigned AS = 2685 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2686 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2687 Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); 2688 if (!Reverse) 2689 return Cost; 2690 2691 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2692 cast<VectorType>(Ty), {}, Ctx.CostKind, 2693 0); 2694 } 2695 2696 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2697 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2698 VPSlotTracker &SlotTracker) const { 2699 O << Indent << "WIDEN "; 2700 printAsOperand(O, SlotTracker); 2701 O << " = vp.load "; 2702 printOperands(O, SlotTracker); 2703 } 2704 #endif 2705 2706 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2707 auto *SI = cast<StoreInst>(&Ingredient); 2708 2709 VPValue *StoredVPValue = getStoredValue(); 2710 bool CreateScatter = !isConsecutive(); 2711 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2712 2713 auto &Builder = State.Builder; 2714 State.setDebugLocFrom(getDebugLoc()); 2715 2716 Value *Mask = nullptr; 2717 if (auto *VPMask = getMask()) { 2718 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2719 // of a null all-one mask is a null mask. 2720 Mask = State.get(VPMask); 2721 if (isReverse()) 2722 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2723 } 2724 2725 Value *StoredVal = State.get(StoredVPValue); 2726 if (isReverse()) { 2727 // If we store to reverse consecutive memory locations, then we need 2728 // to reverse the order of elements in the stored value. 2729 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2730 // We don't want to update the value in the map as it might be used in 2731 // another expression. So don't call resetVectorValue(StoredVal). 
2732 } 2733 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); 2734 Instruction *NewSI = nullptr; 2735 if (CreateScatter) 2736 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2737 else if (Mask) 2738 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2739 else 2740 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2741 State.addMetadata(NewSI, SI); 2742 } 2743 2744 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2745 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2746 VPSlotTracker &SlotTracker) const { 2747 O << Indent << "WIDEN store "; 2748 printOperands(O, SlotTracker); 2749 } 2750 #endif 2751 2752 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2753 auto *SI = cast<StoreInst>(&Ingredient); 2754 2755 VPValue *StoredValue = getStoredValue(); 2756 bool CreateScatter = !isConsecutive(); 2757 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2758 2759 auto &Builder = State.Builder; 2760 State.setDebugLocFrom(getDebugLoc()); 2761 2762 CallInst *NewSI = nullptr; 2763 Value *StoredVal = State.get(StoredValue); 2764 Value *EVL = State.get(getEVL(), VPLane(0)); 2765 if (isReverse()) 2766 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2767 Value *Mask = nullptr; 2768 if (VPValue *VPMask = getMask()) { 2769 Mask = State.get(VPMask); 2770 if (isReverse()) 2771 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2772 } else { 2773 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2774 } 2775 Value *Addr = State.get(getAddr(), !CreateScatter); 2776 if (CreateScatter) { 2777 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2778 Intrinsic::vp_scatter, 2779 {StoredVal, Addr, Mask, EVL}); 2780 } else { 2781 VectorBuilder VBuilder(Builder); 2782 VBuilder.setEVL(EVL).setMask(Mask); 2783 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2784 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2785 {StoredVal, Addr})); 2786 } 2787 NewSI->addParamAttr( 2788 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2789 State.addMetadata(NewSI, SI); 2790 } 2791 2792 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, 2793 VPCostContext &Ctx) const { 2794 if (!Consecutive || IsMasked) 2795 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2796 2797 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2798 // here because the EVL recipes using EVL to replace the tail mask. But in the 2799 // legacy model, it will always calculate the cost of mask. 2800 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2801 // don't need to compare to the legacy cost model. 
2802 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 2803 const Align Alignment = 2804 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2805 unsigned AS = 2806 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2807 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2808 Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); 2809 if (!Reverse) 2810 return Cost; 2811 2812 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2813 cast<VectorType>(Ty), {}, Ctx.CostKind, 2814 0); 2815 } 2816 2817 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2818 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2819 VPSlotTracker &SlotTracker) const { 2820 O << Indent << "WIDEN vp.store "; 2821 printOperands(O, SlotTracker); 2822 } 2823 #endif 2824 2825 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2826 VectorType *DstVTy, const DataLayout &DL) { 2827 // Verify that V is a vector type with same number of elements as DstVTy. 2828 auto VF = DstVTy->getElementCount(); 2829 auto *SrcVecTy = cast<VectorType>(V->getType()); 2830 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2831 Type *SrcElemTy = SrcVecTy->getElementType(); 2832 Type *DstElemTy = DstVTy->getElementType(); 2833 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2834 "Vector elements must have same size"); 2835 2836 // Do a direct cast if element types are castable. 2837 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2838 return Builder.CreateBitOrPointerCast(V, DstVTy); 2839 } 2840 // V cannot be directly casted to desired vector type. 2841 // May happen when V is a floating point vector but DstVTy is a vector of 2842 // pointers or vice-versa. Handle this using a two-step bitcast using an 2843 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2844 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2845 "Only one type should be a pointer type"); 2846 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2847 "Only one type should be a floating point type"); 2848 Type *IntTy = 2849 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2850 auto *VecIntTy = VectorType::get(IntTy, VF); 2851 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2852 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2853 } 2854 2855 /// Return a vector containing interleaved elements from multiple 2856 /// smaller input vectors. 2857 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2858 const Twine &Name) { 2859 unsigned Factor = Vals.size(); 2860 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2861 2862 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2863 #ifndef NDEBUG 2864 for (Value *Val : Vals) 2865 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2866 #endif 2867 2868 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2869 // must use intrinsics to interleave. 2870 if (VecTy->isScalableTy()) { 2871 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2872 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2873 Vals, 2874 /*FMFSource=*/nullptr, Name); 2875 } 2876 2877 // Fixed length. Start by concatenating all vectors into a wide vector. 
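  // Editor's example: with Factor = 2 and 4-element inputs A = <a0,a1,a2,a3>
  // and B = <b0,b1,b2,b3>, WideVec is their 8-element concatenation and the
  // interleave mask <0,4,1,5,2,6,3,7> below yields <a0,b0,a1,b1,a2,b2,a3,b3>.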
2878 Value *WideVec = concatenateVectors(Builder, Vals); 2879 2880 // Interleave the elements into the wide vector. 2881 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2882 return Builder.CreateShuffleVector( 2883 WideVec, createInterleaveMask(NumElts, Factor), Name); 2884 } 2885 2886 // Try to vectorize the interleave group that \p Instr belongs to. 2887 // 2888 // E.g. Translate following interleaved load group (factor = 3): 2889 // for (i = 0; i < N; i+=3) { 2890 // R = Pic[i]; // Member of index 0 2891 // G = Pic[i+1]; // Member of index 1 2892 // B = Pic[i+2]; // Member of index 2 2893 // ... // do something to R, G, B 2894 // } 2895 // To: 2896 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2897 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2898 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2899 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2900 // 2901 // Or translate following interleaved store group (factor = 3): 2902 // for (i = 0; i < N; i+=3) { 2903 // ... do something to R, G, B 2904 // Pic[i] = R; // Member of index 0 2905 // Pic[i+1] = G; // Member of index 1 2906 // Pic[i+2] = B; // Member of index 2 2907 // } 2908 // To: 2909 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2910 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2911 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2912 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2913 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2914 void VPInterleaveRecipe::execute(VPTransformState &State) { 2915 assert(!State.Lane && "Interleave group being replicated."); 2916 const InterleaveGroup<Instruction> *Group = IG; 2917 Instruction *Instr = Group->getInsertPos(); 2918 2919 // Prepare for the vector type of the interleaved load/store. 2920 Type *ScalarTy = getLoadStoreType(Instr); 2921 unsigned InterleaveFactor = Group->getFactor(); 2922 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2923 2924 // TODO: extend the masked interleaved-group support to reversed access. 2925 VPValue *BlockInMask = getMask(); 2926 assert((!BlockInMask || !Group->isReverse()) && 2927 "Reversed masked interleave-group not supported."); 2928 2929 VPValue *Addr = getAddr(); 2930 Value *ResAddr = State.get(Addr, VPLane(0)); 2931 if (auto *I = dyn_cast<Instruction>(ResAddr)) 2932 State.setDebugLocFrom(I->getDebugLoc()); 2933 2934 // If the group is reverse, adjust the index to refer to the last vector lane 2935 // instead of the first. We adjust the index from the first vector lane, 2936 // rather than directly getting the pointer for lane VF - 1, because the 2937 // pointer operand of the interleaved access is supposed to be uniform. 
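  // Worked example (editor's sketch): for a reverse group with factor 3 at a
  // fixed VF of 4, the code below offsets the pointer by -(4 - 1) * 3 = -9
  // scalar elements so the widened accesses still cover the intended window.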
2938 if (Group->isReverse()) { 2939 Value *RuntimeVF = 2940 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2941 Value *Index = 2942 State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2943 Index = State.Builder.CreateMul(Index, 2944 State.Builder.getInt32(Group->getFactor())); 2945 Index = State.Builder.CreateNeg(Index); 2946 2947 bool InBounds = false; 2948 if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) 2949 InBounds = Gep->isInBounds(); 2950 ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); 2951 } 2952 2953 State.setDebugLocFrom(Instr->getDebugLoc()); 2954 Value *PoisonVec = PoisonValue::get(VecTy); 2955 2956 auto CreateGroupMask = [&BlockInMask, &State, 2957 &InterleaveFactor](Value *MaskForGaps) -> Value * { 2958 if (State.VF.isScalable()) { 2959 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2960 assert(InterleaveFactor == 2 && 2961 "Unsupported deinterleave factor for scalable vectors"); 2962 auto *ResBlockInMask = State.get(BlockInMask); 2963 SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; 2964 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2965 State.VF.getKnownMinValue() * 2, true); 2966 return State.Builder.CreateIntrinsic( 2967 MaskTy, Intrinsic::vector_interleave2, Ops, 2968 /*FMFSource=*/nullptr, "interleaved.mask"); 2969 } 2970 2971 if (!BlockInMask) 2972 return MaskForGaps; 2973 2974 Value *ResBlockInMask = State.get(BlockInMask); 2975 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2976 ResBlockInMask, 2977 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2978 "interleaved.mask"); 2979 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2980 ShuffledMask, MaskForGaps) 2981 : ShuffledMask; 2982 }; 2983 2984 const DataLayout &DL = Instr->getDataLayout(); 2985 // Vectorize the interleaved load group. 2986 if (isa<LoadInst>(Instr)) { 2987 Value *MaskForGaps = nullptr; 2988 if (NeedsMaskForGaps) { 2989 MaskForGaps = createBitMaskForGaps(State.Builder, 2990 State.VF.getKnownMinValue(), *Group); 2991 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2992 } 2993 2994 Instruction *NewLoad; 2995 if (BlockInMask || MaskForGaps) { 2996 Value *GroupMask = CreateGroupMask(MaskForGaps); 2997 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 2998 Group->getAlign(), GroupMask, 2999 PoisonVec, "wide.masked.vec"); 3000 } else 3001 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 3002 Group->getAlign(), "wide.vec"); 3003 Group->addMetadata(NewLoad); 3004 3005 ArrayRef<VPValue *> VPDefs = definedValues(); 3006 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 3007 if (VecTy->isScalableTy()) { 3008 assert(InterleaveFactor == 2 && 3009 "Unsupported deinterleave factor for scalable vectors"); 3010 3011 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 3012 // so must use intrinsics to deinterleave. 3013 Value *DI = State.Builder.CreateIntrinsic( 3014 Intrinsic::vector_deinterleave2, VecTy, NewLoad, 3015 /*FMFSource=*/nullptr, "strided.vec"); 3016 unsigned J = 0; 3017 for (unsigned I = 0; I < InterleaveFactor; ++I) { 3018 Instruction *Member = Group->getMember(I); 3019 3020 if (!Member) 3021 continue; 3022 3023 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 3024 // If this member has different type, cast the result type. 
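// As an illustration (hypothetical group): the insert position might load
// i64 while another member loads double of the same width; the
// deinterleaved value for that member is then converted lane-wise by
// createBitOrPointerCast above.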
3025 if (Member->getType() != ScalarTy) { 3026 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3027 StridedVec = 3028 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3029 } 3030 3031 if (Group->isReverse()) 3032 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3033 3034 State.set(VPDefs[J], StridedVec); 3035 ++J; 3036 } 3037 3038 return; 3039 } 3040 3041 // For each member in the group, shuffle out the appropriate data from the 3042 // wide loads. 3043 unsigned J = 0; 3044 for (unsigned I = 0; I < InterleaveFactor; ++I) { 3045 Instruction *Member = Group->getMember(I); 3046 3047 // Skip the gaps in the group. 3048 if (!Member) 3049 continue; 3050 3051 auto StrideMask = 3052 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 3053 Value *StridedVec = 3054 State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); 3055 3056 // If this member has different type, cast the result type. 3057 if (Member->getType() != ScalarTy) { 3058 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 3059 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3060 StridedVec = 3061 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3062 } 3063 3064 if (Group->isReverse()) 3065 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3066 3067 State.set(VPDefs[J], StridedVec); 3068 ++J; 3069 } 3070 return; 3071 } 3072 3073 // The sub vector type for current instruction. 3074 auto *SubVT = VectorType::get(ScalarTy, State.VF); 3075 3076 // Vectorize the interleaved store group. 3077 Value *MaskForGaps = 3078 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 3079 assert((!MaskForGaps || !State.VF.isScalable()) && 3080 "masking gaps for scalable vectors is not yet supported."); 3081 ArrayRef<VPValue *> StoredValues = getStoredValues(); 3082 // Collect the stored vector from each member. 3083 SmallVector<Value *, 4> StoredVecs; 3084 unsigned StoredIdx = 0; 3085 for (unsigned i = 0; i < InterleaveFactor; i++) { 3086 assert((Group->getMember(i) || MaskForGaps) && 3087 "Fail to get a member from an interleaved store group"); 3088 Instruction *Member = Group->getMember(i); 3089 3090 // Skip the gaps in the group. 3091 if (!Member) { 3092 Value *Undef = PoisonValue::get(SubVT); 3093 StoredVecs.push_back(Undef); 3094 continue; 3095 } 3096 3097 Value *StoredVec = State.get(StoredValues[StoredIdx]); 3098 ++StoredIdx; 3099 3100 if (Group->isReverse()) 3101 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 3102 3103 // If this member has different type, cast it to a unified type. 3104 3105 if (StoredVec->getType() != SubVT) 3106 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 3107 3108 StoredVecs.push_back(StoredVec); 3109 } 3110 3111 // Interleave all the smaller vectors into one wider vector. 
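// As an illustration (not emitted verbatim): for a fixed VF = 4 and
// factor = 2, stored vectors <a0 a1 a2 a3> and <b0 b1 b2 b3> become
// <a0 b0 a1 b1 a2 b2 a3 b3>, which is written by the single (possibly
// masked) wide store below.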
3112 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3113 Instruction *NewStoreInstr;
3114 if (BlockInMask || MaskForGaps) {
3115 Value *GroupMask = CreateGroupMask(MaskForGaps);
3116 NewStoreInstr = State.Builder.CreateMaskedStore(
3117 IVec, ResAddr, Group->getAlign(), GroupMask);
3118 } else
3119 NewStoreInstr =
3120 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3121
3122 Group->addMetadata(NewStoreInstr);
3123 }
3124
3125 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3126 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3127 VPSlotTracker &SlotTracker) const {
3128 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3129 IG->getInsertPos()->printAsOperand(O, false);
3130 O << ", ";
3131 getAddr()->printAsOperand(O, SlotTracker);
3132 VPValue *Mask = getMask();
3133 if (Mask) {
3134 O << ", ";
3135 Mask->printAsOperand(O, SlotTracker);
3136 }
3137
3138 unsigned OpIdx = 0;
3139 for (unsigned i = 0; i < IG->getFactor(); ++i) {
3140 if (!IG->getMember(i))
3141 continue;
3142 if (getNumStoreOperands() > 0) {
3143 O << "\n" << Indent << " store ";
3144 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3145 O << " to index " << i;
3146 } else {
3147 O << "\n" << Indent << " ";
3148 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3149 O << " = load from index " << i;
3150 }
3151 ++OpIdx;
3152 }
3153 }
3154 #endif
3155
3156 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3157 VPCostContext &Ctx) const {
3158 Instruction *InsertPos = getInsertPos();
3159 // Find the VPValue index of the interleave group. We need to skip gaps.
3160 unsigned InsertPosIdx = 0;
3161 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3162 if (auto *Member = IG->getMember(Idx)) {
3163 if (Member == InsertPos)
3164 break;
3165 InsertPosIdx++;
3166 }
3167 Type *ValTy = Ctx.Types.inferScalarType(
3168 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3169 : getStoredValues()[InsertPosIdx]);
3170 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3171 unsigned AS = getLoadStoreAddressSpace(InsertPos);
3172
3173 unsigned InterleaveFactor = IG->getFactor();
3174 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3175
3176 // Holds the indices of existing members in the interleaved group.
3177 SmallVector<unsigned, 4> Indices;
3178 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3179 if (IG->getMember(IF))
3180 Indices.push_back(IF);
3181
3182 // Calculate the cost of the whole interleaved group.
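// Sketch of the computation below: the group cost is the target's
// interleaved-memory-op cost for the <VF * Factor x ValTy> access; if the
// group is reverse, each present member additionally pays one SK_Reverse
// shuffle.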
3183 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( 3184 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, 3185 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); 3186 3187 if (!IG->isReverse()) 3188 return Cost; 3189 3190 return Cost + IG->getNumMembers() * 3191 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 3192 VectorTy, std::nullopt, Ctx.CostKind, 3193 0); 3194 } 3195 3196 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3197 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3198 VPSlotTracker &SlotTracker) const { 3199 O << Indent << "EMIT "; 3200 printAsOperand(O, SlotTracker); 3201 O << " = CANONICAL-INDUCTION "; 3202 printOperands(O, SlotTracker); 3203 } 3204 #endif 3205 3206 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 3207 return IsScalarAfterVectorization && 3208 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 3209 } 3210 3211 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 3212 assert(getInductionDescriptor().getKind() == 3213 InductionDescriptor::IK_PtrInduction && 3214 "Not a pointer induction according to InductionDescriptor!"); 3215 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 3216 "Unexpected type."); 3217 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 3218 "Recipe should have been replaced"); 3219 3220 unsigned CurrentPart = getUnrollPart(*this); 3221 3222 // Build a pointer phi 3223 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 3224 Type *ScStValueType = ScalarStartValue->getType(); 3225 3226 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3227 PHINode *NewPointerPhi = nullptr; 3228 if (CurrentPart == 0) { 3229 auto *IVR = cast<VPHeaderPHIRecipe>(&getParent() 3230 ->getPlan() 3231 ->getVectorLoopRegion() 3232 ->getEntryBasicBlock() 3233 ->front()); 3234 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true)); 3235 NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 3236 CanonicalIV->getIterator()); 3237 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 3238 NewPointerPhi->setDebugLoc(getDebugLoc()); 3239 } else { 3240 // The recipe has been unrolled. In that case, fetch the single pointer phi 3241 // shared among all unrolled parts of the recipe. 3242 auto *GEP = 3243 cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); 3244 NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); 3245 } 3246 3247 // A pointer induction, performed by using a gep 3248 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 3249 Value *ScalarStepValue = State.get(getStepValue(), VPLane(0)); 3250 Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue()); 3251 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 3252 // Add induction update using an incorrect block temporarily. The phi node 3253 // will be fixed after VPlan execution. Note that at this point the latch 3254 // block cannot be used, as it does not exist yet. 3255 // TODO: Model increment value in VPlan, by turning the recipe into a 3256 // multi-def and a subclass of VPHeaderPHIRecipe. 3257 if (CurrentPart == 0) { 3258 // The recipe represents the first part of the pointer induction. Create the 3259 // GEP to increment the phi across all unrolled parts. 3260 unsigned UF = CurrentPart == 0 ? 
getParent()->getPlan()->getUF() : 1; 3261 Value *NumUnrolledElems = 3262 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF)); 3263 3264 Value *InductionGEP = GetElementPtrInst::Create( 3265 State.Builder.getInt8Ty(), NewPointerPhi, 3266 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 3267 InductionLoc); 3268 3269 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 3270 } 3271 3272 // Create actual address geps that use the pointer phi as base and a 3273 // vectorized version of the step value (<step*0, ..., step*N>) as offset. 3274 Type *VecPhiType = VectorType::get(PhiType, State.VF); 3275 Value *StartOffsetScalar = State.Builder.CreateMul( 3276 RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); 3277 Value *StartOffset = 3278 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 3279 // Create a vector of consecutive numbers from zero to VF. 3280 StartOffset = State.Builder.CreateAdd( 3281 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 3282 3283 assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && 3284 "scalar step must be the same across all parts"); 3285 Value *GEP = State.Builder.CreateGEP( 3286 State.Builder.getInt8Ty(), NewPointerPhi, 3287 State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( 3288 State.VF, ScalarStepValue)), 3289 "vector.gep"); 3290 State.set(this, GEP); 3291 } 3292 3293 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3294 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 3295 VPSlotTracker &SlotTracker) const { 3296 assert((getNumOperands() == 2 || getNumOperands() == 4) && 3297 "unexpected number of operands"); 3298 O << Indent << "EMIT "; 3299 printAsOperand(O, SlotTracker); 3300 O << " = WIDEN-POINTER-INDUCTION "; 3301 getStartValue()->printAsOperand(O, SlotTracker); 3302 O << ", "; 3303 getStepValue()->printAsOperand(O, SlotTracker); 3304 if (getNumOperands() == 4) { 3305 O << ", "; 3306 getOperand(2)->printAsOperand(O, SlotTracker); 3307 O << ", "; 3308 getOperand(3)->printAsOperand(O, SlotTracker); 3309 } 3310 } 3311 #endif 3312 3313 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 3314 assert(!State.Lane && "cannot be used in per-lane"); 3315 if (State.ExpandedSCEVs.contains(Expr)) { 3316 // SCEV Expr has already been expanded, result must already be set. At the 3317 // moment we have to execute the entry block twice (once before skeleton 3318 // creation to get expanded SCEVs used by the skeleton and once during 3319 // regular VPlan execution). 
3320 State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]); 3321 assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] && 3322 "Results must match"); 3323 return; 3324 } 3325 3326 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 3327 SCEVExpander Exp(SE, DL, "induction"); 3328 3329 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 3330 &*State.Builder.GetInsertPoint()); 3331 State.ExpandedSCEVs[Expr] = Res; 3332 State.set(this, Res, VPLane(0)); 3333 } 3334 3335 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3336 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 3337 VPSlotTracker &SlotTracker) const { 3338 O << Indent << "EMIT "; 3339 printAsOperand(O, SlotTracker); 3340 O << " = EXPAND SCEV " << *Expr; 3341 } 3342 #endif 3343 3344 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 3345 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); 3346 Type *STy = CanonicalIV->getType(); 3347 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 3348 ElementCount VF = State.VF; 3349 Value *VStart = VF.isScalar() 3350 ? CanonicalIV 3351 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 3352 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); 3353 if (VF.isVector()) { 3354 VStep = Builder.CreateVectorSplat(VF, VStep); 3355 VStep = 3356 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 3357 } 3358 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 3359 State.set(this, CanonicalVectorIV); 3360 } 3361 3362 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3363 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 3364 VPSlotTracker &SlotTracker) const { 3365 O << Indent << "EMIT "; 3366 printAsOperand(O, SlotTracker); 3367 O << " = WIDEN-CANONICAL-INDUCTION "; 3368 printOperands(O, SlotTracker); 3369 } 3370 #endif 3371 3372 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 3373 auto &Builder = State.Builder; 3374 // Create a vector from the initial value. 3375 auto *VectorInit = getStartValue()->getLiveInIRValue(); 3376 3377 Type *VecTy = State.VF.isScalar() 3378 ? VectorInit->getType() 3379 : VectorType::get(VectorInit->getType(), State.VF); 3380 3381 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3382 if (State.VF.isVector()) { 3383 auto *IdxTy = Builder.getInt32Ty(); 3384 auto *One = ConstantInt::get(IdxTy, 1); 3385 IRBuilder<>::InsertPointGuard Guard(Builder); 3386 Builder.SetInsertPoint(VectorPH->getTerminator()); 3387 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 3388 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3389 VectorInit = Builder.CreateInsertElement( 3390 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 3391 } 3392 3393 // Create a phi node for the new recurrence. 
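// For example (illustrative only, with T standing for the scalar type of
// the initial value and a fixed VF = 4), the IR built here resembles:
//   %vector.recur.init = insertelement <4 x T> poison, T %init, i32 3
//   %vector.recur = phi <4 x T> [ %vector.recur.init, %vector.ph ], ...
// i.e. the start value sits in the last lane, where the recurrence splice
// expects to find the value of the "previous" iteration.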
3394 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur"); 3395 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 3396 Phi->addIncoming(VectorInit, VectorPH); 3397 State.set(this, Phi); 3398 } 3399 3400 InstructionCost 3401 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, 3402 VPCostContext &Ctx) const { 3403 if (VF.isScalar()) 3404 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 3405 3406 if (VF.isScalable() && VF.getKnownMinValue() == 1) 3407 return InstructionCost::getInvalid(); 3408 3409 SmallVector<int> Mask(VF.getKnownMinValue()); 3410 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 3411 Type *VectorTy = 3412 toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); 3413 3414 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 3415 cast<VectorType>(VectorTy), Mask, Ctx.CostKind, 3416 VF.getKnownMinValue() - 1); 3417 } 3418 3419 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3420 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 3421 VPSlotTracker &SlotTracker) const { 3422 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 3423 printAsOperand(O, SlotTracker); 3424 O << " = phi "; 3425 printOperands(O, SlotTracker); 3426 } 3427 #endif 3428 3429 void VPReductionPHIRecipe::execute(VPTransformState &State) { 3430 auto &Builder = State.Builder; 3431 3432 // If this phi is fed by a scaled reduction then it should output a 3433 // vector with fewer elements than the VF. 3434 ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); 3435 3436 // Reductions do not have to start at zero. They can start with 3437 // any loop invariant values. 3438 VPValue *StartVPV = getStartValue(); 3439 Value *StartV = StartVPV->getLiveInIRValue(); 3440 3441 // In order to support recurrences we need to be able to vectorize Phi nodes. 3442 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3443 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3444 // this value when we vectorize all of the instructions that use the PHI. 3445 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 3446 Type *VecTy = 3447 ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); 3448 3449 BasicBlock *HeaderBB = State.CFG.PrevBB; 3450 assert(State.CurrentParentLoop->getHeader() == HeaderBB && 3451 "recipe must be in the vector loop header"); 3452 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); 3453 Phi->insertBefore(HeaderBB->getFirstInsertionPt()); 3454 State.set(this, Phi, IsInLoop); 3455 3456 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3457 3458 Value *Iden = nullptr; 3459 RecurKind RK = RdxDesc.getRecurrenceKind(); 3460 unsigned CurrentPart = getUnrollPart(*this); 3461 3462 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 3463 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 3464 // MinMax and AnyOf reductions have the start value as their identity. 3465 if (ScalarPHI) { 3466 Iden = StartV; 3467 } else { 3468 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 3469 Builder.SetInsertPoint(VectorPH->getTerminator()); 3470 StartV = Iden = State.get(StartVPV); 3471 } 3472 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { 3473 // [I|F]FindLastIV will use a sentinel value to initialize the reduction 3474 // phi or the resume value from the main vector loop when vectorizing the 3475 // epilogue loop. 
In the exit block, ComputeReductionResult will generate 3476 // checks to verify if the reduction result is the sentinel value. If the 3477 // result is the sentinel value, it will be corrected back to the start 3478 // value. 3479 // TODO: The sentinel value is not always necessary. When the start value is 3480 // a constant, and smaller than the start value of the induction variable, 3481 // the start value can be directly used to initialize the reduction phi. 3482 Iden = StartV; 3483 if (!ScalarPHI) { 3484 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 3485 Builder.SetInsertPoint(VectorPH->getTerminator()); 3486 StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden); 3487 } 3488 } else { 3489 Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), 3490 RdxDesc.getFastMathFlags()); 3491 3492 if (!ScalarPHI) { 3493 if (CurrentPart == 0) { 3494 // Create start and identity vector values for the reduction in the 3495 // preheader. 3496 // TODO: Introduce recipes in VPlan preheader to create initial values. 3497 Iden = Builder.CreateVectorSplat(VF, Iden); 3498 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 3499 Builder.SetInsertPoint(VectorPH->getTerminator()); 3500 Constant *Zero = Builder.getInt32(0); 3501 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 3502 } else { 3503 Iden = Builder.CreateVectorSplat(VF, Iden); 3504 } 3505 } 3506 } 3507 3508 Phi = cast<PHINode>(State.get(this, IsInLoop)); 3509 Value *StartVal = (CurrentPart == 0) ? StartV : Iden; 3510 Phi->addIncoming(StartVal, VectorPH); 3511 } 3512 3513 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3514 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3515 VPSlotTracker &SlotTracker) const { 3516 O << Indent << "WIDEN-REDUCTION-PHI "; 3517 3518 printAsOperand(O, SlotTracker); 3519 O << " = phi "; 3520 printOperands(O, SlotTracker); 3521 if (VFScaleFactor != 1) 3522 O << " (VF scaled by 1/" << VFScaleFactor << ")"; 3523 } 3524 #endif 3525 3526 void VPWidenPHIRecipe::execute(VPTransformState &State) { 3527 assert(EnableVPlanNativePath && 3528 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 3529 3530 State.setDebugLocFrom(getDebugLoc()); 3531 Value *Op0 = State.get(getOperand(0)); 3532 Type *VecTy = Op0->getType(); 3533 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); 3534 State.set(this, VecPhi); 3535 } 3536 3537 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3538 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3539 VPSlotTracker &SlotTracker) const { 3540 O << Indent << "WIDEN-PHI "; 3541 3542 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); 3543 // Unless all incoming values are modeled in VPlan print the original PHI 3544 // directly. 3545 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming 3546 // values as VPValues. 3547 if (getNumOperands() != OriginalPhi->getNumOperands()) { 3548 O << VPlanIngredient(OriginalPhi); 3549 return; 3550 } 3551 3552 printAsOperand(O, SlotTracker); 3553 O << " = phi "; 3554 printOperands(O, SlotTracker); 3555 } 3556 #endif 3557 3558 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and 3559 // remove VPActiveLaneMaskPHIRecipe. 
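// Illustrative sketch of the emitted phi (names and element count are
// examples only):
//   %active.lane.mask = phi <vscale x 4 x i1>
//       [ %active.lane.mask.entry, %vector.ph ], ...
// The incoming value for the loop backedge is filled in later, once the
// vector loop body has been generated.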
3560 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { 3561 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3562 Value *StartMask = State.get(getOperand(0)); 3563 PHINode *Phi = 3564 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); 3565 Phi->addIncoming(StartMask, VectorPH); 3566 Phi->setDebugLoc(getDebugLoc()); 3567 State.set(this, Phi); 3568 } 3569 3570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3571 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3572 VPSlotTracker &SlotTracker) const { 3573 O << Indent << "ACTIVE-LANE-MASK-PHI "; 3574 3575 printAsOperand(O, SlotTracker); 3576 O << " = phi "; 3577 printOperands(O, SlotTracker); 3578 } 3579 #endif 3580 3581 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3582 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3583 VPSlotTracker &SlotTracker) const { 3584 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; 3585 3586 printAsOperand(O, SlotTracker); 3587 O << " = phi "; 3588 printOperands(O, SlotTracker); 3589 } 3590 #endif 3591 3592 void VPScalarPHIRecipe::execute(VPTransformState &State) { 3593 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3594 Value *Start = State.get(getStartValue(), VPLane(0)); 3595 PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name); 3596 Phi->addIncoming(Start, VectorPH); 3597 Phi->setDebugLoc(getDebugLoc()); 3598 State.set(this, Phi, /*IsScalar=*/true); 3599 } 3600 3601 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3602 void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3603 VPSlotTracker &SlotTracker) const { 3604 O << Indent << "SCALAR-PHI "; 3605 printAsOperand(O, SlotTracker); 3606 O << " = phi "; 3607 printOperands(O, SlotTracker); 3608 } 3609 #endif 3610