1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "VPlan.h" 15 #include "VPlanAnalysis.h" 16 #include "VPlanUtils.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/Twine.h" 20 #include "llvm/Analysis/IVDescriptors.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/IRBuilder.h" 23 #include "llvm/IR/Instruction.h" 24 #include "llvm/IR/Instructions.h" 25 #include "llvm/IR/Type.h" 26 #include "llvm/IR/Value.h" 27 #include "llvm/IR/VectorBuilder.h" 28 #include "llvm/Support/Casting.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/Debug.h" 31 #include "llvm/Support/raw_ostream.h" 32 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 33 #include "llvm/Transforms/Utils/LoopUtils.h" 34 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 35 #include <cassert> 36 37 using namespace llvm; 38 39 using VectorParts = SmallVector<Value *, 2>; 40 41 namespace llvm { 42 extern cl::opt<bool> EnableVPlanNativePath; 43 } 44 extern cl::opt<unsigned> ForceTargetInstructionCost; 45 46 #define LV_NAME "loop-vectorize" 47 #define DEBUG_TYPE LV_NAME 48 49 bool VPRecipeBase::mayWriteToMemory() const { 50 switch (getVPDefID()) { 51 case VPInterleaveSC: 52 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 53 case VPWidenStoreEVLSC: 54 case VPWidenStoreSC: 55 return true; 56 case VPReplicateSC: 57 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 58 ->mayWriteToMemory(); 59 case VPWidenCallSC: 60 return !cast<VPWidenCallRecipe>(this) 61 ->getCalledScalarFunction() 62 ->onlyReadsMemory(); 63 case VPBranchOnMaskSC: 64 case VPScalarIVStepsSC: 65 case VPPredInstPHISC: 66 return false; 67 case VPBlendSC: 68 case VPReductionEVLSC: 69 case VPReductionSC: 70 case VPWidenCanonicalIVSC: 71 case VPWidenCastSC: 72 case VPWidenGEPSC: 73 case VPWidenIntOrFpInductionSC: 74 case VPWidenLoadEVLSC: 75 case VPWidenLoadSC: 76 case VPWidenPHISC: 77 case VPWidenSC: 78 case VPWidenEVLSC: 79 case VPWidenSelectSC: { 80 const Instruction *I = 81 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 82 (void)I; 83 assert((!I || !I->mayWriteToMemory()) && 84 "underlying instruction may write to memory"); 85 return false; 86 } 87 default: 88 return true; 89 } 90 } 91 92 bool VPRecipeBase::mayReadFromMemory() const { 93 switch (getVPDefID()) { 94 case VPWidenLoadEVLSC: 95 case VPWidenLoadSC: 96 return true; 97 case VPReplicateSC: 98 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 99 ->mayReadFromMemory(); 100 case VPWidenCallSC: 101 return !cast<VPWidenCallRecipe>(this) 102 ->getCalledScalarFunction() 103 ->onlyWritesMemory(); 104 case VPBranchOnMaskSC: 105 case VPPredInstPHISC: 106 case VPScalarIVStepsSC: 107 case VPWidenStoreEVLSC: 108 case VPWidenStoreSC: 109 return false; 110 case VPBlendSC: 111 case VPReductionEVLSC: 112 case VPReductionSC: 113 case VPWidenCanonicalIVSC: 114 case VPWidenCastSC: 115 case VPWidenGEPSC: 116 case VPWidenIntOrFpInductionSC: 117 case VPWidenPHISC: 
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live out phi's in the exiting block
  // via middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ?
MiddleVPBB 210 : ExitingVPBB; 211 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; 212 Value *V = State.get(ExitValue, VPIteration(0, 0)); 213 if (Phi->getBasicBlockIndex(PredBB) != -1) 214 Phi->setIncomingValueForBlock(PredBB, V); 215 else 216 Phi->addIncoming(V, PredBB); 217 } 218 219 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 220 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { 221 O << "Live-out "; 222 getPhi()->printAsOperand(O); 223 O << " = "; 224 getOperand(0)->printAsOperand(O, SlotTracker); 225 O << "\n"; 226 } 227 #endif 228 229 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 230 assert(!Parent && "Recipe already in some VPBasicBlock"); 231 assert(InsertPos->getParent() && 232 "Insertion position not in any VPBasicBlock"); 233 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 234 } 235 236 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 237 iplist<VPRecipeBase>::iterator I) { 238 assert(!Parent && "Recipe already in some VPBasicBlock"); 239 assert(I == BB.end() || I->getParent() == &BB); 240 BB.insert(this, I); 241 } 242 243 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 244 assert(!Parent && "Recipe already in some VPBasicBlock"); 245 assert(InsertPos->getParent() && 246 "Insertion position not in any VPBasicBlock"); 247 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 248 } 249 250 void VPRecipeBase::removeFromParent() { 251 assert(getParent() && "Recipe not in any VPBasicBlock"); 252 getParent()->getRecipeList().remove(getIterator()); 253 Parent = nullptr; 254 } 255 256 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 257 assert(getParent() && "Recipe not in any VPBasicBlock"); 258 return getParent()->getRecipeList().erase(getIterator()); 259 } 260 261 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 262 removeFromParent(); 263 insertAfter(InsertPos); 264 } 265 266 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 267 iplist<VPRecipeBase>::iterator I) { 268 removeFromParent(); 269 insertBefore(BB, I); 270 } 271 272 /// Return the underlying instruction to be used for computing \p R's cost via 273 /// the legacy cost model. Return nullptr if there's no suitable instruction. 274 static Instruction *getInstructionForCost(const VPRecipeBase *R) { 275 if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) 276 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 277 if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) 278 return IG->getInsertPos(); 279 // Currently the legacy cost model only calculates the instruction cost with 280 // underlying instruction. Removing the WidenMem here will prevent 281 // force-target-instruction-cost overwriting the cost of recipe with 282 // underlying instruction which is inconsistent with the legacy model. 283 // TODO: Remove WidenMem from this function when we don't need to compare to 284 // the legacy model. 
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model using
  // the underlying instruction. If there is no underlying instruction, returns
  // 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform, avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
351 assert(isFPMathOp() && "this op can't take fast-math flags"); 352 } 353 354 bool VPInstruction::doesGeneratePerAllLanes() const { 355 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 356 } 357 358 bool VPInstruction::canGenerateScalarForFirstLane() const { 359 if (Instruction::isBinaryOp(getOpcode())) 360 return true; 361 if (isSingleScalar() || isVectorToScalar()) 362 return true; 363 switch (Opcode) { 364 case Instruction::ICmp: 365 case VPInstruction::BranchOnCond: 366 case VPInstruction::BranchOnCount: 367 case VPInstruction::CalculateTripCountMinusVF: 368 case VPInstruction::CanonicalIVIncrementForPart: 369 case VPInstruction::PtrAdd: 370 case VPInstruction::ExplicitVectorLength: 371 return true; 372 default: 373 return false; 374 } 375 } 376 377 Value *VPInstruction::generatePerLane(VPTransformState &State, 378 const VPIteration &Lane) { 379 IRBuilderBase &Builder = State.Builder; 380 381 assert(getOpcode() == VPInstruction::PtrAdd && 382 "only PtrAdd opcodes are supported for now"); 383 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 384 State.get(getOperand(1), Lane), Name); 385 } 386 387 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { 388 IRBuilderBase &Builder = State.Builder; 389 390 if (Instruction::isBinaryOp(getOpcode())) { 391 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 392 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); 393 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); 394 auto *Res = 395 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 396 if (auto *I = dyn_cast<Instruction>(Res)) 397 setFlags(I); 398 return Res; 399 } 400 401 switch (getOpcode()) { 402 case VPInstruction::Not: { 403 Value *A = State.get(getOperand(0), Part); 404 return Builder.CreateNot(A, Name); 405 } 406 case Instruction::ICmp: { 407 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 408 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); 409 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); 410 return Builder.CreateCmp(getPredicate(), A, B, Name); 411 } 412 case Instruction::Select: { 413 Value *Cond = State.get(getOperand(0), Part); 414 Value *Op1 = State.get(getOperand(1), Part); 415 Value *Op2 = State.get(getOperand(2), Part); 416 return Builder.CreateSelect(Cond, Op1, Op2, Name); 417 } 418 case VPInstruction::ActiveLaneMask: { 419 // Get first lane of vector induction variable. 420 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); 421 // Get the original loop tripcount. 422 Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); 423 424 // If this part of the active lane mask is scalar, generate the CMP directly 425 // to avoid unnecessary extracts. 426 if (State.VF.isScalar()) 427 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 428 Name); 429 430 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 431 auto *PredTy = VectorType::get(Int1Ty, State.VF); 432 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 433 {PredTy, ScalarTC->getType()}, 434 {VIVElem0, ScalarTC}, nullptr, Name); 435 } 436 case VPInstruction::FirstOrderRecurrenceSplice: { 437 // Generate code to combine the previous and current values in vector v3. 
438 // 439 // vector.ph: 440 // v_init = vector(..., ..., ..., a[-1]) 441 // br vector.body 442 // 443 // vector.body 444 // i = phi [0, vector.ph], [i+4, vector.body] 445 // v1 = phi [v_init, vector.ph], [v2, vector.body] 446 // v2 = a[i, i+1, i+2, i+3]; 447 // v3 = vector(v1(3), v2(0, 1, 2)) 448 449 // For the first part, use the recurrence phi (v1), otherwise v2. 450 auto *V1 = State.get(getOperand(0), 0); 451 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); 452 if (!PartMinus1->getType()->isVectorTy()) 453 return PartMinus1; 454 Value *V2 = State.get(getOperand(1), Part); 455 return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); 456 } 457 case VPInstruction::CalculateTripCountMinusVF: { 458 if (Part != 0) 459 return State.get(this, 0, /*IsScalar*/ true); 460 461 Value *ScalarTC = State.get(getOperand(0), {0, 0}); 462 Value *Step = 463 createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); 464 Value *Sub = Builder.CreateSub(ScalarTC, Step); 465 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 466 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 467 return Builder.CreateSelect(Cmp, Sub, Zero); 468 } 469 case VPInstruction::ExplicitVectorLength: { 470 // Compute EVL 471 auto GetEVL = [=](VPTransformState &State, Value *AVL) { 472 assert(AVL->getType()->isIntegerTy() && 473 "Requested vector length should be an integer."); 474 475 // TODO: Add support for MaxSafeDist for correct loop emission. 476 assert(State.VF.isScalable() && "Expected scalable vector factor."); 477 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 478 479 Value *EVL = State.Builder.CreateIntrinsic( 480 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 481 {AVL, VFArg, State.Builder.getTrue()}); 482 return EVL; 483 }; 484 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 485 // be outside of the main loop. 486 assert(Part == 0 && "No unrolling expected for predicated vectorization."); 487 // Compute VTC - IV as the AVL (requested vector length). 488 Value *Index = State.get(getOperand(0), VPIteration(0, 0)); 489 Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); 490 Value *AVL = State.Builder.CreateSub(TripCount, Index); 491 Value *EVL = GetEVL(State, AVL); 492 return EVL; 493 } 494 case VPInstruction::CanonicalIVIncrementForPart: { 495 auto *IV = State.get(getOperand(0), VPIteration(0, 0)); 496 if (Part == 0) 497 return IV; 498 499 // The canonical IV is incremented by the vectorization factor (num of SIMD 500 // elements) times the unroll part. 501 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 502 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 503 hasNoSignedWrap()); 504 } 505 case VPInstruction::BranchOnCond: { 506 if (Part != 0) 507 return nullptr; 508 509 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); 510 // Replace the temporary unreachable terminator with a new conditional 511 // branch, hooking it up to backward destination for exiting blocks now and 512 // to forward destination(s) later when they are created. 
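    // Note that CreateCondBr expects a valid BB as its first argument, so the
    // current insert block is passed as a placeholder and the successor is
    // reset to nullptr immediately below.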
513 BranchInst *CondBr = 514 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 515 CondBr->setSuccessor(0, nullptr); 516 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 517 518 if (!getParent()->isExiting()) 519 return CondBr; 520 521 VPRegionBlock *ParentRegion = getParent()->getParent(); 522 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 523 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 524 return CondBr; 525 } 526 case VPInstruction::BranchOnCount: { 527 if (Part != 0) 528 return nullptr; 529 // First create the compare. 530 Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); 531 Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); 532 Value *Cond = Builder.CreateICmpEQ(IV, TC); 533 534 // Now create the branch. 535 auto *Plan = getParent()->getPlan(); 536 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 537 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 538 539 // Replace the temporary unreachable terminator with a new conditional 540 // branch, hooking it up to backward destination (the header) now and to the 541 // forward destination (the exit/middle block) later when it is created. 542 // Note that CreateCondBr expects a valid BB as first argument, so we need 543 // to set it to nullptr later. 544 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 545 State.CFG.VPBB2IRBB[Header]); 546 CondBr->setSuccessor(0, nullptr); 547 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 548 return CondBr; 549 } 550 case VPInstruction::ComputeReductionResult: { 551 if (Part != 0) 552 return State.get(this, 0, /*IsScalar*/ true); 553 554 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 555 // and will be removed by breaking up the recipe further. 556 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 557 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 558 // Get its reduction variable descriptor. 559 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 560 561 RecurKind RK = RdxDesc.getRecurrenceKind(); 562 563 VPValue *LoopExitingDef = getOperand(1); 564 Type *PhiTy = OrigPhi->getType(); 565 VectorParts RdxParts(State.UF); 566 for (unsigned Part = 0; Part < State.UF; ++Part) 567 RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); 568 569 // If the vector reduction can be performed in a smaller type, we truncate 570 // then extend the loop exit value to enable InstCombine to evaluate the 571 // entire expression in the smaller type. 572 // TODO: Handle this in truncateToMinBW. 573 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 574 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 575 for (unsigned Part = 0; Part < State.UF; ++Part) 576 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 577 } 578 // Reduce all of the unrolled parts into a single vector. 579 Value *ReducedPartRdx = RdxParts[0]; 580 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 581 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 582 Op = Instruction::Or; 583 584 if (PhiR->isOrdered()) { 585 ReducedPartRdx = RdxParts[State.UF - 1]; 586 } else { 587 // Floating-point operations should have some FMF to enable the reduction. 
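      // Combine the per-part values pairwise using the reduction opcode, or a
      // min/max operation for min/max recurrences. E.g. with UF = 2 an integer
      // add reduction emits something like
      //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
      // before the final scalar reduction is created further down.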
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
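      // (With a scalar VF, each unroll part holds a single scalar value.)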
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet.
700 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 701 Opcode == Instruction::FNeg || Opcode == Instruction::FSub || 702 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 703 Opcode == Instruction::FCmp || Opcode == Instruction::Select; 704 } 705 #endif 706 707 void VPInstruction::execute(VPTransformState &State) { 708 assert(!State.Instance && "VPInstruction executing an Instance"); 709 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 710 assert((hasFastMathFlags() == isFPMathOp() || 711 getOpcode() == Instruction::Select) && 712 "Recipe not a FPMathOp but has fast-math flags?"); 713 if (hasFastMathFlags()) 714 State.Builder.setFastMathFlags(getFastMathFlags()); 715 State.setDebugLocFrom(getDebugLoc()); 716 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && 717 (vputils::onlyFirstLaneUsed(this) || 718 isVectorToScalar() || isSingleScalar()); 719 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 720 bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this); 721 for (unsigned Part = 0; Part < State.UF; ++Part) { 722 if (GeneratesPerAllLanes) { 723 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); 724 Lane != NumLanes; ++Lane) { 725 Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane)); 726 assert(GeneratedValue && "generatePerLane must produce a value"); 727 State.set(this, GeneratedValue, VPIteration(Part, Lane)); 728 } 729 continue; 730 } 731 732 if (Part != 0 && OnlyFirstPartUsed && hasResult()) { 733 Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly); 734 State.set(this, Part0, Part, 735 /*IsScalar*/ GeneratesPerFirstLaneOnly); 736 continue; 737 } 738 739 Value *GeneratedValue = generatePerPart(State, Part); 740 if (!hasResult()) 741 continue; 742 assert(GeneratedValue && "generatePerPart must produce a value"); 743 assert((GeneratedValue->getType()->isVectorTy() == 744 !GeneratesPerFirstLaneOnly || 745 State.VF.isScalar()) && 746 "scalar value but not only first lane defined"); 747 State.set(this, GeneratedValue, Part, 748 /*IsScalar*/ GeneratesPerFirstLaneOnly); 749 } 750 } 751 752 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 753 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 754 if (Instruction::isBinaryOp(getOpcode())) 755 return vputils::onlyFirstLaneUsed(this); 756 757 switch (getOpcode()) { 758 default: 759 return false; 760 case Instruction::ICmp: 761 case VPInstruction::PtrAdd: 762 // TODO: Cover additional opcodes. 
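    // For these opcodes an operand's first lane is only demanded when the
    // result itself is only used through its first lane.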
763 return vputils::onlyFirstLaneUsed(this); 764 case VPInstruction::ActiveLaneMask: 765 case VPInstruction::ExplicitVectorLength: 766 case VPInstruction::CalculateTripCountMinusVF: 767 case VPInstruction::CanonicalIVIncrementForPart: 768 case VPInstruction::BranchOnCount: 769 case VPInstruction::BranchOnCond: 770 case VPInstruction::ResumePhi: 771 return true; 772 }; 773 llvm_unreachable("switch should return"); 774 } 775 776 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 777 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 778 if (Instruction::isBinaryOp(getOpcode())) 779 return vputils::onlyFirstPartUsed(this); 780 781 switch (getOpcode()) { 782 default: 783 return false; 784 case Instruction::ICmp: 785 case Instruction::Select: 786 return vputils::onlyFirstPartUsed(this); 787 case VPInstruction::BranchOnCount: 788 case VPInstruction::BranchOnCond: 789 case VPInstruction::CanonicalIVIncrementForPart: 790 return true; 791 }; 792 llvm_unreachable("switch should return"); 793 } 794 795 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 796 void VPInstruction::dump() const { 797 VPSlotTracker SlotTracker(getParent()->getPlan()); 798 print(dbgs(), "", SlotTracker); 799 } 800 801 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 802 VPSlotTracker &SlotTracker) const { 803 O << Indent << "EMIT "; 804 805 if (hasResult()) { 806 printAsOperand(O, SlotTracker); 807 O << " = "; 808 } 809 810 switch (getOpcode()) { 811 case VPInstruction::Not: 812 O << "not"; 813 break; 814 case VPInstruction::SLPLoad: 815 O << "combined load"; 816 break; 817 case VPInstruction::SLPStore: 818 O << "combined store"; 819 break; 820 case VPInstruction::ActiveLaneMask: 821 O << "active lane mask"; 822 break; 823 case VPInstruction::ResumePhi: 824 O << "resume-phi"; 825 break; 826 case VPInstruction::ExplicitVectorLength: 827 O << "EXPLICIT-VECTOR-LENGTH"; 828 break; 829 case VPInstruction::FirstOrderRecurrenceSplice: 830 O << "first-order splice"; 831 break; 832 case VPInstruction::BranchOnCond: 833 O << "branch-on-cond"; 834 break; 835 case VPInstruction::CalculateTripCountMinusVF: 836 O << "TC > VF ? 
TC - VF : 0"; 837 break; 838 case VPInstruction::CanonicalIVIncrementForPart: 839 O << "VF * Part +"; 840 break; 841 case VPInstruction::BranchOnCount: 842 O << "branch-on-count"; 843 break; 844 case VPInstruction::ExtractFromEnd: 845 O << "extract-from-end"; 846 break; 847 case VPInstruction::ComputeReductionResult: 848 O << "compute-reduction-result"; 849 break; 850 case VPInstruction::LogicalAnd: 851 O << "logical-and"; 852 break; 853 case VPInstruction::PtrAdd: 854 O << "ptradd"; 855 break; 856 default: 857 O << Instruction::getOpcodeName(getOpcode()); 858 } 859 860 printFlags(O); 861 printOperands(O, SlotTracker); 862 863 if (auto DL = getDebugLoc()) { 864 O << ", !dbg "; 865 DL.print(O); 866 } 867 } 868 #endif 869 870 void VPWidenCallRecipe::execute(VPTransformState &State) { 871 assert(State.VF.isVector() && "not widening"); 872 Function *CalledScalarFn = getCalledScalarFunction(); 873 assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) && 874 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 875 State.setDebugLocFrom(getDebugLoc()); 876 877 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; 878 FunctionType *VFTy = nullptr; 879 if (Variant) 880 VFTy = Variant->getFunctionType(); 881 for (unsigned Part = 0; Part < State.UF; ++Part) { 882 SmallVector<Type *, 2> TysForDecl; 883 // Add return type if intrinsic is overloaded on it. 884 if (UseIntrinsic && 885 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) 886 TysForDecl.push_back(VectorType::get( 887 CalledScalarFn->getReturnType()->getScalarType(), State.VF)); 888 SmallVector<Value *, 4> Args; 889 for (const auto &I : enumerate(arg_operands())) { 890 // Some intrinsics have a scalar argument - don't replace it with a 891 // vector. 892 Value *Arg; 893 if (UseIntrinsic && 894 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) 895 Arg = State.get(I.value(), VPIteration(0, 0)); 896 // Some vectorized function variants may also take a scalar argument, 897 // e.g. linear parameters for pointers. This needs to be the scalar value 898 // from the start of the respective part when interleaving. 899 else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) 900 Arg = State.get(I.value(), VPIteration(Part, 0)); 901 else 902 Arg = State.get(I.value(), Part); 903 if (UseIntrinsic && 904 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) 905 TysForDecl.push_back(Arg->getType()); 906 Args.push_back(Arg); 907 } 908 909 Function *VectorF; 910 if (UseIntrinsic) { 911 // Use vector version of the intrinsic. 
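      // The declaration is looked up using the overloaded types collected in
      // TysForDecl above.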
912 Module *M = State.Builder.GetInsertBlock()->getModule(); 913 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); 914 assert(VectorF && "Can't retrieve vector intrinsic."); 915 } else { 916 #ifndef NDEBUG 917 assert(Variant != nullptr && "Can't create vector function."); 918 #endif 919 VectorF = Variant; 920 } 921 922 auto *CI = cast_or_null<CallInst>(getUnderlyingInstr()); 923 SmallVector<OperandBundleDef, 1> OpBundles; 924 if (CI) 925 CI->getOperandBundlesAsDefs(OpBundles); 926 927 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 928 929 if (isa<FPMathOperator>(V)) 930 V->copyFastMathFlags(CI); 931 932 if (!V->getType()->isVoidTy()) 933 State.set(this, V, Part); 934 State.addMetadata(V, CI); 935 } 936 } 937 938 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, 939 VPCostContext &Ctx) const { 940 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 941 if (Variant) { 942 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), 943 Variant->getFunctionType()->params(), 944 CostKind); 945 } 946 947 FastMathFlags FMF; 948 // TODO: Manage flags via VPRecipeWithIRFlags. 949 if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue())) 950 FMF = FPMO->getFastMathFlags(); 951 952 // Some backends analyze intrinsic arguments to determine cost. Use the 953 // underlying value for the operand if it has one. Otherwise try to use the 954 // operand of the underlying call instruction, if there is one. Otherwise 955 // clear Arguments. 956 // TODO: Rework TTI interface to be independent of concrete IR values. 957 SmallVector<const Value *> Arguments; 958 for (const auto &[Idx, Op] : enumerate(operands())) { 959 auto *V = Op->getUnderlyingValue(); 960 if (!V) { 961 if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { 962 Arguments.push_back(UI->getArgOperand(Idx)); 963 continue; 964 } 965 Arguments.clear(); 966 break; 967 } 968 Arguments.push_back(V); 969 } 970 971 Type *RetTy = 972 ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); 973 SmallVector<Type *> ParamTys; 974 for (unsigned I = 0; I != getNumOperands(); ++I) 975 ParamTys.push_back( 976 ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); 977 978 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. 
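  // With no vector library variant available, fall back to the generic
  // intrinsic cost; the Arguments and ParamTys gathered above give the target
  // as much context as is available here.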
979 IntrinsicCostAttributes CostAttrs( 980 VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, 981 dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue())); 982 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind); 983 } 984 985 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 986 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 987 VPSlotTracker &SlotTracker) const { 988 O << Indent << "WIDEN-CALL "; 989 990 Function *CalledFn = getCalledScalarFunction(); 991 if (CalledFn->getReturnType()->isVoidTy()) 992 O << "void "; 993 else { 994 printAsOperand(O, SlotTracker); 995 O << " = "; 996 } 997 998 O << "call @" << CalledFn->getName() << "("; 999 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 1000 Op->printAsOperand(O, SlotTracker); 1001 }); 1002 O << ")"; 1003 1004 if (VectorIntrinsicID) 1005 O << " (using vector intrinsic)"; 1006 else { 1007 O << " (using library function"; 1008 if (Variant->hasName()) 1009 O << ": " << Variant->getName(); 1010 O << ")"; 1011 } 1012 } 1013 1014 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 1015 VPSlotTracker &SlotTracker) const { 1016 O << Indent << "WIDEN-SELECT "; 1017 printAsOperand(O, SlotTracker); 1018 O << " = select "; 1019 getOperand(0)->printAsOperand(O, SlotTracker); 1020 O << ", "; 1021 getOperand(1)->printAsOperand(O, SlotTracker); 1022 O << ", "; 1023 getOperand(2)->printAsOperand(O, SlotTracker); 1024 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 1025 } 1026 #endif 1027 1028 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1029 State.setDebugLocFrom(getDebugLoc()); 1030 1031 // The condition can be loop invariant but still defined inside the 1032 // loop. This means that we can't just use the original 'cond' value. 1033 // We have to take the 'vectorized' value and pick the first lane. 1034 // Instcombine will make this a no-op. 1035 auto *InvarCond = 1036 isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; 1037 1038 for (unsigned Part = 0; Part < State.UF; ++Part) { 1039 Value *Cond = InvarCond ? 
InvarCond : State.get(getCond(), Part); 1040 Value *Op0 = State.get(getOperand(1), Part); 1041 Value *Op1 = State.get(getOperand(2), Part); 1042 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1043 State.set(this, Sel, Part); 1044 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1045 } 1046 } 1047 1048 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 1049 const FastMathFlags &FMF) { 1050 AllowReassoc = FMF.allowReassoc(); 1051 NoNaNs = FMF.noNaNs(); 1052 NoInfs = FMF.noInfs(); 1053 NoSignedZeros = FMF.noSignedZeros(); 1054 AllowReciprocal = FMF.allowReciprocal(); 1055 AllowContract = FMF.allowContract(); 1056 ApproxFunc = FMF.approxFunc(); 1057 } 1058 1059 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1060 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 1061 switch (OpType) { 1062 case OperationType::Cmp: 1063 O << " " << CmpInst::getPredicateName(getPredicate()); 1064 break; 1065 case OperationType::DisjointOp: 1066 if (DisjointFlags.IsDisjoint) 1067 O << " disjoint"; 1068 break; 1069 case OperationType::PossiblyExactOp: 1070 if (ExactFlags.IsExact) 1071 O << " exact"; 1072 break; 1073 case OperationType::OverflowingBinOp: 1074 if (WrapFlags.HasNUW) 1075 O << " nuw"; 1076 if (WrapFlags.HasNSW) 1077 O << " nsw"; 1078 break; 1079 case OperationType::FPMathOp: 1080 getFastMathFlags().print(O); 1081 break; 1082 case OperationType::GEPOp: 1083 if (GEPFlags.IsInBounds) 1084 O << " inbounds"; 1085 break; 1086 case OperationType::NonNegOp: 1087 if (NonNegFlags.NonNeg) 1088 O << " nneg"; 1089 break; 1090 case OperationType::Other: 1091 break; 1092 } 1093 if (getNumOperands() > 0) 1094 O << " "; 1095 } 1096 #endif 1097 1098 void VPWidenRecipe::execute(VPTransformState &State) { 1099 State.setDebugLocFrom(getDebugLoc()); 1100 auto &Builder = State.Builder; 1101 switch (Opcode) { 1102 case Instruction::Call: 1103 case Instruction::Br: 1104 case Instruction::PHI: 1105 case Instruction::GetElementPtr: 1106 case Instruction::Select: 1107 llvm_unreachable("This instruction is handled by a different recipe."); 1108 case Instruction::UDiv: 1109 case Instruction::SDiv: 1110 case Instruction::SRem: 1111 case Instruction::URem: 1112 case Instruction::Add: 1113 case Instruction::FAdd: 1114 case Instruction::Sub: 1115 case Instruction::FSub: 1116 case Instruction::FNeg: 1117 case Instruction::Mul: 1118 case Instruction::FMul: 1119 case Instruction::FDiv: 1120 case Instruction::FRem: 1121 case Instruction::Shl: 1122 case Instruction::LShr: 1123 case Instruction::AShr: 1124 case Instruction::And: 1125 case Instruction::Or: 1126 case Instruction::Xor: { 1127 // Just widen unops and binops. 1128 for (unsigned Part = 0; Part < State.UF; ++Part) { 1129 SmallVector<Value *, 2> Ops; 1130 for (VPValue *VPOp : operands()) 1131 Ops.push_back(State.get(VPOp, Part)); 1132 1133 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1134 1135 if (auto *VecOp = dyn_cast<Instruction>(V)) 1136 setFlags(VecOp); 1137 1138 // Use this vector value for all users of the original instruction. 1139 State.set(this, V, Part); 1140 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1141 } 1142 1143 break; 1144 } 1145 case Instruction::Freeze: { 1146 for (unsigned Part = 0; Part < State.UF; ++Part) { 1147 Value *Op = State.get(getOperand(0), Part); 1148 1149 Value *Freeze = Builder.CreateFreeze(Op); 1150 State.set(this, Freeze, Part); 1151 } 1152 break; 1153 } 1154 case Instruction::ICmp: 1155 case Instruction::FCmp: { 1156 // Widen compares. 
Generate vector compares. 1157 bool FCmp = Opcode == Instruction::FCmp; 1158 for (unsigned Part = 0; Part < State.UF; ++Part) { 1159 Value *A = State.get(getOperand(0), Part); 1160 Value *B = State.get(getOperand(1), Part); 1161 Value *C = nullptr; 1162 if (FCmp) { 1163 // Propagate fast math flags. 1164 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1165 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) 1166 Builder.setFastMathFlags(I->getFastMathFlags()); 1167 C = Builder.CreateFCmp(getPredicate(), A, B); 1168 } else { 1169 C = Builder.CreateICmp(getPredicate(), A, B); 1170 } 1171 State.set(this, C, Part); 1172 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1173 } 1174 1175 break; 1176 } 1177 default: 1178 // This instruction is not vectorized by simple widening. 1179 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1180 << Instruction::getOpcodeName(Opcode)); 1181 llvm_unreachable("Unhandled instruction!"); 1182 } // end of switch. 1183 1184 #if !defined(NDEBUG) 1185 // Verify that VPlan type inference results agree with the type of the 1186 // generated values. 1187 for (unsigned Part = 0; Part < State.UF; ++Part) { 1188 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), 1189 State.VF) == State.get(this, Part)->getType() && 1190 "inferred type and type from generated instructions do not match"); 1191 } 1192 #endif 1193 } 1194 1195 InstructionCost VPWidenRecipe::computeCost(ElementCount VF, 1196 VPCostContext &Ctx) const { 1197 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1198 switch (Opcode) { 1199 case Instruction::FNeg: { 1200 Type *VectorTy = 1201 ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); 1202 return Ctx.TTI.getArithmeticInstrCost( 1203 Opcode, VectorTy, CostKind, 1204 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1205 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); 1206 } 1207 1208 case Instruction::UDiv: 1209 case Instruction::SDiv: 1210 case Instruction::SRem: 1211 case Instruction::URem: 1212 // More complex computation, let the legacy cost-model handle this for now. 1213 return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); 1214 case Instruction::Add: 1215 case Instruction::FAdd: 1216 case Instruction::Sub: 1217 case Instruction::FSub: 1218 case Instruction::Mul: 1219 case Instruction::FMul: 1220 case Instruction::FDiv: 1221 case Instruction::FRem: 1222 case Instruction::Shl: 1223 case Instruction::LShr: 1224 case Instruction::AShr: 1225 case Instruction::And: 1226 case Instruction::Or: 1227 case Instruction::Xor: { 1228 VPValue *RHS = getOperand(1); 1229 // Certain instructions can be cheaper to vectorize if they have a constant 1230 // second vector operand. One example of this are shifts on x86. 
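    // Query operand info for live-in right-hand sides so the target can price
    // constant or uniform second operands more accurately.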
1231 TargetTransformInfo::OperandValueInfo RHSInfo = { 1232 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; 1233 if (RHS->isLiveIn()) 1234 RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue()); 1235 1236 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && 1237 getOperand(1)->isDefinedOutsideVectorRegions()) 1238 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; 1239 Type *VectorTy = 1240 ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); 1241 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1242 1243 SmallVector<const Value *, 4> Operands; 1244 if (CtxI) 1245 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); 1246 return Ctx.TTI.getArithmeticInstrCost( 1247 Opcode, VectorTy, CostKind, 1248 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1249 RHSInfo, Operands, CtxI, &Ctx.TLI); 1250 } 1251 case Instruction::Freeze: { 1252 // This opcode is unknown. Assume that it is the same as 'mul'. 1253 Type *VectorTy = 1254 ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); 1255 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 1256 } 1257 case Instruction::ICmp: 1258 case Instruction::FCmp: { 1259 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1260 Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 1261 return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), 1262 CostKind, CtxI); 1263 } 1264 default: 1265 llvm_unreachable("Unsupported opcode for instruction"); 1266 } 1267 } 1268 1269 void VPWidenEVLRecipe::execute(VPTransformState &State) { 1270 unsigned Opcode = getOpcode(); 1271 // TODO: Support other opcodes 1272 if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) 1273 llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); 1274 1275 State.setDebugLocFrom(getDebugLoc()); 1276 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 1277 "explicit vector length."); 1278 1279 assert(State.get(getOperand(0), 0)->getType()->isVectorTy() && 1280 "VPWidenEVLRecipe should not be used for scalars"); 1281 1282 VPValue *EVL = getEVL(); 1283 Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true); 1284 IRBuilderBase &BuilderIR = State.Builder; 1285 VectorBuilder Builder(BuilderIR); 1286 Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); 1287 1288 SmallVector<Value *, 4> Ops; 1289 for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { 1290 VPValue *VPOp = getOperand(I); 1291 Ops.push_back(State.get(VPOp, 0)); 1292 } 1293 1294 Builder.setMask(Mask).setEVL(EVLArg); 1295 Value *VPInst = 1296 Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); 1297 // Currently vp-intrinsics only accept FMF flags. 1298 // TODO: Enable other flags when support is added. 
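  // As a result, non-FMF flags such as nuw/nsw are not transferred to the
  // generated VP intrinsic yet.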
1299 if (isa<FPMathOperator>(VPInst)) 1300 setFlags(cast<Instruction>(VPInst)); 1301 1302 State.set(this, VPInst, 0); 1303 State.addMetadata(VPInst, 1304 dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1305 } 1306 1307 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1308 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 1309 VPSlotTracker &SlotTracker) const { 1310 O << Indent << "WIDEN "; 1311 printAsOperand(O, SlotTracker); 1312 O << " = " << Instruction::getOpcodeName(Opcode); 1313 printFlags(O); 1314 printOperands(O, SlotTracker); 1315 } 1316 1317 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1318 VPSlotTracker &SlotTracker) const { 1319 O << Indent << "WIDEN-VP "; 1320 printAsOperand(O, SlotTracker); 1321 O << " = " << Instruction::getOpcodeName(getOpcode()); 1322 printFlags(O); 1323 printOperands(O, SlotTracker); 1324 } 1325 #endif 1326 1327 void VPWidenCastRecipe::execute(VPTransformState &State) { 1328 State.setDebugLocFrom(getDebugLoc()); 1329 auto &Builder = State.Builder; 1330 /// Vectorize casts. 1331 assert(State.VF.isVector() && "Not vectorizing?"); 1332 Type *DestTy = VectorType::get(getResultType(), State.VF); 1333 VPValue *Op = getOperand(0); 1334 for (unsigned Part = 0; Part < State.UF; ++Part) { 1335 if (Part > 0 && Op->isLiveIn()) { 1336 // FIXME: Remove once explicit unrolling is implemented using VPlan. 1337 State.set(this, State.get(this, 0), Part); 1338 continue; 1339 } 1340 Value *A = State.get(Op, Part); 1341 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 1342 State.set(this, Cast, Part); 1343 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 1344 } 1345 } 1346 1347 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1348 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1349 VPSlotTracker &SlotTracker) const { 1350 O << Indent << "WIDEN-CAST "; 1351 printAsOperand(O, SlotTracker); 1352 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1353 printFlags(O); 1354 printOperands(O, SlotTracker); 1355 O << " to " << *getResultType(); 1356 } 1357 #endif 1358 1359 /// This function adds 1360 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 1361 /// to each vector element of Val. The sequence starts at StartIndex. 1362 /// \p Opcode is relevant for FP induction variable. 1363 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 1364 Instruction::BinaryOps BinOp, ElementCount VF, 1365 IRBuilderBase &Builder) { 1366 assert(VF.isVector() && "only vector VFs are supported"); 1367 1368 // Create and check the types. 1369 auto *ValVTy = cast<VectorType>(Val->getType()); 1370 ElementCount VLen = ValVTy->getElementCount(); 1371 1372 Type *STy = Val->getType()->getScalarType(); 1373 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1374 "Induction Step must be an integer or FP"); 1375 assert(Step->getType() == STy && "Step has wrong type"); 1376 1377 SmallVector<Constant *, 8> Indices; 1378 1379 // Create a vector of consecutive numbers from zero to VF. 
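  // For FP inductions the step vector is first materialized as integers and
  // converted to FP via uitofp further down.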
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
1439 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1440 1441 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1442 "Expected either an induction phi-node or a truncate of it!"); 1443 1444 // Construct the initial value of the vector IV in the vector loop preheader 1445 auto CurrIP = Builder.saveIP(); 1446 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1447 Builder.SetInsertPoint(VectorPH->getTerminator()); 1448 if (isa<TruncInst>(EntryVal)) { 1449 assert(Start->getType()->isIntegerTy() && 1450 "Truncation requires an integer type"); 1451 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1452 Step = Builder.CreateTrunc(Step, TruncType); 1453 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1454 } 1455 1456 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 1457 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1458 Value *SteppedStart = getStepVector( 1459 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 1460 1461 // We create vector phi nodes for both integer and floating-point induction 1462 // variables. Here, we determine the kind of arithmetic we will perform. 1463 Instruction::BinaryOps AddOp; 1464 Instruction::BinaryOps MulOp; 1465 if (Step->getType()->isIntegerTy()) { 1466 AddOp = Instruction::Add; 1467 MulOp = Instruction::Mul; 1468 } else { 1469 AddOp = ID.getInductionOpcode(); 1470 MulOp = Instruction::FMul; 1471 } 1472 1473 // Multiply the vectorization factor by the step using integer or 1474 // floating-point arithmetic as appropriate. 1475 Type *StepType = Step->getType(); 1476 Value *RuntimeVF = State.get(getVFValue(), {0, 0}); 1477 if (Step->getType()->isFloatingPointTy()) 1478 RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); 1479 else 1480 RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); 1481 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1482 1483 // Create a vector splat to use in the induction update. 1484 // 1485 // FIXME: If the step is non-constant, we create the vector splat with 1486 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1487 // handle a constant vector splat. 1488 Value *SplatVF = isa<Constant>(Mul) 1489 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 1490 : Builder.CreateVectorSplat(State.VF, Mul); 1491 Builder.restoreIP(CurrIP); 1492 1493 // We may need to add the step a number of times, depending on the unroll 1494 // factor. The last of those goes into the PHI. 1495 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1496 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1497 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1498 Instruction *LastInduction = VecInd; 1499 for (unsigned Part = 0; Part < State.UF; ++Part) { 1500 State.set(this, LastInduction, Part); 1501 1502 if (isa<TruncInst>(EntryVal)) 1503 State.addMetadata(LastInduction, EntryVal); 1504 1505 LastInduction = cast<Instruction>( 1506 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 1507 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1508 } 1509 1510 LastInduction->setName("vec.ind.next"); 1511 VecInd->addIncoming(SteppedStart, VectorPH); 1512 // Add induction update using an incorrect block temporarily. The phi node 1513 // will be fixed after VPlan execution. Note that at this point the latch 1514 // block cannot be used, as it does not exist yet. 
1515 // TODO: Model increment value in VPlan, by turning the recipe into a 1516 // multi-def and a subclass of VPHeaderPHIRecipe. 1517 VecInd->addIncoming(LastInduction, VectorPH); 1518 } 1519 1520 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1521 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1522 VPSlotTracker &SlotTracker) const { 1523 O << Indent << "WIDEN-INDUCTION"; 1524 if (getTruncInst()) { 1525 O << "\\l\""; 1526 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 1527 O << " +\n" << Indent << "\" "; 1528 getVPValue(0)->printAsOperand(O, SlotTracker); 1529 } else 1530 O << " " << VPlanIngredient(IV); 1531 1532 O << ", "; 1533 getStepValue()->printAsOperand(O, SlotTracker); 1534 1535 O << ", "; 1536 getVFValue()->printAsOperand(O, SlotTracker); 1537 } 1538 #endif 1539 1540 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1541 // The step may be defined by a recipe in the preheader (e.g. if it requires 1542 // SCEV expansion), but for the canonical induction the step is required to be 1543 // 1, which is represented as live-in. 1544 if (getStepValue()->getDefiningRecipe()) 1545 return false; 1546 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1547 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1548 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1549 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1550 getScalarType() == CanIV->getScalarType(); 1551 } 1552 1553 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1554 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1555 VPSlotTracker &SlotTracker) const { 1556 O << Indent; 1557 printAsOperand(O, SlotTracker); 1558 O << " = DERIVED-IV "; 1559 getStartValue()->printAsOperand(O, SlotTracker); 1560 O << " + "; 1561 getOperand(1)->printAsOperand(O, SlotTracker); 1562 O << " * "; 1563 getStepValue()->printAsOperand(O, SlotTracker); 1564 } 1565 #endif 1566 1567 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1568 // Fast-math-flags propagate from the original induction instruction. 1569 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1570 if (hasFastMathFlags()) 1571 State.Builder.setFastMathFlags(getFastMathFlags()); 1572 1573 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1574 /// variable on which to base the steps, \p Step is the size of the step. 1575 1576 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1577 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1578 IRBuilderBase &Builder = State.Builder; 1579 1580 // Ensure step has the same type as that of scalar IV. 1581 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1582 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1583 1584 // We build scalar steps for both integer and floating-point induction 1585 // variables. Here, we determine the kind of arithmetic we will perform. 1586 Instruction::BinaryOps AddOp; 1587 Instruction::BinaryOps MulOp; 1588 if (BaseIVTy->isIntegerTy()) { 1589 AddOp = Instruction::Add; 1590 MulOp = Instruction::Mul; 1591 } else { 1592 AddOp = InductionOpcode; 1593 MulOp = Instruction::FMul; 1594 } 1595 1596 // Determine the number of scalars we need to generate for each unroll 1597 // iteration. 1598 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1599 // Compute the scalar steps and save the results in State. 
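  // For scalable vectors where all lanes are needed, a vectorized form of the
  // steps is additionally generated below and recorded per part.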
1600 Type *IntStepTy = 1601 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1602 Type *VecIVTy = nullptr; 1603 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1604 if (!FirstLaneOnly && State.VF.isScalable()) { 1605 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1606 UnitStepVec = 1607 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1608 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1609 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1610 } 1611 1612 unsigned StartPart = 0; 1613 unsigned EndPart = State.UF; 1614 unsigned StartLane = 0; 1615 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1616 if (State.Instance) { 1617 StartPart = State.Instance->Part; 1618 EndPart = StartPart + 1; 1619 StartLane = State.Instance->Lane.getKnownLane(); 1620 EndLane = StartLane + 1; 1621 } 1622 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1623 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1624 1625 if (!FirstLaneOnly && State.VF.isScalable()) { 1626 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1627 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1628 if (BaseIVTy->isFloatingPointTy()) 1629 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1630 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1631 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1632 State.set(this, Add, Part); 1633 // It's useful to record the lane values too for the known minimum number 1634 // of elements so we do those below. This improves the code quality when 1635 // trying to extract the first element, for example. 1636 } 1637 1638 if (BaseIVTy->isFloatingPointTy()) 1639 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1640 1641 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1642 Value *StartIdx = Builder.CreateBinOp( 1643 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1644 // The step returned by `createStepForVF` is a runtime-evaluated value 1645 // when VF is scalable. Otherwise, it should be folded into a Constant. 1646 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1647 "Expected StartIdx to be folded to a constant when VF is not " 1648 "scalable"); 1649 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1650 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1651 State.set(this, Add, VPIteration(Part, Lane)); 1652 } 1653 } 1654 } 1655 1656 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1657 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1658 VPSlotTracker &SlotTracker) const { 1659 O << Indent; 1660 printAsOperand(O, SlotTracker); 1661 O << " = SCALAR-STEPS "; 1662 printOperands(O, SlotTracker); 1663 } 1664 #endif 1665 1666 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1667 assert(State.VF.isVector() && "not widening"); 1668 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1669 // Construct a vector GEP by widening the operands of the scalar GEP as 1670 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1671 // results in a vector of pointers when at least one operand of the GEP 1672 // is vector-typed. Thus, to keep the representation compact, we only use 1673 // vector-typed operands for loop-varying values. 
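// Illustrative example (assumed IR shape, not taken from the source): a
// scalar "getelementptr i32, ptr %base, i64 %idx" whose index is
// loop-varying would typically be widened for a fixed VF of 4 into
// "getelementptr i32, ptr %base, <4 x i64> %vec.idx", yielding a <4 x ptr>
// result; exactly which operands remain scalar is decided by the invariance
// checks below.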
1674 1675 if (areAllOperandsInvariant()) { 1676 // If we are vectorizing, but the GEP has only loop-invariant operands, 1677 // the GEP we build (by only using vector-typed operands for 1678 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1679 // produce a vector of pointers, we need to either arbitrarily pick an 1680 // operand to broadcast, or broadcast a clone of the original GEP. 1681 // Here, we broadcast a clone of the original. 1682 // 1683 // TODO: If at some point we decide to scalarize instructions having 1684 // loop-invariant operands, this special case will no longer be 1685 // required. We would add the scalarization decision to 1686 // collectLoopScalars() and teach getVectorValue() to broadcast 1687 // the lane-zero scalar value. 1688 SmallVector<Value *> Ops; 1689 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1690 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1691 1692 auto *NewGEP = 1693 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1694 ArrayRef(Ops).drop_front(), "", isInBounds()); 1695 for (unsigned Part = 0; Part < State.UF; ++Part) { 1696 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1697 State.set(this, EntryPart, Part); 1698 State.addMetadata(EntryPart, GEP); 1699 } 1700 } else { 1701 // If the GEP has at least one loop-varying operand, we are sure to 1702 // produce a vector of pointers. But if we are only unrolling, we want 1703 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1704 // produce with the code below will be scalar (if VF == 1) or vector 1705 // (otherwise). Note that for the unroll-only case, we still maintain 1706 // values in the vector mapping with initVector, as we do for other 1707 // instructions. 1708 for (unsigned Part = 0; Part < State.UF; ++Part) { 1709 // The pointer operand of the new GEP. If it's loop-invariant, we 1710 // won't broadcast it. 1711 auto *Ptr = isPointerLoopInvariant() 1712 ? State.get(getOperand(0), VPIteration(0, 0)) 1713 : State.get(getOperand(0), Part); 1714 1715 // Collect all the indices for the new GEP. If any index is 1716 // loop-invariant, we won't broadcast it. 1717 SmallVector<Value *, 4> Indices; 1718 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1719 VPValue *Operand = getOperand(I); 1720 if (isIndexLoopInvariant(I - 1)) 1721 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1722 else 1723 Indices.push_back(State.get(Operand, Part)); 1724 } 1725 1726 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1727 // but it should be a vector, otherwise. 1728 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1729 Indices, "", isInBounds()); 1730 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1731 "NewGEP is not a pointer vector"); 1732 State.set(this, NewGEP, Part); 1733 State.addMetadata(NewGEP, GEP); 1734 } 1735 } 1736 } 1737 1738 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1739 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1740 VPSlotTracker &SlotTracker) const { 1741 O << Indent << "WIDEN-GEP "; 1742 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1743 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1744 O << "[" << (isIndexLoopInvariant(I) ? 
"Inv" : "Var") << "]"; 1745 1746 O << " "; 1747 printAsOperand(O, SlotTracker); 1748 O << " = getelementptr"; 1749 printFlags(O); 1750 printOperands(O, SlotTracker); 1751 } 1752 #endif 1753 1754 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1755 auto &Builder = State.Builder; 1756 State.setDebugLocFrom(getDebugLoc()); 1757 for (unsigned Part = 0; Part < State.UF; ++Part) { 1758 // Calculate the pointer for the specific unroll-part. 1759 Value *PartPtr = nullptr; 1760 // Use i32 for the gep index type when the value is constant, 1761 // or query DataLayout for a more suitable index type otherwise. 1762 const DataLayout &DL = 1763 Builder.GetInsertBlock()->getDataLayout(); 1764 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1765 ? DL.getIndexType(IndexedTy->getPointerTo()) 1766 : Builder.getInt32Ty(); 1767 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1768 bool InBounds = isInBounds(); 1769 if (IsReverse) { 1770 // If the address is consecutive but reversed, then the 1771 // wide store needs to start at the last vector element. 1772 // RunTimeVF = VScale * VF.getKnownMinValue() 1773 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1774 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1775 // NumElt = -Part * RunTimeVF 1776 Value *NumElt = Builder.CreateMul( 1777 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1778 // LastLane = 1 - RunTimeVF 1779 Value *LastLane = 1780 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1781 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1782 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1783 } else { 1784 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1785 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1786 } 1787 1788 State.set(this, PartPtr, Part, /*IsScalar*/ true); 1789 } 1790 } 1791 1792 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1793 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1794 VPSlotTracker &SlotTracker) const { 1795 O << Indent; 1796 printAsOperand(O, SlotTracker); 1797 O << " = vector-pointer "; 1798 if (IsReverse) 1799 O << "(reverse) "; 1800 1801 printOperands(O, SlotTracker); 1802 } 1803 #endif 1804 1805 void VPBlendRecipe::execute(VPTransformState &State) { 1806 assert(isNormalized() && "Expected blend to be normalized!"); 1807 State.setDebugLocFrom(getDebugLoc()); 1808 // We know that all PHIs in non-header blocks are converted into 1809 // selects, so we don't have to worry about the insertion order and we 1810 // can just use the builder. 1811 // At this point we generate the predication tree. There may be 1812 // duplications since this is a simple recursive scan, but future 1813 // optimizations will clean it up. 1814 1815 unsigned NumIncoming = getNumIncomingValues(); 1816 1817 // Generate a sequence of selects of the form: 1818 // SELECT(Mask3, In3, 1819 // SELECT(Mask2, In2, 1820 // SELECT(Mask1, In1, 1821 // In0))) 1822 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1823 // are essentially undef are taken from In0. 1824 VectorParts Entry(State.UF); 1825 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1826 for (unsigned In = 0; In < NumIncoming; ++In) { 1827 for (unsigned Part = 0; Part < State.UF; ++Part) { 1828 // We might have single edge PHIs (blocks) - use an identity 1829 // 'select' for the first PHI operand. 
1830 Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed); 1831 if (In == 0) 1832 Entry[Part] = In0; // Initialize with the first incoming value. 1833 else { 1834 // Select between the current value and the previous incoming edge 1835 // based on the incoming mask. 1836 Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed); 1837 Entry[Part] = 1838 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 1839 } 1840 } 1841 } 1842 1843 for (unsigned Part = 0; Part < State.UF; ++Part) 1844 State.set(this, Entry[Part], Part, OnlyFirstLaneUsed); 1845 } 1846 1847 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1848 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1849 VPSlotTracker &SlotTracker) const { 1850 O << Indent << "BLEND "; 1851 printAsOperand(O, SlotTracker); 1852 O << " ="; 1853 if (getNumIncomingValues() == 1) { 1854 // Not a User of any mask: not really blending, this is a 1855 // single-predecessor phi. 1856 O << " "; 1857 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1858 } else { 1859 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1860 O << " "; 1861 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1862 if (I == 0) 1863 continue; 1864 O << "/"; 1865 getMask(I)->printAsOperand(O, SlotTracker); 1866 } 1867 } 1868 } 1869 #endif 1870 1871 void VPReductionRecipe::execute(VPTransformState &State) { 1872 assert(!State.Instance && "Reduction being replicated."); 1873 Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); 1874 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1875 // Propagate the fast-math flags carried by the underlying instruction. 1876 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1877 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1878 for (unsigned Part = 0; Part < State.UF; ++Part) { 1879 Value *NewVecOp = State.get(getVecOp(), Part); 1880 if (VPValue *Cond = getCondOp()) { 1881 Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); 1882 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 1883 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 1884 1885 Value *Start; 1886 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 1887 Start = RdxDesc.getRecurrenceStartValue(); 1888 else 1889 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 1890 RdxDesc.getFastMathFlags()); 1891 if (State.VF.isVector()) 1892 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), 1893 Start); 1894 1895 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 1896 NewVecOp = Select; 1897 } 1898 Value *NewRed; 1899 Value *NextInChain; 1900 if (IsOrdered) { 1901 if (State.VF.isVector()) 1902 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 1903 PrevInChain); 1904 else 1905 NewRed = State.Builder.CreateBinOp( 1906 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 1907 NewVecOp); 1908 PrevInChain = NewRed; 1909 NextInChain = NewRed; 1910 } else { 1911 PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); 1912 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 1913 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1914 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 1915 NewRed, PrevInChain); 1916 else 1917 NextInChain = State.Builder.CreateBinOp( 1918 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, 1919 PrevInChain); 1920 } 1921 State.set(this, NextInChain, Part, /*IsScalar*/ true); 1922 } 1923 } 1924 1925 void VPReductionEVLRecipe::execute(VPTransformState &State) { 1926 assert(!State.Instance && "Reduction being replicated."); 1927 assert(State.UF == 1 && 1928 "Expected only UF == 1 when vectorizing with explicit vector length."); 1929 1930 auto &Builder = State.Builder; 1931 // Propagate the fast-math flags carried by the underlying instruction. 1932 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 1933 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1934 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1935 1936 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1937 Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true); 1938 Value *VecOp = State.get(getVecOp(), 0); 1939 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 1940 1941 VectorBuilder VBuilder(Builder); 1942 VBuilder.setEVL(EVL); 1943 Value *Mask; 1944 // TODO: move the all-true mask generation into VectorBuilder. 1945 if (VPValue *CondOp = getCondOp()) 1946 Mask = State.get(CondOp, 0); 1947 else 1948 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 1949 VBuilder.setMask(Mask); 1950 1951 Value *NewRed; 1952 if (isOrdered()) { 1953 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 1954 } else { 1955 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 1956 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1957 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 1958 else 1959 NewRed = Builder.CreateBinOp( 1960 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); 1961 } 1962 State.set(this, NewRed, 0, /*IsScalar*/ true); 1963 } 1964 1965 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1966 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 1967 VPSlotTracker &SlotTracker) const { 1968 O << Indent << "REDUCE "; 1969 printAsOperand(O, SlotTracker); 1970 O << " = "; 1971 getChainOp()->printAsOperand(O, SlotTracker); 1972 O << " +"; 1973 if (isa<FPMathOperator>(getUnderlyingInstr())) 1974 O << getUnderlyingInstr()->getFastMathFlags(); 1975 O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1976 getVecOp()->printAsOperand(O, SlotTracker); 1977 if (isConditional()) { 1978 O << ", "; 1979 getCondOp()->printAsOperand(O, SlotTracker); 1980 } 1981 O << ")"; 1982 if (RdxDesc.IntermediateStore) 1983 O << " (with final reduction value stored in invariant address sunk " 1984 "outside of loop)"; 1985 } 1986 1987 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1988 VPSlotTracker &SlotTracker) const { 1989 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1990 O << Indent << "REDUCE "; 1991 printAsOperand(O, SlotTracker); 1992 O << " = "; 1993 getChainOp()->printAsOperand(O, SlotTracker); 1994 O << " +"; 1995 if (isa<FPMathOperator>(getUnderlyingInstr())) 1996 O << getUnderlyingInstr()->getFastMathFlags(); 1997 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1998 getVecOp()->printAsOperand(O, SlotTracker); 1999 O << ", "; 2000 getEVL()->printAsOperand(O, SlotTracker); 2001 if (isConditional()) { 2002 O << ", "; 2003 getCondOp()->printAsOperand(O, SlotTracker); 2004 } 2005 O << ")"; 2006 if (RdxDesc.IntermediateStore) 2007 O << " (with final reduction value stored in invariant address sunk " 2008 "outside of loop)"; 2009 } 2010 #endif 2011 2012 bool VPReplicateRecipe::shouldPack() const { 2013 // Find if the recipe is used by a widened recipe via an intervening 2014 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 2015 return any_of(users(), [](const VPUser *U) { 2016 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2017 return any_of(PredR->users(), [PredR](const VPUser *U) { 2018 return !U->usesScalars(PredR); 2019 }); 2020 return false; 2021 }); 2022 } 2023 2024 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2025 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2026 VPSlotTracker &SlotTracker) const { 2027 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2028 2029 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2030 printAsOperand(O, SlotTracker); 2031 O << " = "; 2032 } 2033 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2034 O << "call"; 2035 printFlags(O); 2036 O << "@" << CB->getCalledFunction()->getName() << "("; 2037 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2038 O, [&O, &SlotTracker](VPValue *Op) { 2039 Op->printAsOperand(O, SlotTracker); 2040 }); 2041 O << ")"; 2042 } else { 2043 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2044 printFlags(O); 2045 printOperands(O, SlotTracker); 2046 } 2047 2048 if (shouldPack()) 2049 O << " (S->V)"; 2050 } 2051 #endif 2052 2053 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 2054 /// if it is either defined outside the vector region or its operand is known to 2055 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 2056 /// TODO: Uniformity should be associated with a VPValue and there should be a 2057 /// generic way to check.
2058 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { 2059 return C->isDefinedOutsideVectorRegions() || 2060 isa<VPDerivedIVRecipe>(C->getOperand(0)) || 2061 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)); 2062 } 2063 2064 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { 2065 assert(vputils::onlyFirstLaneUsed(this) && 2066 "Codegen only implemented for first lane."); 2067 switch (Opcode) { 2068 case Instruction::SExt: 2069 case Instruction::ZExt: 2070 case Instruction::Trunc: { 2071 // Note: SExt/ZExt not used yet. 2072 Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); 2073 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2074 } 2075 default: 2076 llvm_unreachable("opcode not implemented yet"); 2077 } 2078 } 2079 2080 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2081 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); 2082 for (unsigned Part = 0; Part != State.UF; ++Part) { 2083 Value *Res; 2084 // Only generate a single instance, if the recipe is uniform across UFs and 2085 // VFs. 2086 if (Part > 0 && IsUniformAcrossVFsAndUFs) 2087 Res = State.get(this, VPIteration(0, 0)); 2088 else 2089 Res = generate(State, Part); 2090 State.set(this, Res, VPIteration(Part, 0)); 2091 } 2092 } 2093 2094 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2095 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2096 VPSlotTracker &SlotTracker) const { 2097 O << Indent << "SCALAR-CAST "; 2098 printAsOperand(O, SlotTracker); 2099 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2100 printOperands(O, SlotTracker); 2101 O << " to " << *ResultTy; 2102 } 2103 #endif 2104 2105 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2106 assert(State.Instance && "Branch on Mask works only on single instance."); 2107 2108 unsigned Part = State.Instance->Part; 2109 unsigned Lane = State.Instance->Lane.getKnownLane(); 2110 2111 Value *ConditionBit = nullptr; 2112 VPValue *BlockInMask = getMask(); 2113 if (BlockInMask) { 2114 ConditionBit = State.get(BlockInMask, Part); 2115 if (ConditionBit->getType()->isVectorTy()) 2116 ConditionBit = State.Builder.CreateExtractElement( 2117 ConditionBit, State.Builder.getInt32(Lane)); 2118 } else // Block in mask is all-one. 2119 ConditionBit = State.Builder.getTrue(); 2120 2121 // Replace the temporary unreachable terminator with a new conditional branch, 2122 // whose two destinations will be set later when they are created. 
2123 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2124 assert(isa<UnreachableInst>(CurrentTerminator) && 2125 "Expected to replace unreachable terminator with conditional branch."); 2126 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2127 CondBr->setSuccessor(0, nullptr); 2128 ReplaceInstWithInst(CurrentTerminator, CondBr); 2129 } 2130 2131 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2132 assert(State.Instance && "Predicated instruction PHI works per instance."); 2133 Instruction *ScalarPredInst = 2134 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 2135 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2136 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2137 assert(PredicatingBB && "Predicated block has no single predecessor."); 2138 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2139 "operand must be VPReplicateRecipe"); 2140 2141 // By current pack/unpack logic we need to generate only a single phi node: if 2142 // a vector value for the predicated instruction exists at this point it means 2143 // the instruction has vector users only, and a phi for the vector value is 2144 // needed. In this case the recipe of the predicated instruction is marked to 2145 // also do that packing, thereby "hoisting" the insert-element sequence. 2146 // Otherwise, a phi node for the scalar value is needed. 2147 unsigned Part = State.Instance->Part; 2148 if (State.hasVectorValue(getOperand(0), Part)) { 2149 Value *VectorValue = State.get(getOperand(0), Part); 2150 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2151 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2152 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2153 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2154 if (State.hasVectorValue(this, Part)) 2155 State.reset(this, VPhi, Part); 2156 else 2157 State.set(this, VPhi, Part); 2158 // NOTE: Currently we need to update the value of the operand, so the next 2159 // predicated iteration inserts its generated value in the correct vector. 2160 State.reset(getOperand(0), VPhi, Part); 2161 } else { 2162 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2163 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2164 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2165 PredicatingBB); 2166 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2167 if (State.hasScalarValue(this, *State.Instance)) 2168 State.reset(this, Phi, *State.Instance); 2169 else 2170 State.set(this, Phi, *State.Instance); 2171 // NOTE: Currently we need to update the value of the operand, so the next 2172 // predicated iteration inserts its generated value in the correct vector. 
2173 State.reset(getOperand(0), Phi, *State.Instance); 2174 } 2175 } 2176 2177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2178 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2179 VPSlotTracker &SlotTracker) const { 2180 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2181 printAsOperand(O, SlotTracker); 2182 O << " = "; 2183 printOperands(O, SlotTracker); 2184 } 2185 #endif 2186 2187 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2188 VPCostContext &Ctx) const { 2189 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2190 const Align Alignment = 2191 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2192 unsigned AS = 2193 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2194 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2195 2196 if (!Consecutive) { 2197 // TODO: Using the original IR may not be accurate. 2198 // Currently, ARM will use the underlying IR to calculate gather/scatter 2199 // instruction cost. 2200 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2201 assert(!Reverse && 2202 "Inconsecutive memory access should not have the order."); 2203 return Ctx.TTI.getAddressComputationCost(Ty) + 2204 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2205 IsMasked, Alignment, CostKind, 2206 &Ingredient); 2207 } 2208 2209 InstructionCost Cost = 0; 2210 if (IsMasked) { 2211 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2212 AS, CostKind); 2213 } else { 2214 TTI::OperandValueInfo OpInfo = 2215 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2216 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2217 CostKind, OpInfo, &Ingredient); 2218 } 2219 if (!Reverse) 2220 return Cost; 2221 2222 return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2223 cast<VectorType>(Ty), std::nullopt, 2224 CostKind, 0); 2225 } 2226 2227 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2228 auto *LI = cast<LoadInst>(&Ingredient); 2229 2230 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2231 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2232 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2233 bool CreateGather = !isConsecutive(); 2234 2235 auto &Builder = State.Builder; 2236 State.setDebugLocFrom(getDebugLoc()); 2237 for (unsigned Part = 0; Part < State.UF; ++Part) { 2238 Value *NewLI; 2239 Value *Mask = nullptr; 2240 if (auto *VPMask = getMask()) { 2241 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2242 // of a null all-one mask is a null mask. 2243 Mask = State.get(VPMask, Part); 2244 if (isReverse()) 2245 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2246 } 2247 2248 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather); 2249 if (CreateGather) { 2250 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2251 "wide.masked.gather"); 2252 } else if (Mask) { 2253 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2254 PoisonValue::get(DataTy), 2255 "wide.masked.load"); 2256 } else { 2257 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2258 } 2259 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
2260 State.addMetadata(NewLI, LI); 2261 if (Reverse) 2262 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2263 State.set(this, NewLI, Part); 2264 } 2265 } 2266 2267 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2268 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2269 VPSlotTracker &SlotTracker) const { 2270 O << Indent << "WIDEN "; 2271 printAsOperand(O, SlotTracker); 2272 O << " = load "; 2273 printOperands(O, SlotTracker); 2274 } 2275 #endif 2276 2277 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2278 /// dependence w/o affecting the result. 2279 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2280 Value *EVL, const Twine &Name) { 2281 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2282 Value *AllTrueMask = 2283 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2284 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2285 {Operand, AllTrueMask, EVL}, nullptr, Name); 2286 } 2287 2288 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2289 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 2290 "explicit vector length."); 2291 auto *LI = cast<LoadInst>(&Ingredient); 2292 2293 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2294 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2295 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2296 bool CreateGather = !isConsecutive(); 2297 2298 auto &Builder = State.Builder; 2299 State.setDebugLocFrom(getDebugLoc()); 2300 CallInst *NewLI; 2301 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2302 Value *Addr = State.get(getAddr(), 0, !CreateGather); 2303 Value *Mask = nullptr; 2304 if (VPValue *VPMask = getMask()) { 2305 Mask = State.get(VPMask, 0); 2306 if (isReverse()) 2307 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2308 } else { 2309 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2310 } 2311 2312 if (CreateGather) { 2313 NewLI = 2314 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2315 nullptr, "wide.masked.gather"); 2316 } else { 2317 VectorBuilder VBuilder(Builder); 2318 VBuilder.setEVL(EVL).setMask(Mask); 2319 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2320 Instruction::Load, DataTy, Addr, "vp.op.load")); 2321 } 2322 NewLI->addParamAttr( 2323 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2324 State.addMetadata(NewLI, LI); 2325 Instruction *Res = NewLI; 2326 if (isReverse()) 2327 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2328 State.set(this, Res, 0); 2329 } 2330 2331 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2332 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2333 VPSlotTracker &SlotTracker) const { 2334 O << Indent << "WIDEN "; 2335 printAsOperand(O, SlotTracker); 2336 O << " = vp.load "; 2337 printOperands(O, SlotTracker); 2338 } 2339 #endif 2340 2341 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2342 auto *SI = cast<StoreInst>(&Ingredient); 2343 2344 VPValue *StoredVPValue = getStoredValue(); 2345 bool CreateScatter = !isConsecutive(); 2346 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2347 2348 auto &Builder = State.Builder; 2349 State.setDebugLocFrom(getDebugLoc()); 2350 2351 for (unsigned Part = 0; Part < State.UF; ++Part) { 2352 Instruction *NewSI = nullptr; 2353 Value *Mask = nullptr; 2354 if (auto *VPMask = getMask()) { 2355 // Mask reversal is only needed for non-all-one 
(null) masks, as reverse 2356 // of a null all-one mask is a null mask. 2357 Mask = State.get(VPMask, Part); 2358 if (isReverse()) 2359 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2360 } 2361 2362 Value *StoredVal = State.get(StoredVPValue, Part); 2363 if (isReverse()) { 2364 // If we store to reverse consecutive memory locations, then we need 2365 // to reverse the order of elements in the stored value. 2366 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2367 // We don't want to update the value in the map as it might be used in 2368 // another expression. So don't call resetVectorValue(StoredVal). 2369 } 2370 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); 2371 if (CreateScatter) 2372 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2373 else if (Mask) 2374 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2375 else 2376 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2377 State.addMetadata(NewSI, SI); 2378 } 2379 } 2380 2381 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2382 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2383 VPSlotTracker &SlotTracker) const { 2384 O << Indent << "WIDEN store "; 2385 printOperands(O, SlotTracker); 2386 } 2387 #endif 2388 2389 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2390 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 2391 "explicit vector length."); 2392 auto *SI = cast<StoreInst>(&Ingredient); 2393 2394 VPValue *StoredValue = getStoredValue(); 2395 bool CreateScatter = !isConsecutive(); 2396 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2397 2398 auto &Builder = State.Builder; 2399 State.setDebugLocFrom(getDebugLoc()); 2400 2401 CallInst *NewSI = nullptr; 2402 Value *StoredVal = State.get(StoredValue, 0); 2403 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2404 if (isReverse()) 2405 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2406 Value *Mask = nullptr; 2407 if (VPValue *VPMask = getMask()) { 2408 Mask = State.get(VPMask, 0); 2409 if (isReverse()) 2410 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2411 } else { 2412 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2413 } 2414 Value *Addr = State.get(getAddr(), 0, !CreateScatter); 2415 if (CreateScatter) { 2416 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2417 Intrinsic::vp_scatter, 2418 {StoredVal, Addr, Mask, EVL}); 2419 } else { 2420 VectorBuilder VBuilder(Builder); 2421 VBuilder.setEVL(EVL).setMask(Mask); 2422 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2423 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2424 {StoredVal, Addr})); 2425 } 2426 NewSI->addParamAttr( 2427 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2428 State.addMetadata(NewSI, SI); 2429 } 2430 2431 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2432 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2433 VPSlotTracker &SlotTracker) const { 2434 O << Indent << "WIDEN vp.store "; 2435 printOperands(O, SlotTracker); 2436 } 2437 #endif 2438 2439 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2440 VectorType *DstVTy, const DataLayout &DL) { 2441 // Verify that V is a vector type with same number of elements as DstVTy. 
2442 auto VF = DstVTy->getElementCount(); 2443 auto *SrcVecTy = cast<VectorType>(V->getType()); 2444 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2445 Type *SrcElemTy = SrcVecTy->getElementType(); 2446 Type *DstElemTy = DstVTy->getElementType(); 2447 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2448 "Vector elements must have same size"); 2449 2450 // Do a direct cast if element types are castable. 2451 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2452 return Builder.CreateBitOrPointerCast(V, DstVTy); 2453 } 2454 // V cannot be directly casted to desired vector type. 2455 // May happen when V is a floating point vector but DstVTy is a vector of 2456 // pointers or vice-versa. Handle this using a two-step bitcast using an 2457 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2458 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2459 "Only one type should be a pointer type"); 2460 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2461 "Only one type should be a floating point type"); 2462 Type *IntTy = 2463 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2464 auto *VecIntTy = VectorType::get(IntTy, VF); 2465 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2466 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2467 } 2468 2469 /// Return a vector containing interleaved elements from multiple 2470 /// smaller input vectors. 2471 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2472 const Twine &Name) { 2473 unsigned Factor = Vals.size(); 2474 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2475 2476 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2477 #ifndef NDEBUG 2478 for (Value *Val : Vals) 2479 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2480 #endif 2481 2482 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2483 // must use intrinsics to interleave. 2484 if (VecTy->isScalableTy()) { 2485 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2486 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2487 Vals, 2488 /*FMFSource=*/nullptr, Name); 2489 } 2490 2491 // Fixed length. Start by concatenating all vectors into a wide vector. 2492 Value *WideVec = concatenateVectors(Builder, Vals); 2493 2494 // Interleave the elements into the wide vector. 2495 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2496 return Builder.CreateShuffleVector( 2497 WideVec, createInterleaveMask(NumElts, Factor), Name); 2498 } 2499 2500 // Try to vectorize the interleave group that \p Instr belongs to. 2501 // 2502 // E.g. Translate following interleaved load group (factor = 3): 2503 // for (i = 0; i < N; i+=3) { 2504 // R = Pic[i]; // Member of index 0 2505 // G = Pic[i+1]; // Member of index 1 2506 // B = Pic[i+2]; // Member of index 2 2507 // ... // do something to R, G, B 2508 // } 2509 // To: 2510 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2511 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2512 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2513 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2514 // 2515 // Or translate following interleaved store group (factor = 3): 2516 // for (i = 0; i < N; i+=3) { 2517 // ... 
do something to R, G, B 2518 // Pic[i] = R; // Member of index 0 2519 // Pic[i+1] = G; // Member of index 1 2520 // Pic[i+2] = B; // Member of index 2 2521 // } 2522 // To: 2523 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2524 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2525 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2526 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2527 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2528 void VPInterleaveRecipe::execute(VPTransformState &State) { 2529 assert(!State.Instance && "Interleave group being replicated."); 2530 const InterleaveGroup<Instruction> *Group = IG; 2531 Instruction *Instr = Group->getInsertPos(); 2532 2533 // Prepare for the vector type of the interleaved load/store. 2534 Type *ScalarTy = getLoadStoreType(Instr); 2535 unsigned InterleaveFactor = Group->getFactor(); 2536 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2537 2538 // Prepare for the new pointers. 2539 SmallVector<Value *, 2> AddrParts; 2540 unsigned Index = Group->getIndex(Instr); 2541 2542 // TODO: extend the masked interleaved-group support to reversed access. 2543 VPValue *BlockInMask = getMask(); 2544 assert((!BlockInMask || !Group->isReverse()) && 2545 "Reversed masked interleave-group not supported."); 2546 2547 Value *Idx; 2548 // If the group is reverse, adjust the index to refer to the last vector lane 2549 // instead of the first. We adjust the index from the first vector lane, 2550 // rather than directly getting the pointer for lane VF - 1, because the 2551 // pointer operand of the interleaved access is supposed to be uniform. For 2552 // uniform instructions, we're only required to generate a value for the 2553 // first vector lane in each unroll iteration. 2554 if (Group->isReverse()) { 2555 Value *RuntimeVF = 2556 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2557 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2558 Idx = State.Builder.CreateMul(Idx, 2559 State.Builder.getInt32(Group->getFactor())); 2560 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); 2561 Idx = State.Builder.CreateNeg(Idx); 2562 } else 2563 Idx = State.Builder.getInt32(-Index); 2564 2565 VPValue *Addr = getAddr(); 2566 for (unsigned Part = 0; Part < State.UF; Part++) { 2567 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2568 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2569 State.setDebugLocFrom(I->getDebugLoc()); 2570 2571 // Notice current instruction could be any index. Need to adjust the address 2572 // to the member of index 0. 2573 // 2574 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2575 // b = A[i]; // Member of index 0 2576 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2577 // 2578 // E.g. A[i+1] = a; // Member of index 1 2579 // A[i] = b; // Member of index 0 2580 // A[i+2] = c; // Member of index 2 (Current instruction) 2581 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
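// Illustrative note (hypothetical values): for a non-reversed group whose
// insert position is the member at index 2, Idx computed above is simply -2,
// so the GEP below rebases AddrPart from &A[i+2] to &A[i]; the reversed case
// additionally steps the pointer back to cover the last vector lane.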
2582 2583 bool InBounds = false; 2584 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2585 InBounds = gep->isInBounds(); 2586 AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2587 AddrParts.push_back(AddrPart); 2588 } 2589 2590 State.setDebugLocFrom(Instr->getDebugLoc()); 2591 Value *PoisonVec = PoisonValue::get(VecTy); 2592 2593 auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor]( 2594 unsigned Part, Value *MaskForGaps) -> Value * { 2595 if (State.VF.isScalable()) { 2596 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2597 assert(InterleaveFactor == 2 && 2598 "Unsupported deinterleave factor for scalable vectors"); 2599 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2600 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2601 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2602 State.VF.getKnownMinValue() * 2, true); 2603 return State.Builder.CreateIntrinsic( 2604 MaskTy, Intrinsic::vector_interleave2, Ops, 2605 /*FMFSource=*/nullptr, "interleaved.mask"); 2606 } 2607 2608 if (!BlockInMask) 2609 return MaskForGaps; 2610 2611 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2612 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2613 BlockInMaskPart, 2614 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2615 "interleaved.mask"); 2616 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2617 ShuffledMask, MaskForGaps) 2618 : ShuffledMask; 2619 }; 2620 2621 const DataLayout &DL = Instr->getDataLayout(); 2622 // Vectorize the interleaved load group. 2623 if (isa<LoadInst>(Instr)) { 2624 Value *MaskForGaps = nullptr; 2625 if (NeedsMaskForGaps) { 2626 MaskForGaps = createBitMaskForGaps(State.Builder, 2627 State.VF.getKnownMinValue(), *Group); 2628 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2629 } 2630 2631 // For each unroll part, create a wide load for the group. 2632 SmallVector<Value *, 2> NewLoads; 2633 for (unsigned Part = 0; Part < State.UF; Part++) { 2634 Instruction *NewLoad; 2635 if (BlockInMask || MaskForGaps) { 2636 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2637 NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part], 2638 Group->getAlign(), GroupMask, 2639 PoisonVec, "wide.masked.vec"); 2640 } else 2641 NewLoad = State.Builder.CreateAlignedLoad( 2642 VecTy, AddrParts[Part], Group->getAlign(), "wide.vec"); 2643 Group->addMetadata(NewLoad); 2644 NewLoads.push_back(NewLoad); 2645 } 2646 2647 ArrayRef<VPValue *> VPDefs = definedValues(); 2648 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2649 if (VecTy->isScalableTy()) { 2650 assert(InterleaveFactor == 2 && 2651 "Unsupported deinterleave factor for scalable vectors"); 2652 2653 for (unsigned Part = 0; Part < State.UF; ++Part) { 2654 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2655 // so must use intrinsics to deinterleave. 2656 Value *DI = State.Builder.CreateIntrinsic( 2657 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part], 2658 /*FMFSource=*/nullptr, "strided.vec"); 2659 unsigned J = 0; 2660 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2661 Instruction *Member = Group->getMember(I); 2662 2663 if (!Member) 2664 continue; 2665 2666 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2667 // If this member has different type, cast the result type. 
2668 if (Member->getType() != ScalarTy) { 2669 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2670 StridedVec = 2671 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2672 } 2673 2674 if (Group->isReverse()) 2675 StridedVec = 2676 State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2677 2678 State.set(VPDefs[J], StridedVec, Part); 2679 ++J; 2680 } 2681 } 2682 2683 return; 2684 } 2685 2686 // For each member in the group, shuffle out the appropriate data from the 2687 // wide loads. 2688 unsigned J = 0; 2689 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2690 Instruction *Member = Group->getMember(I); 2691 2692 // Skip the gaps in the group. 2693 if (!Member) 2694 continue; 2695 2696 auto StrideMask = 2697 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 2698 for (unsigned Part = 0; Part < State.UF; Part++) { 2699 Value *StridedVec = State.Builder.CreateShuffleVector( 2700 NewLoads[Part], StrideMask, "strided.vec"); 2701 2702 // If this member has different type, cast the result type. 2703 if (Member->getType() != ScalarTy) { 2704 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 2705 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2706 StridedVec = 2707 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2708 } 2709 2710 if (Group->isReverse()) 2711 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2712 2713 State.set(VPDefs[J], StridedVec, Part); 2714 } 2715 ++J; 2716 } 2717 return; 2718 } 2719 2720 // The sub vector type for current instruction. 2721 auto *SubVT = VectorType::get(ScalarTy, State.VF); 2722 2723 // Vectorize the interleaved store group. 2724 Value *MaskForGaps = 2725 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 2726 assert((!MaskForGaps || !State.VF.isScalable()) && 2727 "masking gaps for scalable vectors is not yet supported."); 2728 ArrayRef<VPValue *> StoredValues = getStoredValues(); 2729 for (unsigned Part = 0; Part < State.UF; Part++) { 2730 // Collect the stored vector from each member. 2731 SmallVector<Value *, 4> StoredVecs; 2732 unsigned StoredIdx = 0; 2733 for (unsigned i = 0; i < InterleaveFactor; i++) { 2734 assert((Group->getMember(i) || MaskForGaps) && 2735 "Fail to get a member from an interleaved store group"); 2736 Instruction *Member = Group->getMember(i); 2737 2738 // Skip the gaps in the group. 2739 if (!Member) { 2740 Value *Undef = PoisonValue::get(SubVT); 2741 StoredVecs.push_back(Undef); 2742 continue; 2743 } 2744 2745 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2746 ++StoredIdx; 2747 2748 if (Group->isReverse()) 2749 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 2750 2751 // If this member has different type, cast it to a unified type. 2752 2753 if (StoredVec->getType() != SubVT) 2754 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 2755 2756 StoredVecs.push_back(StoredVec); 2757 } 2758 2759 // Interleave all the smaller vectors into one wider vector. 
2760 Value *IVec = 2761 interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 2762 Instruction *NewStoreInstr; 2763 if (BlockInMask || MaskForGaps) { 2764 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2765 NewStoreInstr = State.Builder.CreateMaskedStore( 2766 IVec, AddrParts[Part], Group->getAlign(), GroupMask); 2767 } else 2768 NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part], 2769 Group->getAlign()); 2770 2771 Group->addMetadata(NewStoreInstr); 2772 } 2773 } 2774 2775 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2776 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 2777 VPSlotTracker &SlotTracker) const { 2778 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 2779 IG->getInsertPos()->printAsOperand(O, false); 2780 O << ", "; 2781 getAddr()->printAsOperand(O, SlotTracker); 2782 VPValue *Mask = getMask(); 2783 if (Mask) { 2784 O << ", "; 2785 Mask->printAsOperand(O, SlotTracker); 2786 } 2787 2788 unsigned OpIdx = 0; 2789 for (unsigned i = 0; i < IG->getFactor(); ++i) { 2790 if (!IG->getMember(i)) 2791 continue; 2792 if (getNumStoreOperands() > 0) { 2793 O << "\n" << Indent << " store "; 2794 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 2795 O << " to index " << i; 2796 } else { 2797 O << "\n" << Indent << " "; 2798 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 2799 O << " = load from index " << i; 2800 } 2801 ++OpIdx; 2802 } 2803 } 2804 #endif 2805 2806 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 2807 Value *Start = getStartValue()->getLiveInIRValue(); 2808 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); 2809 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2810 2811 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2812 EntryPart->addIncoming(Start, VectorPH); 2813 EntryPart->setDebugLoc(getDebugLoc()); 2814 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2815 State.set(this, EntryPart, Part, /*IsScalar*/ true); 2816 } 2817 2818 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2819 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2820 VPSlotTracker &SlotTracker) const { 2821 O << Indent << "EMIT "; 2822 printAsOperand(O, SlotTracker); 2823 O << " = CANONICAL-INDUCTION "; 2824 printOperands(O, SlotTracker); 2825 } 2826 #endif 2827 2828 bool VPCanonicalIVPHIRecipe::isCanonical( 2829 InductionDescriptor::InductionKind Kind, VPValue *Start, 2830 VPValue *Step) const { 2831 // Must be an integer induction. 2832 if (Kind != InductionDescriptor::IK_IntInduction) 2833 return false; 2834 // Start must match the start value of this canonical induction. 2835 if (Start != getStartValue()) 2836 return false; 2837 2838 // If the step is defined by a recipe, it is not a ConstantInt. 
2839 if (Step->getDefiningRecipe()) 2840 return false; 2841 2842 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 2843 return StepC && StepC->isOne(); 2844 } 2845 2846 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 2847 return IsScalarAfterVectorization && 2848 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 2849 } 2850 2851 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 2852 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 2853 "Not a pointer induction according to InductionDescriptor!"); 2854 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 2855 "Unexpected type."); 2856 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 2857 "Recipe should have been replaced"); 2858 2859 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 2860 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true)); 2861 Type *PhiType = IndDesc.getStep()->getType(); 2862 2863 // Build a pointer phi 2864 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 2865 Type *ScStValueType = ScalarStartValue->getType(); 2866 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 2867 CanonicalIV->getIterator()); 2868 2869 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2870 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 2871 2872 // A pointer induction, performed by using a gep 2873 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 2874 2875 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 2876 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 2877 Value *NumUnrolledElems = 2878 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 2879 Value *InductionGEP = GetElementPtrInst::Create( 2880 State.Builder.getInt8Ty(), NewPointerPhi, 2881 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 2882 InductionLoc); 2883 // Add induction update using an incorrect block temporarily. The phi node 2884 // will be fixed after VPlan execution. Note that at this point the latch 2885 // block cannot be used, as it does not exist yet. 2886 // TODO: Model increment value in VPlan, by turning the recipe into a 2887 // multi-def and a subclass of VPHeaderPHIRecipe. 2888 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 2889 2890 // Create UF many actual address geps that use the pointer 2891 // phi as base and a vectorized version of the step value 2892 // (<step*0, ..., step*N>) as offset. 2893 for (unsigned Part = 0; Part < State.UF; ++Part) { 2894 Type *VecPhiType = VectorType::get(PhiType, State.VF); 2895 Value *StartOffsetScalar = 2896 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 2897 Value *StartOffset = 2898 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 2899 // Create a vector of consecutive numbers from zero to VF. 
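// Illustrative note (assuming a fixed VF for simplicity): after the add of
// the step vector and the multiply by the splatted scalar step below, part P
// ends up with byte offsets <(P*VF+0)*Step, ..., (P*VF+VF-1)*Step> applied
// to the pointer phi via an i8-typed GEP, i.e. lane L addresses
// PointerPhi + (P*VF + L) * Step.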
2900 StartOffset = State.Builder.CreateAdd( 2901 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 2902 2903 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 2904 "scalar step must be the same across all parts"); 2905 Value *GEP = State.Builder.CreateGEP( 2906 State.Builder.getInt8Ty(), NewPointerPhi, 2907 State.Builder.CreateMul( 2908 StartOffset, 2909 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 2910 "vector.gep")); 2911 State.set(this, GEP, Part); 2912 } 2913 } 2914 2915 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2916 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2917 VPSlotTracker &SlotTracker) const { 2918 O << Indent << "EMIT "; 2919 printAsOperand(O, SlotTracker); 2920 O << " = WIDEN-POINTER-INDUCTION "; 2921 getStartValue()->printAsOperand(O, SlotTracker); 2922 O << ", " << *IndDesc.getStep(); 2923 } 2924 #endif 2925 2926 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 2927 assert(!State.Instance && "cannot be used in per-lane"); 2928 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2929 SCEVExpander Exp(SE, DL, "induction"); 2930 2931 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 2932 &*State.Builder.GetInsertPoint()); 2933 assert(!State.ExpandedSCEVs.contains(Expr) && 2934 "Same SCEV expanded multiple times"); 2935 State.ExpandedSCEVs[Expr] = Res; 2936 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2937 State.set(this, Res, {Part, 0}); 2938 } 2939 2940 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2941 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 2942 VPSlotTracker &SlotTracker) const { 2943 O << Indent << "EMIT "; 2944 getVPSingleValue()->printAsOperand(O, SlotTracker); 2945 O << " = EXPAND SCEV " << *Expr; 2946 } 2947 #endif 2948 2949 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 2950 Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); 2951 Type *STy = CanonicalIV->getType(); 2952 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 2953 ElementCount VF = State.VF; 2954 Value *VStart = VF.isScalar() 2955 ? CanonicalIV 2956 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 2957 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 2958 Value *VStep = createStepForVF(Builder, STy, VF, Part); 2959 if (VF.isVector()) { 2960 VStep = Builder.CreateVectorSplat(VF, VStep); 2961 VStep = 2962 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 2963 } 2964 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 2965 State.set(this, CanonicalVectorIV, Part); 2966 } 2967 } 2968 2969 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2970 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 2971 VPSlotTracker &SlotTracker) const { 2972 O << Indent << "EMIT "; 2973 printAsOperand(O, SlotTracker); 2974 O << " = WIDEN-CANONICAL-INDUCTION "; 2975 printOperands(O, SlotTracker); 2976 } 2977 #endif 2978 2979 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 2980 auto &Builder = State.Builder; 2981 // Create a vector from the initial value. 2982 auto *VectorInit = getStartValue()->getLiveInIRValue(); 2983 2984 Type *VecTy = State.VF.isScalar() 2985 ? 
VectorInit->getType() 2986 : VectorType::get(VectorInit->getType(), State.VF); 2987 2988 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2989 if (State.VF.isVector()) { 2990 auto *IdxTy = Builder.getInt32Ty(); 2991 auto *One = ConstantInt::get(IdxTy, 1); 2992 IRBuilder<>::InsertPointGuard Guard(Builder); 2993 Builder.SetInsertPoint(VectorPH->getTerminator()); 2994 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 2995 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 2996 VectorInit = Builder.CreateInsertElement( 2997 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 2998 } 2999 3000 // Create a phi node for the new recurrence. 3001 PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); 3002 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 3003 EntryPart->addIncoming(VectorInit, VectorPH); 3004 State.set(this, EntryPart, 0); 3005 } 3006 3007 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3008 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 3009 VPSlotTracker &SlotTracker) const { 3010 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 3011 printAsOperand(O, SlotTracker); 3012 O << " = phi "; 3013 printOperands(O, SlotTracker); 3014 } 3015 #endif 3016 3017 void VPReductionPHIRecipe::execute(VPTransformState &State) { 3018 auto &Builder = State.Builder; 3019 3020 // Reductions do not have to start at zero. They can start with 3021 // any loop invariant values. 3022 VPValue *StartVPV = getStartValue(); 3023 Value *StartV = StartVPV->getLiveInIRValue(); 3024 3025 // In order to support recurrences we need to be able to vectorize Phi nodes. 3026 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3027 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3028 // this value when we vectorize all of the instructions that use the PHI. 3029 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 3030 Type *VecTy = ScalarPHI ? StartV->getType() 3031 : VectorType::get(StartV->getType(), State.VF); 3032 3033 BasicBlock *HeaderBB = State.CFG.PrevBB; 3034 assert(State.CurrentVectorLoop->getHeader() == HeaderBB && 3035 "recipe must be in the vector loop header"); 3036 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; 3037 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { 3038 Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); 3039 EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); 3040 State.set(this, EntryPart, Part, IsInLoop); 3041 } 3042 3043 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3044 3045 Value *Iden = nullptr; 3046 RecurKind RK = RdxDesc.getRecurrenceKind(); 3047 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 3048 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 3049 // MinMax and AnyOf reductions have the start value as their identity. 

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
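
// Illustrative note: when tail folding by masking, VPActiveLaneMaskPHIRecipe
// models the loop-carried mask produced by llvm.get.active.lane.mask. Its
// execute() only wires up the preheader incoming; the backedge value is
// filled in later. A sketch of the resulting IR (names are illustrative,
// assuming a scalable VF of vscale x 4):
//
//   vector.body:
//     %active.lane.mask = phi <vscale x 4 x i1>
//         [ %active.lane.mask.entry, %vector.ph ],
//         [ %active.lane.mask.next, %vector.body ]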

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif