//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live out phi's in the exiting block
  // via middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only calculates the instruction cost with
  // underlying instruction. Removing the WidenMem here will prevent
  // force-target-instruction-cost overwriting the cost of recipe with
  // underlying instruction which is inconsistent with the legacy model.
  // TODO: Remove WidenMem from this function when we don't need to compare to
  // the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // returns 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform, avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
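    // Note that CreateCondBr expects a valid basic block as its first
    // argument, so the current insert block is passed as a placeholder
    // successor and reset to nullptr immediately below.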
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
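      // The FMFs recorded in the recurrence descriptor (e.g. reassoc) are
      // applied to the binary ops that combine the per-part values below.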
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
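      // E.g. with UF = 4 and Offset = 1, this reads the value produced by the
      // last unrolled part (part 3).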
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
         "Only PHINodes can have extra operands");
  if (getNumOperands() == 1) {
    VPValue *ExitValue = getOperand(0);
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
    auto *Phi = cast<PHINode>(&I);
    Phi->addIncoming(V, PredBB);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;

  if (getNumOperands() != 0) {
    assert(getNumOperands() == 1 && "can have at most 1 operand");
    O << " (extra operand: ";
    printOperands(O, SlotTracker);
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
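      // TysForDecl provides the overload types, so e.g. a scalar llvm.smax
      // call is declared as its vector overload (llvm.smax.v4i32 for VF = 4).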
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
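    // A live-in RHS lets TTI see the actual operand (e.g. a constant shift
    // amount); otherwise an RHS defined outside the vector regions is at
    // least uniform across lanes, which is recorded below.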
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideVectorRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp, 0));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept FMF flags.
  // TODO: Enable other flags when support is added.
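  // E.g. a widened fadd is emitted as a VP intrinsic call (llvm.vp.fadd) with
  // the all-true mask and the EVL operand set above.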
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst, 0);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-VP ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
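  // E.g. for VF = 4, StartIdx = 0 and Step = 2, the vector <0, 2, 4, 6> is
  // added to Val.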
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF = State.get(getVFValue(), {0, 0});
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
  else
    RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
1552 // TODO: Model increment value in VPlan, by turning the recipe into a 1553 // multi-def and a subclass of VPHeaderPHIRecipe. 1554 VecInd->addIncoming(LastInduction, VectorPH); 1555 } 1556 1557 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1558 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1559 VPSlotTracker &SlotTracker) const { 1560 O << Indent << "WIDEN-INDUCTION"; 1561 if (getTruncInst()) { 1562 O << "\\l\""; 1563 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 1564 O << " +\n" << Indent << "\" "; 1565 getVPValue(0)->printAsOperand(O, SlotTracker); 1566 } else 1567 O << " " << VPlanIngredient(IV); 1568 1569 O << ", "; 1570 getStepValue()->printAsOperand(O, SlotTracker); 1571 1572 O << ", "; 1573 getVFValue()->printAsOperand(O, SlotTracker); 1574 } 1575 #endif 1576 1577 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1578 // The step may be defined by a recipe in the preheader (e.g. if it requires 1579 // SCEV expansion), but for the canonical induction the step is required to be 1580 // 1, which is represented as live-in. 1581 if (getStepValue()->getDefiningRecipe()) 1582 return false; 1583 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1584 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1585 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1586 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1587 getScalarType() == CanIV->getScalarType(); 1588 } 1589 1590 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1591 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1592 VPSlotTracker &SlotTracker) const { 1593 O << Indent; 1594 printAsOperand(O, SlotTracker); 1595 O << " = DERIVED-IV "; 1596 getStartValue()->printAsOperand(O, SlotTracker); 1597 O << " + "; 1598 getOperand(1)->printAsOperand(O, SlotTracker); 1599 O << " * "; 1600 getStepValue()->printAsOperand(O, SlotTracker); 1601 } 1602 #endif 1603 1604 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1605 // Fast-math-flags propagate from the original induction instruction. 1606 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1607 if (hasFastMathFlags()) 1608 State.Builder.setFastMathFlags(getFastMathFlags()); 1609 1610 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1611 /// variable on which to base the steps, \p Step is the size of the step. 1612 1613 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1614 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1615 IRBuilderBase &Builder = State.Builder; 1616 1617 // Ensure step has the same type as that of scalar IV. 1618 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1619 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1620 1621 // We build scalar steps for both integer and floating-point induction 1622 // variables. Here, we determine the kind of arithmetic we will perform. 1623 Instruction::BinaryOps AddOp; 1624 Instruction::BinaryOps MulOp; 1625 if (BaseIVTy->isIntegerTy()) { 1626 AddOp = Instruction::Add; 1627 MulOp = Instruction::Mul; 1628 } else { 1629 AddOp = InductionOpcode; 1630 MulOp = Instruction::FMul; 1631 } 1632 1633 // Determine the number of scalars we need to generate for each unroll 1634 // iteration. 1635 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1636 // Compute the scalar steps and save the results in State. 
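  // For scalable VFs where more than the first lane is needed, the steps are
  // computed with whole-vector operations (step vector plus splats) below;
  // otherwise each requested lane is materialized individually.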
1637 Type *IntStepTy = 1638 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1639 Type *VecIVTy = nullptr; 1640 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1641 if (!FirstLaneOnly && State.VF.isScalable()) { 1642 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1643 UnitStepVec = 1644 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1645 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1646 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1647 } 1648 1649 unsigned StartPart = 0; 1650 unsigned EndPart = State.UF; 1651 unsigned StartLane = 0; 1652 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1653 if (State.Instance) { 1654 StartPart = State.Instance->Part; 1655 EndPart = StartPart + 1; 1656 StartLane = State.Instance->Lane.getKnownLane(); 1657 EndLane = StartLane + 1; 1658 } 1659 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1660 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1661 1662 if (!FirstLaneOnly && State.VF.isScalable()) { 1663 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1664 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1665 if (BaseIVTy->isFloatingPointTy()) 1666 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1667 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1668 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1669 State.set(this, Add, Part); 1670 // It's useful to record the lane values too for the known minimum number 1671 // of elements so we do those below. This improves the code quality when 1672 // trying to extract the first element, for example. 1673 } 1674 1675 if (BaseIVTy->isFloatingPointTy()) 1676 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1677 1678 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1679 Value *StartIdx = Builder.CreateBinOp( 1680 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1681 // The step returned by `createStepForVF` is a runtime-evaluated value 1682 // when VF is scalable. Otherwise, it should be folded into a Constant. 1683 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1684 "Expected StartIdx to be folded to a constant when VF is not " 1685 "scalable"); 1686 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1687 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1688 State.set(this, Add, VPIteration(Part, Lane)); 1689 } 1690 } 1691 } 1692 1693 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1694 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1695 VPSlotTracker &SlotTracker) const { 1696 O << Indent; 1697 printAsOperand(O, SlotTracker); 1698 O << " = SCALAR-STEPS "; 1699 printOperands(O, SlotTracker); 1700 } 1701 #endif 1702 1703 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1704 assert(State.VF.isVector() && "not widening"); 1705 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1706 // Construct a vector GEP by widening the operands of the scalar GEP as 1707 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1708 // results in a vector of pointers when at least one operand of the GEP 1709 // is vector-typed. Thus, to keep the representation compact, we only use 1710 // vector-typed operands for loop-varying values. 
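  // Two cases are handled below: if all operands are loop-invariant, a single
  // scalar GEP is created and splatted for each part; otherwise a GEP mixing
  // scalar (invariant) and vector (loop-varying) operands is built per part.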
1711 1712 if (areAllOperandsInvariant()) { 1713 // If we are vectorizing, but the GEP has only loop-invariant operands, 1714 // the GEP we build (by only using vector-typed operands for 1715 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1716 // produce a vector of pointers, we need to either arbitrarily pick an 1717 // operand to broadcast, or broadcast a clone of the original GEP. 1718 // Here, we broadcast a clone of the original. 1719 // 1720 // TODO: If at some point we decide to scalarize instructions having 1721 // loop-invariant operands, this special case will no longer be 1722 // required. We would add the scalarization decision to 1723 // collectLoopScalars() and teach getVectorValue() to broadcast 1724 // the lane-zero scalar value. 1725 SmallVector<Value *> Ops; 1726 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1727 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1728 1729 auto *NewGEP = 1730 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1731 ArrayRef(Ops).drop_front(), "", isInBounds()); 1732 for (unsigned Part = 0; Part < State.UF; ++Part) { 1733 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1734 State.set(this, EntryPart, Part); 1735 State.addMetadata(EntryPart, GEP); 1736 } 1737 } else { 1738 // If the GEP has at least one loop-varying operand, we are sure to 1739 // produce a vector of pointers. But if we are only unrolling, we want 1740 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1741 // produce with the code below will be scalar (if VF == 1) or vector 1742 // (otherwise). Note that for the unroll-only case, we still maintain 1743 // values in the vector mapping with initVector, as we do for other 1744 // instructions. 1745 for (unsigned Part = 0; Part < State.UF; ++Part) { 1746 // The pointer operand of the new GEP. If it's loop-invariant, we 1747 // won't broadcast it. 1748 auto *Ptr = isPointerLoopInvariant() 1749 ? State.get(getOperand(0), VPIteration(0, 0)) 1750 : State.get(getOperand(0), Part); 1751 1752 // Collect all the indices for the new GEP. If any index is 1753 // loop-invariant, we won't broadcast it. 1754 SmallVector<Value *, 4> Indices; 1755 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1756 VPValue *Operand = getOperand(I); 1757 if (isIndexLoopInvariant(I - 1)) 1758 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1759 else 1760 Indices.push_back(State.get(Operand, Part)); 1761 } 1762 1763 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1764 // but it should be a vector, otherwise. 1765 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1766 Indices, "", isInBounds()); 1767 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1768 "NewGEP is not a pointer vector"); 1769 State.set(this, NewGEP, Part); 1770 State.addMetadata(NewGEP, GEP); 1771 } 1772 } 1773 } 1774 1775 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1776 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1777 VPSlotTracker &SlotTracker) const { 1778 O << Indent << "WIDEN-GEP "; 1779 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1780 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1781 O << "[" << (isIndexLoopInvariant(I) ? 
"Inv" : "Var") << "]"; 1782 1783 O << " "; 1784 printAsOperand(O, SlotTracker); 1785 O << " = getelementptr"; 1786 printFlags(O); 1787 printOperands(O, SlotTracker); 1788 } 1789 #endif 1790 1791 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1792 auto &Builder = State.Builder; 1793 State.setDebugLocFrom(getDebugLoc()); 1794 for (unsigned Part = 0; Part < State.UF; ++Part) { 1795 // Calculate the pointer for the specific unroll-part. 1796 Value *PartPtr = nullptr; 1797 // Use i32 for the gep index type when the value is constant, 1798 // or query DataLayout for a more suitable index type otherwise. 1799 const DataLayout &DL = 1800 Builder.GetInsertBlock()->getDataLayout(); 1801 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1802 ? DL.getIndexType(IndexedTy->getPointerTo()) 1803 : Builder.getInt32Ty(); 1804 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1805 bool InBounds = isInBounds(); 1806 if (IsReverse) { 1807 // If the address is consecutive but reversed, then the 1808 // wide store needs to start at the last vector element. 1809 // RunTimeVF = VScale * VF.getKnownMinValue() 1810 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1811 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1812 // NumElt = -Part * RunTimeVF 1813 Value *NumElt = Builder.CreateMul( 1814 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1815 // LastLane = 1 - RunTimeVF 1816 Value *LastLane = 1817 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1818 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1819 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1820 } else { 1821 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1822 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1823 } 1824 1825 State.set(this, PartPtr, Part, /*IsScalar*/ true); 1826 } 1827 } 1828 1829 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1830 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1831 VPSlotTracker &SlotTracker) const { 1832 O << Indent; 1833 printAsOperand(O, SlotTracker); 1834 O << " = vector-pointer "; 1835 if (IsReverse) 1836 O << "(reverse) "; 1837 1838 printOperands(O, SlotTracker); 1839 } 1840 #endif 1841 1842 void VPBlendRecipe::execute(VPTransformState &State) { 1843 assert(isNormalized() && "Expected blend to be normalized!"); 1844 State.setDebugLocFrom(getDebugLoc()); 1845 // We know that all PHIs in non-header blocks are converted into 1846 // selects, so we don't have to worry about the insertion order and we 1847 // can just use the builder. 1848 // At this point we generate the predication tree. There may be 1849 // duplications since this is a simple recursive scan, but future 1850 // optimizations will clean it up. 1851 1852 unsigned NumIncoming = getNumIncomingValues(); 1853 1854 // Generate a sequence of selects of the form: 1855 // SELECT(Mask3, In3, 1856 // SELECT(Mask2, In2, 1857 // SELECT(Mask1, In1, 1858 // In0))) 1859 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1860 // are essentially undef are taken from In0. 1861 VectorParts Entry(State.UF); 1862 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1863 for (unsigned In = 0; In < NumIncoming; ++In) { 1864 for (unsigned Part = 0; Part < State.UF; ++Part) { 1865 // We might have single edge PHIs (blocks) - use an identity 1866 // 'select' for the first PHI operand. 
1867 Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed); 1868 if (In == 0) 1869 Entry[Part] = In0; // Initialize with the first incoming value. 1870 else { 1871 // Select between the current value and the previous incoming edge 1872 // based on the incoming mask. 1873 Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed); 1874 Entry[Part] = 1875 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 1876 } 1877 } 1878 } 1879 1880 for (unsigned Part = 0; Part < State.UF; ++Part) 1881 State.set(this, Entry[Part], Part, OnlyFirstLaneUsed); 1882 } 1883 1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1885 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1886 VPSlotTracker &SlotTracker) const { 1887 O << Indent << "BLEND "; 1888 printAsOperand(O, SlotTracker); 1889 O << " ="; 1890 if (getNumIncomingValues() == 1) { 1891 // Not a User of any mask: not really blending, this is a 1892 // single-predecessor phi. 1893 O << " "; 1894 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1895 } else { 1896 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1897 O << " "; 1898 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1899 if (I == 0) 1900 continue; 1901 O << "/"; 1902 getMask(I)->printAsOperand(O, SlotTracker); 1903 } 1904 } 1905 } 1906 #endif 1907 1908 void VPReductionRecipe::execute(VPTransformState &State) { 1909 assert(!State.Instance && "Reduction being replicated."); 1910 Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); 1911 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1912 // Propagate the fast-math flags carried by the underlying instruction. 1913 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1914 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1915 for (unsigned Part = 0; Part < State.UF; ++Part) { 1916 Value *NewVecOp = State.get(getVecOp(), Part); 1917 if (VPValue *Cond = getCondOp()) { 1918 Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); 1919 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 1920 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 1921 1922 Value *Start; 1923 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 1924 Start = RdxDesc.getRecurrenceStartValue(); 1925 else 1926 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 1927 RdxDesc.getFastMathFlags()); 1928 if (State.VF.isVector()) 1929 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), 1930 Start); 1931 1932 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 1933 NewVecOp = Select; 1934 } 1935 Value *NewRed; 1936 Value *NextInChain; 1937 if (IsOrdered) { 1938 if (State.VF.isVector()) 1939 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 1940 PrevInChain); 1941 else 1942 NewRed = State.Builder.CreateBinOp( 1943 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 1944 NewVecOp); 1945 PrevInChain = NewRed; 1946 NextInChain = NewRed; 1947 } else { 1948 PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); 1949 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 1950 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1951 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 1952 NewRed, PrevInChain); 1953 else 1954 NextInChain = State.Builder.CreateBinOp( 1955 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, 1956 PrevInChain); 1957 } 1958 State.set(this, NextInChain, Part, /*IsScalar*/ true); 1959 } 1960 } 1961 1962 void VPReductionEVLRecipe::execute(VPTransformState &State) { 1963 assert(!State.Instance && "Reduction being replicated."); 1964 assert(State.UF == 1 && 1965 "Expected only UF == 1 when vectorizing with explicit vector length."); 1966 1967 auto &Builder = State.Builder; 1968 // Propagate the fast-math flags carried by the underlying instruction. 1969 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 1970 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1971 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1972 1973 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1974 Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true); 1975 Value *VecOp = State.get(getVecOp(), 0); 1976 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 1977 1978 VectorBuilder VBuilder(Builder); 1979 VBuilder.setEVL(EVL); 1980 Value *Mask; 1981 // TODO: move the all-true mask generation into VectorBuilder. 1982 if (VPValue *CondOp = getCondOp()) 1983 Mask = State.get(CondOp, 0); 1984 else 1985 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 1986 VBuilder.setMask(Mask); 1987 1988 Value *NewRed; 1989 if (isOrdered()) { 1990 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 1991 } else { 1992 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 1993 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1994 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 1995 else 1996 NewRed = Builder.CreateBinOp( 1997 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); 1998 } 1999 State.set(this, NewRed, 0, /*IsScalar*/ true); 2000 } 2001 2002 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2003 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2004 VPSlotTracker &SlotTracker) const { 2005 O << Indent << "REDUCE "; 2006 printAsOperand(O, SlotTracker); 2007 O << " = "; 2008 getChainOp()->printAsOperand(O, SlotTracker); 2009 O << " +"; 2010 if (isa<FPMathOperator>(getUnderlyingInstr())) 2011 O << getUnderlyingInstr()->getFastMathFlags(); 2012 O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2013 getVecOp()->printAsOperand(O, SlotTracker); 2014 if (isConditional()) { 2015 O << ", "; 2016 getCondOp()->printAsOperand(O, SlotTracker); 2017 } 2018 O << ")"; 2019 if (RdxDesc.IntermediateStore) 2020 O << " (with final reduction value stored in invariant address sank " 2021 "outside of loop)"; 2022 } 2023 2024 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2025 VPSlotTracker &SlotTracker) const { 2026 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2027 O << Indent << "REDUCE "; 2028 printAsOperand(O, SlotTracker); 2029 O << " = "; 2030 getChainOp()->printAsOperand(O, SlotTracker); 2031 O << " +"; 2032 if (isa<FPMathOperator>(getUnderlyingInstr())) 2033 O << getUnderlyingInstr()->getFastMathFlags(); 2034 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2035 getVecOp()->printAsOperand(O, SlotTracker); 2036 O << ", "; 2037 getEVL()->printAsOperand(O, SlotTracker); 2038 if (isConditional()) { 2039 O << ", "; 2040 getCondOp()->printAsOperand(O, SlotTracker); 2041 } 2042 O << ")"; 2043 if (RdxDesc.IntermediateStore) 2044 O << " (with final reduction value stored in invariant address sank " 2045 "outside of loop)"; 2046 } 2047 #endif 2048 2049 bool VPReplicateRecipe::shouldPack() const { 2050 // Find if the recipe is used by a widened recipe via an intervening 2051 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 2052 return any_of(users(), [](const VPUser *U) { 2053 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2054 return any_of(PredR->users(), [PredR](const VPUser *U) { 2055 return !U->usesScalars(PredR); 2056 }); 2057 return false; 2058 }); 2059 } 2060 2061 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2062 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2063 VPSlotTracker &SlotTracker) const { 2064 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2065 2066 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2067 printAsOperand(O, SlotTracker); 2068 O << " = "; 2069 } 2070 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2071 O << "call"; 2072 printFlags(O); 2073 O << "@" << CB->getCalledFunction()->getName() << "("; 2074 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2075 O, [&O, &SlotTracker](VPValue *Op) { 2076 Op->printAsOperand(O, SlotTracker); 2077 }); 2078 O << ")"; 2079 } else { 2080 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2081 printFlags(O); 2082 printOperands(O, SlotTracker); 2083 } 2084 2085 if (shouldPack()) 2086 O << " (S->V)"; 2087 } 2088 #endif 2089 2090 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 2091 /// if it is either defined outside the vector region or its operand is known to 2092 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 2093 /// TODO: Uniformity should be associated with a VPValue and there should be a 2094 /// generic way to check. 
2095 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { 2096 return C->isDefinedOutsideVectorRegions() || 2097 isa<VPDerivedIVRecipe>(C->getOperand(0)) || 2098 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)); 2099 } 2100 2101 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { 2102 assert(vputils::onlyFirstLaneUsed(this) && 2103 "Codegen only implemented for first lane."); 2104 switch (Opcode) { 2105 case Instruction::SExt: 2106 case Instruction::ZExt: 2107 case Instruction::Trunc: { 2108 // Note: SExt/ZExt not used yet. 2109 Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); 2110 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2111 } 2112 default: 2113 llvm_unreachable("opcode not implemented yet"); 2114 } 2115 } 2116 2117 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2118 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); 2119 for (unsigned Part = 0; Part != State.UF; ++Part) { 2120 Value *Res; 2121 // Only generate a single instance, if the recipe is uniform across UFs and 2122 // VFs. 2123 if (Part > 0 && IsUniformAcrossVFsAndUFs) 2124 Res = State.get(this, VPIteration(0, 0)); 2125 else 2126 Res = generate(State, Part); 2127 State.set(this, Res, VPIteration(Part, 0)); 2128 } 2129 } 2130 2131 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2132 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2133 VPSlotTracker &SlotTracker) const { 2134 O << Indent << "SCALAR-CAST "; 2135 printAsOperand(O, SlotTracker); 2136 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2137 printOperands(O, SlotTracker); 2138 O << " to " << *ResultTy; 2139 } 2140 #endif 2141 2142 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2143 assert(State.Instance && "Branch on Mask works only on single instance."); 2144 2145 unsigned Part = State.Instance->Part; 2146 unsigned Lane = State.Instance->Lane.getKnownLane(); 2147 2148 Value *ConditionBit = nullptr; 2149 VPValue *BlockInMask = getMask(); 2150 if (BlockInMask) { 2151 ConditionBit = State.get(BlockInMask, Part); 2152 if (ConditionBit->getType()->isVectorTy()) 2153 ConditionBit = State.Builder.CreateExtractElement( 2154 ConditionBit, State.Builder.getInt32(Lane)); 2155 } else // Block in mask is all-one. 2156 ConditionBit = State.Builder.getTrue(); 2157 2158 // Replace the temporary unreachable terminator with a new conditional branch, 2159 // whose two destinations will be set later when they are created. 
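  // (A placeholder successor is passed to BranchInst::Create and cleared right
  // after, since the real target block does not exist yet at this point.)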
2160 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2161 assert(isa<UnreachableInst>(CurrentTerminator) && 2162 "Expected to replace unreachable terminator with conditional branch."); 2163 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2164 CondBr->setSuccessor(0, nullptr); 2165 ReplaceInstWithInst(CurrentTerminator, CondBr); 2166 } 2167 2168 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2169 assert(State.Instance && "Predicated instruction PHI works per instance."); 2170 Instruction *ScalarPredInst = 2171 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 2172 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2173 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2174 assert(PredicatingBB && "Predicated block has no single predecessor."); 2175 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2176 "operand must be VPReplicateRecipe"); 2177 2178 // By current pack/unpack logic we need to generate only a single phi node: if 2179 // a vector value for the predicated instruction exists at this point it means 2180 // the instruction has vector users only, and a phi for the vector value is 2181 // needed. In this case the recipe of the predicated instruction is marked to 2182 // also do that packing, thereby "hoisting" the insert-element sequence. 2183 // Otherwise, a phi node for the scalar value is needed. 2184 unsigned Part = State.Instance->Part; 2185 if (State.hasVectorValue(getOperand(0), Part)) { 2186 Value *VectorValue = State.get(getOperand(0), Part); 2187 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2188 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2189 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2190 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2191 if (State.hasVectorValue(this, Part)) 2192 State.reset(this, VPhi, Part); 2193 else 2194 State.set(this, VPhi, Part); 2195 // NOTE: Currently we need to update the value of the operand, so the next 2196 // predicated iteration inserts its generated value in the correct vector. 2197 State.reset(getOperand(0), VPhi, Part); 2198 } else { 2199 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2200 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2201 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2202 PredicatingBB); 2203 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2204 if (State.hasScalarValue(this, *State.Instance)) 2205 State.reset(this, Phi, *State.Instance); 2206 else 2207 State.set(this, Phi, *State.Instance); 2208 // NOTE: Currently we need to update the value of the operand, so the next 2209 // predicated iteration inserts its generated value in the correct vector. 
2210 State.reset(getOperand(0), Phi, *State.Instance); 2211 } 2212 } 2213 2214 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2215 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2216 VPSlotTracker &SlotTracker) const { 2217 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2218 printAsOperand(O, SlotTracker); 2219 O << " = "; 2220 printOperands(O, SlotTracker); 2221 } 2222 #endif 2223 2224 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2225 VPCostContext &Ctx) const { 2226 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2227 const Align Alignment = 2228 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2229 unsigned AS = 2230 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2231 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2232 2233 if (!Consecutive) { 2234 // TODO: Using the original IR may not be accurate. 2235 // Currently, ARM will use the underlying IR to calculate gather/scatter 2236 // instruction cost. 2237 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2238 assert(!Reverse && 2239 "Inconsecutive memory access should not have the order."); 2240 return Ctx.TTI.getAddressComputationCost(Ty) + 2241 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2242 IsMasked, Alignment, CostKind, 2243 &Ingredient); 2244 } 2245 2246 InstructionCost Cost = 0; 2247 if (IsMasked) { 2248 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2249 AS, CostKind); 2250 } else { 2251 TTI::OperandValueInfo OpInfo = 2252 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2253 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2254 CostKind, OpInfo, &Ingredient); 2255 } 2256 if (!Reverse) 2257 return Cost; 2258 2259 return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2260 cast<VectorType>(Ty), std::nullopt, 2261 CostKind, 0); 2262 } 2263 2264 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2265 auto *LI = cast<LoadInst>(&Ingredient); 2266 2267 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2268 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2269 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2270 bool CreateGather = !isConsecutive(); 2271 2272 auto &Builder = State.Builder; 2273 State.setDebugLocFrom(getDebugLoc()); 2274 for (unsigned Part = 0; Part < State.UF; ++Part) { 2275 Value *NewLI; 2276 Value *Mask = nullptr; 2277 if (auto *VPMask = getMask()) { 2278 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2279 // of a null all-one mask is a null mask. 2280 Mask = State.get(VPMask, Part); 2281 if (isReverse()) 2282 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2283 } 2284 2285 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather); 2286 if (CreateGather) { 2287 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2288 "wide.masked.gather"); 2289 } else if (Mask) { 2290 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2291 PoisonValue::get(DataTy), 2292 "wide.masked.load"); 2293 } else { 2294 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2295 } 2296 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
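    // The reversed shuffle, not the raw wide load, is recorded as this
    // recipe's value so that element order matches the original scalar
    // accesses.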
2297 State.addMetadata(NewLI, LI); 2298 if (Reverse) 2299 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2300 State.set(this, NewLI, Part); 2301 } 2302 } 2303 2304 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2305 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2306 VPSlotTracker &SlotTracker) const { 2307 O << Indent << "WIDEN "; 2308 printAsOperand(O, SlotTracker); 2309 O << " = load "; 2310 printOperands(O, SlotTracker); 2311 } 2312 #endif 2313 2314 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2315 /// dependence w/o affecting the result. 2316 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2317 Value *EVL, const Twine &Name) { 2318 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2319 Value *AllTrueMask = 2320 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2321 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2322 {Operand, AllTrueMask, EVL}, nullptr, Name); 2323 } 2324 2325 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2326 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 2327 "explicit vector length."); 2328 auto *LI = cast<LoadInst>(&Ingredient); 2329 2330 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2331 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2332 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2333 bool CreateGather = !isConsecutive(); 2334 2335 auto &Builder = State.Builder; 2336 State.setDebugLocFrom(getDebugLoc()); 2337 CallInst *NewLI; 2338 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2339 Value *Addr = State.get(getAddr(), 0, !CreateGather); 2340 Value *Mask = nullptr; 2341 if (VPValue *VPMask = getMask()) { 2342 Mask = State.get(VPMask, 0); 2343 if (isReverse()) 2344 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2345 } else { 2346 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2347 } 2348 2349 if (CreateGather) { 2350 NewLI = 2351 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2352 nullptr, "wide.masked.gather"); 2353 } else { 2354 VectorBuilder VBuilder(Builder); 2355 VBuilder.setEVL(EVL).setMask(Mask); 2356 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2357 Instruction::Load, DataTy, Addr, "vp.op.load")); 2358 } 2359 NewLI->addParamAttr( 2360 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2361 State.addMetadata(NewLI, LI); 2362 Instruction *Res = NewLI; 2363 if (isReverse()) 2364 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2365 State.set(this, Res, 0); 2366 } 2367 2368 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2369 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2370 VPSlotTracker &SlotTracker) const { 2371 O << Indent << "WIDEN "; 2372 printAsOperand(O, SlotTracker); 2373 O << " = vp.load "; 2374 printOperands(O, SlotTracker); 2375 } 2376 #endif 2377 2378 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2379 auto *SI = cast<StoreInst>(&Ingredient); 2380 2381 VPValue *StoredVPValue = getStoredValue(); 2382 bool CreateScatter = !isConsecutive(); 2383 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2384 2385 auto &Builder = State.Builder; 2386 State.setDebugLocFrom(getDebugLoc()); 2387 2388 for (unsigned Part = 0; Part < State.UF; ++Part) { 2389 Instruction *NewSI = nullptr; 2390 Value *Mask = nullptr; 2391 if (auto *VPMask = getMask()) { 2392 // Mask reversal is only needed for non-all-one 
(null) masks, as reverse 2393 // of a null all-one mask is a null mask. 2394 Mask = State.get(VPMask, Part); 2395 if (isReverse()) 2396 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2397 } 2398 2399 Value *StoredVal = State.get(StoredVPValue, Part); 2400 if (isReverse()) { 2401 // If we store to reverse consecutive memory locations, then we need 2402 // to reverse the order of elements in the stored value. 2403 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2404 // We don't want to update the value in the map as it might be used in 2405 // another expression. So don't call resetVectorValue(StoredVal). 2406 } 2407 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); 2408 if (CreateScatter) 2409 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2410 else if (Mask) 2411 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2412 else 2413 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2414 State.addMetadata(NewSI, SI); 2415 } 2416 } 2417 2418 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2419 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2420 VPSlotTracker &SlotTracker) const { 2421 O << Indent << "WIDEN store "; 2422 printOperands(O, SlotTracker); 2423 } 2424 #endif 2425 2426 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2427 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 2428 "explicit vector length."); 2429 auto *SI = cast<StoreInst>(&Ingredient); 2430 2431 VPValue *StoredValue = getStoredValue(); 2432 bool CreateScatter = !isConsecutive(); 2433 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2434 2435 auto &Builder = State.Builder; 2436 State.setDebugLocFrom(getDebugLoc()); 2437 2438 CallInst *NewSI = nullptr; 2439 Value *StoredVal = State.get(StoredValue, 0); 2440 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2441 if (isReverse()) 2442 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2443 Value *Mask = nullptr; 2444 if (VPValue *VPMask = getMask()) { 2445 Mask = State.get(VPMask, 0); 2446 if (isReverse()) 2447 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2448 } else { 2449 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2450 } 2451 Value *Addr = State.get(getAddr(), 0, !CreateScatter); 2452 if (CreateScatter) { 2453 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2454 Intrinsic::vp_scatter, 2455 {StoredVal, Addr, Mask, EVL}); 2456 } else { 2457 VectorBuilder VBuilder(Builder); 2458 VBuilder.setEVL(EVL).setMask(Mask); 2459 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2460 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2461 {StoredVal, Addr})); 2462 } 2463 NewSI->addParamAttr( 2464 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2465 State.addMetadata(NewSI, SI); 2466 } 2467 2468 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2469 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2470 VPSlotTracker &SlotTracker) const { 2471 O << Indent << "WIDEN vp.store "; 2472 printOperands(O, SlotTracker); 2473 } 2474 #endif 2475 2476 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2477 VectorType *DstVTy, const DataLayout &DL) { 2478 // Verify that V is a vector type with same number of elements as DstVTy. 
2479 auto VF = DstVTy->getElementCount(); 2480 auto *SrcVecTy = cast<VectorType>(V->getType()); 2481 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2482 Type *SrcElemTy = SrcVecTy->getElementType(); 2483 Type *DstElemTy = DstVTy->getElementType(); 2484 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2485 "Vector elements must have same size"); 2486 2487 // Do a direct cast if element types are castable. 2488 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2489 return Builder.CreateBitOrPointerCast(V, DstVTy); 2490 } 2491 // V cannot be directly casted to desired vector type. 2492 // May happen when V is a floating point vector but DstVTy is a vector of 2493 // pointers or vice-versa. Handle this using a two-step bitcast using an 2494 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2495 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2496 "Only one type should be a pointer type"); 2497 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2498 "Only one type should be a floating point type"); 2499 Type *IntTy = 2500 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2501 auto *VecIntTy = VectorType::get(IntTy, VF); 2502 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2503 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2504 } 2505 2506 /// Return a vector containing interleaved elements from multiple 2507 /// smaller input vectors. 2508 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2509 const Twine &Name) { 2510 unsigned Factor = Vals.size(); 2511 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2512 2513 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2514 #ifndef NDEBUG 2515 for (Value *Val : Vals) 2516 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2517 #endif 2518 2519 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2520 // must use intrinsics to interleave. 2521 if (VecTy->isScalableTy()) { 2522 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2523 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2524 Vals, 2525 /*FMFSource=*/nullptr, Name); 2526 } 2527 2528 // Fixed length. Start by concatenating all vectors into a wide vector. 2529 Value *WideVec = concatenateVectors(Builder, Vals); 2530 2531 // Interleave the elements into the wide vector. 2532 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2533 return Builder.CreateShuffleVector( 2534 WideVec, createInterleaveMask(NumElts, Factor), Name); 2535 } 2536 2537 // Try to vectorize the interleave group that \p Instr belongs to. 2538 // 2539 // E.g. Translate following interleaved load group (factor = 3): 2540 // for (i = 0; i < N; i+=3) { 2541 // R = Pic[i]; // Member of index 0 2542 // G = Pic[i+1]; // Member of index 1 2543 // B = Pic[i+2]; // Member of index 2 2544 // ... // do something to R, G, B 2545 // } 2546 // To: 2547 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2548 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2549 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2550 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2551 // 2552 // Or translate following interleaved store group (factor = 3): 2553 // for (i = 0; i < N; i+=3) { 2554 // ... 
do something to R, G, B 2555 // Pic[i] = R; // Member of index 0 2556 // Pic[i+1] = G; // Member of index 1 2557 // Pic[i+2] = B; // Member of index 2 2558 // } 2559 // To: 2560 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2561 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2562 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2563 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2564 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2565 void VPInterleaveRecipe::execute(VPTransformState &State) { 2566 assert(!State.Instance && "Interleave group being replicated."); 2567 const InterleaveGroup<Instruction> *Group = IG; 2568 Instruction *Instr = Group->getInsertPos(); 2569 2570 // Prepare for the vector type of the interleaved load/store. 2571 Type *ScalarTy = getLoadStoreType(Instr); 2572 unsigned InterleaveFactor = Group->getFactor(); 2573 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2574 2575 // Prepare for the new pointers. 2576 SmallVector<Value *, 2> AddrParts; 2577 unsigned Index = Group->getIndex(Instr); 2578 2579 // TODO: extend the masked interleaved-group support to reversed access. 2580 VPValue *BlockInMask = getMask(); 2581 assert((!BlockInMask || !Group->isReverse()) && 2582 "Reversed masked interleave-group not supported."); 2583 2584 Value *Idx; 2585 // If the group is reverse, adjust the index to refer to the last vector lane 2586 // instead of the first. We adjust the index from the first vector lane, 2587 // rather than directly getting the pointer for lane VF - 1, because the 2588 // pointer operand of the interleaved access is supposed to be uniform. For 2589 // uniform instructions, we're only required to generate a value for the 2590 // first vector lane in each unroll iteration. 2591 if (Group->isReverse()) { 2592 Value *RuntimeVF = 2593 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2594 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2595 Idx = State.Builder.CreateMul(Idx, 2596 State.Builder.getInt32(Group->getFactor())); 2597 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); 2598 Idx = State.Builder.CreateNeg(Idx); 2599 } else 2600 Idx = State.Builder.getInt32(-Index); 2601 2602 VPValue *Addr = getAddr(); 2603 for (unsigned Part = 0; Part < State.UF; Part++) { 2604 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2605 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2606 State.setDebugLocFrom(I->getDebugLoc()); 2607 2608 // Notice current instruction could be any index. Need to adjust the address 2609 // to the member of index 0. 2610 // 2611 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2612 // b = A[i]; // Member of index 0 2613 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2614 // 2615 // E.g. A[i+1] = a; // Member of index 1 2616 // A[i] = b; // Member of index 0 2617 // A[i+2] = c; // Member of index 2 (Current instruction) 2618 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
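    // The adjustment below is a GEP by the (negative) element offset Idx
    // computed above; for reverse groups the offset also accounts for the
    // last vector lane.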
2619 2620 bool InBounds = false; 2621 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2622 InBounds = gep->isInBounds(); 2623 AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2624 AddrParts.push_back(AddrPart); 2625 } 2626 2627 State.setDebugLocFrom(Instr->getDebugLoc()); 2628 Value *PoisonVec = PoisonValue::get(VecTy); 2629 2630 auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor]( 2631 unsigned Part, Value *MaskForGaps) -> Value * { 2632 if (State.VF.isScalable()) { 2633 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2634 assert(InterleaveFactor == 2 && 2635 "Unsupported deinterleave factor for scalable vectors"); 2636 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2637 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2638 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2639 State.VF.getKnownMinValue() * 2, true); 2640 return State.Builder.CreateIntrinsic( 2641 MaskTy, Intrinsic::vector_interleave2, Ops, 2642 /*FMFSource=*/nullptr, "interleaved.mask"); 2643 } 2644 2645 if (!BlockInMask) 2646 return MaskForGaps; 2647 2648 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2649 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2650 BlockInMaskPart, 2651 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2652 "interleaved.mask"); 2653 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2654 ShuffledMask, MaskForGaps) 2655 : ShuffledMask; 2656 }; 2657 2658 const DataLayout &DL = Instr->getDataLayout(); 2659 // Vectorize the interleaved load group. 2660 if (isa<LoadInst>(Instr)) { 2661 Value *MaskForGaps = nullptr; 2662 if (NeedsMaskForGaps) { 2663 MaskForGaps = createBitMaskForGaps(State.Builder, 2664 State.VF.getKnownMinValue(), *Group); 2665 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2666 } 2667 2668 // For each unroll part, create a wide load for the group. 2669 SmallVector<Value *, 2> NewLoads; 2670 for (unsigned Part = 0; Part < State.UF; Part++) { 2671 Instruction *NewLoad; 2672 if (BlockInMask || MaskForGaps) { 2673 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2674 NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part], 2675 Group->getAlign(), GroupMask, 2676 PoisonVec, "wide.masked.vec"); 2677 } else 2678 NewLoad = State.Builder.CreateAlignedLoad( 2679 VecTy, AddrParts[Part], Group->getAlign(), "wide.vec"); 2680 Group->addMetadata(NewLoad); 2681 NewLoads.push_back(NewLoad); 2682 } 2683 2684 ArrayRef<VPValue *> VPDefs = definedValues(); 2685 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2686 if (VecTy->isScalableTy()) { 2687 assert(InterleaveFactor == 2 && 2688 "Unsupported deinterleave factor for scalable vectors"); 2689 2690 for (unsigned Part = 0; Part < State.UF; ++Part) { 2691 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2692 // so must use intrinsics to deinterleave. 2693 Value *DI = State.Builder.CreateIntrinsic( 2694 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part], 2695 /*FMFSource=*/nullptr, "strided.vec"); 2696 unsigned J = 0; 2697 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2698 Instruction *Member = Group->getMember(I); 2699 2700 if (!Member) 2701 continue; 2702 2703 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2704 // If this member has different type, cast the result type. 
2705 if (Member->getType() != ScalarTy) { 2706 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2707 StridedVec = 2708 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2709 } 2710 2711 if (Group->isReverse()) 2712 StridedVec = 2713 State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2714 2715 State.set(VPDefs[J], StridedVec, Part); 2716 ++J; 2717 } 2718 } 2719 2720 return; 2721 } 2722 2723 // For each member in the group, shuffle out the appropriate data from the 2724 // wide loads. 2725 unsigned J = 0; 2726 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2727 Instruction *Member = Group->getMember(I); 2728 2729 // Skip the gaps in the group. 2730 if (!Member) 2731 continue; 2732 2733 auto StrideMask = 2734 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 2735 for (unsigned Part = 0; Part < State.UF; Part++) { 2736 Value *StridedVec = State.Builder.CreateShuffleVector( 2737 NewLoads[Part], StrideMask, "strided.vec"); 2738 2739 // If this member has different type, cast the result type. 2740 if (Member->getType() != ScalarTy) { 2741 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 2742 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2743 StridedVec = 2744 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2745 } 2746 2747 if (Group->isReverse()) 2748 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2749 2750 State.set(VPDefs[J], StridedVec, Part); 2751 } 2752 ++J; 2753 } 2754 return; 2755 } 2756 2757 // The sub vector type for current instruction. 2758 auto *SubVT = VectorType::get(ScalarTy, State.VF); 2759 2760 // Vectorize the interleaved store group. 2761 Value *MaskForGaps = 2762 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 2763 assert((!MaskForGaps || !State.VF.isScalable()) && 2764 "masking gaps for scalable vectors is not yet supported."); 2765 ArrayRef<VPValue *> StoredValues = getStoredValues(); 2766 for (unsigned Part = 0; Part < State.UF; Part++) { 2767 // Collect the stored vector from each member. 2768 SmallVector<Value *, 4> StoredVecs; 2769 unsigned StoredIdx = 0; 2770 for (unsigned i = 0; i < InterleaveFactor; i++) { 2771 assert((Group->getMember(i) || MaskForGaps) && 2772 "Fail to get a member from an interleaved store group"); 2773 Instruction *Member = Group->getMember(i); 2774 2775 // Skip the gaps in the group. 2776 if (!Member) { 2777 Value *Undef = PoisonValue::get(SubVT); 2778 StoredVecs.push_back(Undef); 2779 continue; 2780 } 2781 2782 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2783 ++StoredIdx; 2784 2785 if (Group->isReverse()) 2786 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 2787 2788 // If this member has different type, cast it to a unified type. 2789 2790 if (StoredVec->getType() != SubVT) 2791 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 2792 2793 StoredVecs.push_back(StoredVec); 2794 } 2795 2796 // Interleave all the smaller vectors into one wider vector. 
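    // For scalable vectors interleaveVectors lowers to the
    // llvm.vector.interleave2 intrinsic; for fixed-width vectors it
    // concatenates the inputs and applies an interleaving shufflevector.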
2797 Value *IVec = 2798 interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 2799 Instruction *NewStoreInstr; 2800 if (BlockInMask || MaskForGaps) { 2801 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2802 NewStoreInstr = State.Builder.CreateMaskedStore( 2803 IVec, AddrParts[Part], Group->getAlign(), GroupMask); 2804 } else 2805 NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part], 2806 Group->getAlign()); 2807 2808 Group->addMetadata(NewStoreInstr); 2809 } 2810 } 2811 2812 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2813 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 2814 VPSlotTracker &SlotTracker) const { 2815 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 2816 IG->getInsertPos()->printAsOperand(O, false); 2817 O << ", "; 2818 getAddr()->printAsOperand(O, SlotTracker); 2819 VPValue *Mask = getMask(); 2820 if (Mask) { 2821 O << ", "; 2822 Mask->printAsOperand(O, SlotTracker); 2823 } 2824 2825 unsigned OpIdx = 0; 2826 for (unsigned i = 0; i < IG->getFactor(); ++i) { 2827 if (!IG->getMember(i)) 2828 continue; 2829 if (getNumStoreOperands() > 0) { 2830 O << "\n" << Indent << " store "; 2831 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 2832 O << " to index " << i; 2833 } else { 2834 O << "\n" << Indent << " "; 2835 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 2836 O << " = load from index " << i; 2837 } 2838 ++OpIdx; 2839 } 2840 } 2841 #endif 2842 2843 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 2844 Value *Start = getStartValue()->getLiveInIRValue(); 2845 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); 2846 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2847 2848 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2849 EntryPart->addIncoming(Start, VectorPH); 2850 EntryPart->setDebugLoc(getDebugLoc()); 2851 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2852 State.set(this, EntryPart, Part, /*IsScalar*/ true); 2853 } 2854 2855 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2856 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2857 VPSlotTracker &SlotTracker) const { 2858 O << Indent << "EMIT "; 2859 printAsOperand(O, SlotTracker); 2860 O << " = CANONICAL-INDUCTION "; 2861 printOperands(O, SlotTracker); 2862 } 2863 #endif 2864 2865 bool VPCanonicalIVPHIRecipe::isCanonical( 2866 InductionDescriptor::InductionKind Kind, VPValue *Start, 2867 VPValue *Step) const { 2868 // Must be an integer induction. 2869 if (Kind != InductionDescriptor::IK_IntInduction) 2870 return false; 2871 // Start must match the start value of this canonical induction. 2872 if (Start != getStartValue()) 2873 return false; 2874 2875 // If the step is defined by a recipe, it is not a ConstantInt. 
2876 if (Step->getDefiningRecipe()) 2877 return false; 2878 2879 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 2880 return StepC && StepC->isOne(); 2881 } 2882 2883 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 2884 return IsScalarAfterVectorization && 2885 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 2886 } 2887 2888 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 2889 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 2890 "Not a pointer induction according to InductionDescriptor!"); 2891 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 2892 "Unexpected type."); 2893 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 2894 "Recipe should have been replaced"); 2895 2896 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 2897 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true)); 2898 Type *PhiType = IndDesc.getStep()->getType(); 2899 2900 // Build a pointer phi 2901 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 2902 Type *ScStValueType = ScalarStartValue->getType(); 2903 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 2904 CanonicalIV->getIterator()); 2905 2906 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2907 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 2908 2909 // A pointer induction, performed by using a gep 2910 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 2911 2912 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 2913 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 2914 Value *NumUnrolledElems = 2915 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 2916 Value *InductionGEP = GetElementPtrInst::Create( 2917 State.Builder.getInt8Ty(), NewPointerPhi, 2918 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 2919 InductionLoc); 2920 // Add induction update using an incorrect block temporarily. The phi node 2921 // will be fixed after VPlan execution. Note that at this point the latch 2922 // block cannot be used, as it does not exist yet. 2923 // TODO: Model increment value in VPlan, by turning the recipe into a 2924 // multi-def and a subclass of VPHeaderPHIRecipe. 2925 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 2926 2927 // Create UF many actual address geps that use the pointer 2928 // phi as base and a vectorized version of the step value 2929 // (<step*0, ..., step*N>) as offset. 2930 for (unsigned Part = 0; Part < State.UF; ++Part) { 2931 Type *VecPhiType = VectorType::get(PhiType, State.VF); 2932 Value *StartOffsetScalar = 2933 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 2934 Value *StartOffset = 2935 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 2936 // Create a vector of consecutive numbers from zero to VF. 
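    // (More precisely <Part * VF, ..., Part * VF + VF - 1>, since StartOffset
    // already carries the per-part offset computed above.)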
2937 StartOffset = State.Builder.CreateAdd( 2938 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 2939 2940 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 2941 "scalar step must be the same across all parts"); 2942 Value *GEP = State.Builder.CreateGEP( 2943 State.Builder.getInt8Ty(), NewPointerPhi, 2944 State.Builder.CreateMul( 2945 StartOffset, 2946 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 2947 "vector.gep")); 2948 State.set(this, GEP, Part); 2949 } 2950 } 2951 2952 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2953 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2954 VPSlotTracker &SlotTracker) const { 2955 O << Indent << "EMIT "; 2956 printAsOperand(O, SlotTracker); 2957 O << " = WIDEN-POINTER-INDUCTION "; 2958 getStartValue()->printAsOperand(O, SlotTracker); 2959 O << ", " << *IndDesc.getStep(); 2960 } 2961 #endif 2962 2963 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 2964 assert(!State.Instance && "cannot be used in per-lane"); 2965 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2966 SCEVExpander Exp(SE, DL, "induction"); 2967 2968 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 2969 &*State.Builder.GetInsertPoint()); 2970 assert(!State.ExpandedSCEVs.contains(Expr) && 2971 "Same SCEV expanded multiple times"); 2972 State.ExpandedSCEVs[Expr] = Res; 2973 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2974 State.set(this, Res, {Part, 0}); 2975 } 2976 2977 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2978 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 2979 VPSlotTracker &SlotTracker) const { 2980 O << Indent << "EMIT "; 2981 getVPSingleValue()->printAsOperand(O, SlotTracker); 2982 O << " = EXPAND SCEV " << *Expr; 2983 } 2984 #endif 2985 2986 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 2987 Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); 2988 Type *STy = CanonicalIV->getType(); 2989 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 2990 ElementCount VF = State.VF; 2991 Value *VStart = VF.isScalar() 2992 ? CanonicalIV 2993 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 2994 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 2995 Value *VStep = createStepForVF(Builder, STy, VF, Part); 2996 if (VF.isVector()) { 2997 VStep = Builder.CreateVectorSplat(VF, VStep); 2998 VStep = 2999 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 3000 } 3001 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 3002 State.set(this, CanonicalVectorIV, Part); 3003 } 3004 } 3005 3006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3007 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 3008 VPSlotTracker &SlotTracker) const { 3009 O << Indent << "EMIT "; 3010 printAsOperand(O, SlotTracker); 3011 O << " = WIDEN-CANONICAL-INDUCTION "; 3012 printOperands(O, SlotTracker); 3013 } 3014 #endif 3015 3016 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 3017 auto &Builder = State.Builder; 3018 // Create a vector from the initial value. 3019 auto *VectorInit = getStartValue()->getLiveInIRValue(); 3020 3021 Type *VecTy = State.VF.isScalar() 3022 ? 
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
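    // Using the start value (rather than a neutral element) in every lane is
    // safe here: combining extra copies of the start value with min/max or
    // any-of does not change the final reduced result.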
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
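// The execute() below only adds the incoming mask from the preheader; the
// value coming in over the backedge is filled in once the latch block exists,
// after VPlan execution.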
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif