1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "VPlan.h" 15 #include "VPlanAnalysis.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/Twine.h" 19 #include "llvm/Analysis/IVDescriptors.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/IRBuilder.h" 22 #include "llvm/IR/Instruction.h" 23 #include "llvm/IR/Instructions.h" 24 #include "llvm/IR/Type.h" 25 #include "llvm/IR/Value.h" 26 #include "llvm/Support/Casting.h" 27 #include "llvm/Support/CommandLine.h" 28 #include "llvm/Support/Debug.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 31 #include "llvm/Transforms/Utils/LoopUtils.h" 32 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 33 #include <cassert> 34 35 using namespace llvm; 36 37 using VectorParts = SmallVector<Value *, 2>; 38 39 namespace llvm { 40 extern cl::opt<bool> EnableVPlanNativePath; 41 } 42 extern cl::opt<unsigned> ForceTargetInstructionCost; 43 44 #define LV_NAME "loop-vectorize" 45 #define DEBUG_TYPE LV_NAME 46 47 bool VPRecipeBase::mayWriteToMemory() const { 48 switch (getVPDefID()) { 49 case VPInterleaveSC: 50 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 51 case VPWidenStoreEVLSC: 52 case VPWidenStoreSC: 53 return true; 54 case VPReplicateSC: 55 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 56 ->mayWriteToMemory(); 57 case VPWidenCallSC: 58 return !cast<VPWidenCallRecipe>(this) 59 ->getCalledScalarFunction() 60 ->onlyReadsMemory(); 61 case VPBranchOnMaskSC: 62 case VPScalarIVStepsSC: 63 case VPPredInstPHISC: 64 return false; 65 case VPBlendSC: 66 case VPReductionSC: 67 case VPWidenCanonicalIVSC: 68 case VPWidenCastSC: 69 case VPWidenGEPSC: 70 case VPWidenIntOrFpInductionSC: 71 case VPWidenLoadEVLSC: 72 case VPWidenLoadSC: 73 case VPWidenPHISC: 74 case VPWidenSC: 75 case VPWidenSelectSC: { 76 const Instruction *I = 77 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 78 (void)I; 79 assert((!I || !I->mayWriteToMemory()) && 80 "underlying instruction may write to memory"); 81 return false; 82 } 83 default: 84 return true; 85 } 86 } 87 88 bool VPRecipeBase::mayReadFromMemory() const { 89 switch (getVPDefID()) { 90 case VPWidenLoadEVLSC: 91 case VPWidenLoadSC: 92 return true; 93 case VPReplicateSC: 94 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 95 ->mayReadFromMemory(); 96 case VPWidenCallSC: 97 return !cast<VPWidenCallRecipe>(this) 98 ->getCalledScalarFunction() 99 ->onlyWritesMemory(); 100 case VPBranchOnMaskSC: 101 case VPPredInstPHISC: 102 case VPScalarIVStepsSC: 103 case VPWidenStoreEVLSC: 104 case VPWidenStoreSC: 105 return false; 106 case VPBlendSC: 107 case VPReductionSC: 108 case VPWidenCanonicalIVSC: 109 case VPWidenCastSC: 110 case VPWidenGEPSC: 111 case VPWidenIntOrFpInductionSC: 112 case VPWidenPHISC: 113 case VPWidenSC: 114 case VPWidenSelectSC: { 115 const Instruction *I = 116 
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 117 (void)I; 118 assert((!I || !I->mayReadFromMemory()) && 119 "underlying instruction may read from memory"); 120 return false; 121 } 122 default: 123 return true; 124 } 125 } 126 127 bool VPRecipeBase::mayHaveSideEffects() const { 128 switch (getVPDefID()) { 129 case VPDerivedIVSC: 130 case VPPredInstPHISC: 131 case VPScalarCastSC: 132 return false; 133 case VPInstructionSC: 134 switch (cast<VPInstruction>(this)->getOpcode()) { 135 case Instruction::Or: 136 case Instruction::ICmp: 137 case Instruction::Select: 138 case VPInstruction::Not: 139 case VPInstruction::CalculateTripCountMinusVF: 140 case VPInstruction::CanonicalIVIncrementForPart: 141 case VPInstruction::ExtractFromEnd: 142 case VPInstruction::FirstOrderRecurrenceSplice: 143 case VPInstruction::LogicalAnd: 144 case VPInstruction::PtrAdd: 145 return false; 146 default: 147 return true; 148 } 149 case VPWidenCallSC: { 150 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); 151 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); 152 } 153 case VPBlendSC: 154 case VPReductionSC: 155 case VPScalarIVStepsSC: 156 case VPWidenCanonicalIVSC: 157 case VPWidenCastSC: 158 case VPWidenGEPSC: 159 case VPWidenIntOrFpInductionSC: 160 case VPWidenPHISC: 161 case VPWidenPointerInductionSC: 162 case VPWidenSC: 163 case VPWidenSelectSC: { 164 const Instruction *I = 165 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 166 (void)I; 167 assert((!I || !I->mayHaveSideEffects()) && 168 "underlying instruction has side-effects"); 169 return false; 170 } 171 case VPInterleaveSC: 172 return mayWriteToMemory(); 173 case VPWidenLoadEVLSC: 174 case VPWidenLoadSC: 175 case VPWidenStoreEVLSC: 176 case VPWidenStoreSC: 177 assert( 178 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == 179 mayWriteToMemory() && 180 "mayHaveSideffects result for ingredient differs from this " 181 "implementation"); 182 return mayWriteToMemory(); 183 case VPReplicateSC: { 184 auto *R = cast<VPReplicateRecipe>(this); 185 return R->getUnderlyingInstr()->mayHaveSideEffects(); 186 } 187 default: 188 return true; 189 } 190 } 191 192 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { 193 VPValue *ExitValue = getOperand(0); 194 auto Lane = vputils::isUniformAfterVectorization(ExitValue) 195 ? 
VPLane::getFirstLane() 196 : VPLane::getLastLaneForVF(State.VF); 197 VPBasicBlock *MiddleVPBB = 198 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); 199 BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; 200 Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), 201 MiddleBB); 202 } 203 204 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 205 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { 206 O << "Live-out "; 207 getPhi()->printAsOperand(O); 208 O << " = "; 209 getOperand(0)->printAsOperand(O, SlotTracker); 210 O << "\n"; 211 } 212 #endif 213 214 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 215 assert(!Parent && "Recipe already in some VPBasicBlock"); 216 assert(InsertPos->getParent() && 217 "Insertion position not in any VPBasicBlock"); 218 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 219 } 220 221 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 222 iplist<VPRecipeBase>::iterator I) { 223 assert(!Parent && "Recipe already in some VPBasicBlock"); 224 assert(I == BB.end() || I->getParent() == &BB); 225 BB.insert(this, I); 226 } 227 228 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 229 assert(!Parent && "Recipe already in some VPBasicBlock"); 230 assert(InsertPos->getParent() && 231 "Insertion position not in any VPBasicBlock"); 232 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 233 } 234 235 void VPRecipeBase::removeFromParent() { 236 assert(getParent() && "Recipe not in any VPBasicBlock"); 237 getParent()->getRecipeList().remove(getIterator()); 238 Parent = nullptr; 239 } 240 241 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 242 assert(getParent() && "Recipe not in any VPBasicBlock"); 243 return getParent()->getRecipeList().erase(getIterator()); 244 } 245 246 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 247 removeFromParent(); 248 insertAfter(InsertPos); 249 } 250 251 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 252 iplist<VPRecipeBase>::iterator I) { 253 removeFromParent(); 254 insertBefore(BB, I); 255 } 256 257 /// Return the underlying instruction to be used for computing \p R's cost via 258 /// the legacy cost model. Return nullptr if there's no suitable instruction. 259 static Instruction *getInstructionForCost(const VPRecipeBase *R) { 260 if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) 261 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 262 if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) 263 return IG->getInsertPos(); 264 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) 265 return &WidenMem->getIngredient(); 266 return nullptr; 267 } 268 269 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 270 if (auto *UI = getInstructionForCost(this)) 271 if (Ctx.skipCostComputation(UI, VF.isVector())) 272 return 0; 273 274 InstructionCost RecipeCost = computeCost(VF, Ctx); 275 if (ForceTargetInstructionCost.getNumOccurrences() > 0 && 276 RecipeCost.isValid()) 277 RecipeCost = InstructionCost(ForceTargetInstructionCost); 278 279 LLVM_DEBUG({ 280 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 281 dump(); 282 }); 283 return RecipeCost; 284 } 285 286 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 287 VPCostContext &Ctx) const { 288 // Compute the cost for the recipe falling back to the legacy cost model using 289 // the underlying instruction. If there is no underlying instruction, returns 290 // 0. 
291 Instruction *UI = getInstructionForCost(this); 292 if (UI && isa<VPReplicateRecipe>(this)) { 293 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan 294 // transform, avoid computing their cost multiple times for now. 295 Ctx.SkipCostComputation.insert(UI); 296 } 297 return UI ? Ctx.getLegacyCost(UI, VF) : 0; 298 } 299 300 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 301 assert(OpType == OperationType::FPMathOp && 302 "recipe doesn't have fast math flags"); 303 FastMathFlags Res; 304 Res.setAllowReassoc(FMFs.AllowReassoc); 305 Res.setNoNaNs(FMFs.NoNaNs); 306 Res.setNoInfs(FMFs.NoInfs); 307 Res.setNoSignedZeros(FMFs.NoSignedZeros); 308 Res.setAllowReciprocal(FMFs.AllowReciprocal); 309 Res.setAllowContract(FMFs.AllowContract); 310 Res.setApproxFunc(FMFs.ApproxFunc); 311 return Res; 312 } 313 314 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 315 VPValue *A, VPValue *B, DebugLoc DL, 316 const Twine &Name) 317 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 318 Pred, DL), 319 Opcode(Opcode), Name(Name.str()) { 320 assert(Opcode == Instruction::ICmp && 321 "only ICmp predicates supported at the moment"); 322 } 323 324 VPInstruction::VPInstruction(unsigned Opcode, 325 std::initializer_list<VPValue *> Operands, 326 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 327 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 328 Opcode(Opcode), Name(Name.str()) { 329 // Make sure the VPInstruction is a floating-point operation. 330 assert(isFPMathOp() && "this op can't take fast-math flags"); 331 } 332 333 bool VPInstruction::doesGeneratePerAllLanes() const { 334 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 335 } 336 337 bool VPInstruction::canGenerateScalarForFirstLane() const { 338 if (Instruction::isBinaryOp(getOpcode())) 339 return true; 340 if (isVectorToScalar()) 341 return true; 342 switch (Opcode) { 343 case Instruction::ICmp: 344 case VPInstruction::BranchOnCond: 345 case VPInstruction::BranchOnCount: 346 case VPInstruction::CalculateTripCountMinusVF: 347 case VPInstruction::CanonicalIVIncrementForPart: 348 case VPInstruction::PtrAdd: 349 case VPInstruction::ExplicitVectorLength: 350 return true; 351 default: 352 return false; 353 } 354 } 355 356 Value *VPInstruction::generatePerLane(VPTransformState &State, 357 const VPIteration &Lane) { 358 IRBuilderBase &Builder = State.Builder; 359 360 assert(getOpcode() == VPInstruction::PtrAdd && 361 "only PtrAdd opcodes are supported for now"); 362 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 363 State.get(getOperand(1), Lane), Name); 364 } 365 366 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { 367 IRBuilderBase &Builder = State.Builder; 368 369 if (Instruction::isBinaryOp(getOpcode())) { 370 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 371 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); 372 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); 373 auto *Res = 374 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 375 if (auto *I = dyn_cast<Instruction>(Res)) 376 setFlags(I); 377 return Res; 378 } 379 380 switch (getOpcode()) { 381 case VPInstruction::Not: { 382 Value *A = State.get(getOperand(0), Part); 383 return Builder.CreateNot(A, Name); 384 } 385 case Instruction::ICmp: { 386 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 387 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); 388 
Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); 389 return Builder.CreateCmp(getPredicate(), A, B, Name); 390 } 391 case Instruction::Select: { 392 Value *Cond = State.get(getOperand(0), Part); 393 Value *Op1 = State.get(getOperand(1), Part); 394 Value *Op2 = State.get(getOperand(2), Part); 395 return Builder.CreateSelect(Cond, Op1, Op2, Name); 396 } 397 case VPInstruction::ActiveLaneMask: { 398 // Get first lane of vector induction variable. 399 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); 400 // Get the original loop tripcount. 401 Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); 402 403 // If this part of the active lane mask is scalar, generate the CMP directly 404 // to avoid unnecessary extracts. 405 if (State.VF.isScalar()) 406 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 407 Name); 408 409 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 410 auto *PredTy = VectorType::get(Int1Ty, State.VF); 411 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 412 {PredTy, ScalarTC->getType()}, 413 {VIVElem0, ScalarTC}, nullptr, Name); 414 } 415 case VPInstruction::FirstOrderRecurrenceSplice: { 416 // Generate code to combine the previous and current values in vector v3. 417 // 418 // vector.ph: 419 // v_init = vector(..., ..., ..., a[-1]) 420 // br vector.body 421 // 422 // vector.body 423 // i = phi [0, vector.ph], [i+4, vector.body] 424 // v1 = phi [v_init, vector.ph], [v2, vector.body] 425 // v2 = a[i, i+1, i+2, i+3]; 426 // v3 = vector(v1(3), v2(0, 1, 2)) 427 428 // For the first part, use the recurrence phi (v1), otherwise v2. 429 auto *V1 = State.get(getOperand(0), 0); 430 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); 431 if (!PartMinus1->getType()->isVectorTy()) 432 return PartMinus1; 433 Value *V2 = State.get(getOperand(1), Part); 434 return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); 435 } 436 case VPInstruction::CalculateTripCountMinusVF: { 437 if (Part != 0) 438 return State.get(this, 0, /*IsScalar*/ true); 439 440 Value *ScalarTC = State.get(getOperand(0), {0, 0}); 441 Value *Step = 442 createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); 443 Value *Sub = Builder.CreateSub(ScalarTC, Step); 444 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 445 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 446 return Builder.CreateSelect(Cmp, Sub, Zero); 447 } 448 case VPInstruction::ExplicitVectorLength: { 449 // Compute EVL 450 auto GetEVL = [=](VPTransformState &State, Value *AVL) { 451 assert(AVL->getType()->isIntegerTy() && 452 "Requested vector length should be an integer."); 453 454 // TODO: Add support for MaxSafeDist for correct loop emission. 455 assert(State.VF.isScalable() && "Expected scalable vector factor."); 456 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 457 458 Value *EVL = State.Builder.CreateIntrinsic( 459 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 460 {AVL, VFArg, State.Builder.getTrue()}); 461 return EVL; 462 }; 463 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 464 // be outside of the main loop. 465 assert(Part == 0 && "No unrolling expected for predicated vectorization."); 466 // Compute VTC - IV as the AVL (requested vector length). 
467 Value *Index = State.get(getOperand(0), VPIteration(0, 0)); 468 Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); 469 Value *AVL = State.Builder.CreateSub(TripCount, Index); 470 Value *EVL = GetEVL(State, AVL); 471 return EVL; 472 } 473 case VPInstruction::CanonicalIVIncrementForPart: { 474 auto *IV = State.get(getOperand(0), VPIteration(0, 0)); 475 if (Part == 0) 476 return IV; 477 478 // The canonical IV is incremented by the vectorization factor (num of SIMD 479 // elements) times the unroll part. 480 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 481 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 482 hasNoSignedWrap()); 483 } 484 case VPInstruction::BranchOnCond: { 485 if (Part != 0) 486 return nullptr; 487 488 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); 489 // Replace the temporary unreachable terminator with a new conditional 490 // branch, hooking it up to backward destination for exiting blocks now and 491 // to forward destination(s) later when they are created. 492 BranchInst *CondBr = 493 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 494 CondBr->setSuccessor(0, nullptr); 495 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 496 497 if (!getParent()->isExiting()) 498 return CondBr; 499 500 VPRegionBlock *ParentRegion = getParent()->getParent(); 501 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 502 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 503 return CondBr; 504 } 505 case VPInstruction::BranchOnCount: { 506 if (Part != 0) 507 return nullptr; 508 // First create the compare. 509 Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); 510 Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); 511 Value *Cond = Builder.CreateICmpEQ(IV, TC); 512 513 // Now create the branch. 514 auto *Plan = getParent()->getPlan(); 515 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 516 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 517 518 // Replace the temporary unreachable terminator with a new conditional 519 // branch, hooking it up to backward destination (the header) now and to the 520 // forward destination (the exit/middle block) later when it is created. 521 // Note that CreateCondBr expects a valid BB as first argument, so we need 522 // to set it to nullptr later. 523 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 524 State.CFG.VPBB2IRBB[Header]); 525 CondBr->setSuccessor(0, nullptr); 526 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 527 return CondBr; 528 } 529 case VPInstruction::ComputeReductionResult: { 530 if (Part != 0) 531 return State.get(this, 0, /*IsScalar*/ true); 532 533 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 534 // and will be removed by breaking up the recipe further. 535 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 536 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 537 // Get its reduction variable descriptor. 
538 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 539 540 RecurKind RK = RdxDesc.getRecurrenceKind(); 541 542 VPValue *LoopExitingDef = getOperand(1); 543 Type *PhiTy = OrigPhi->getType(); 544 VectorParts RdxParts(State.UF); 545 for (unsigned Part = 0; Part < State.UF; ++Part) 546 RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); 547 548 // If the vector reduction can be performed in a smaller type, we truncate 549 // then extend the loop exit value to enable InstCombine to evaluate the 550 // entire expression in the smaller type. 551 // TODO: Handle this in truncateToMinBW. 552 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 553 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 554 for (unsigned Part = 0; Part < State.UF; ++Part) 555 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 556 } 557 // Reduce all of the unrolled parts into a single vector. 558 Value *ReducedPartRdx = RdxParts[0]; 559 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 560 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 561 Op = Instruction::Or; 562 563 if (PhiR->isOrdered()) { 564 ReducedPartRdx = RdxParts[State.UF - 1]; 565 } else { 566 // Floating-point operations should have some FMF to enable the reduction. 567 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 568 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 569 for (unsigned Part = 1; Part < State.UF; ++Part) { 570 Value *RdxPart = RdxParts[Part]; 571 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 572 ReducedPartRdx = Builder.CreateBinOp( 573 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 574 else 575 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 576 } 577 } 578 579 // Create the reduction after the loop. Note that inloop reductions create 580 // the target reduction in the loop using a Reduction recipe. 581 if ((State.VF.isVector() || 582 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && 583 !PhiR->isInLoop()) { 584 ReducedPartRdx = 585 createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); 586 // If the reduction can be performed in a smaller type, we need to extend 587 // the reduction to the wider type before we branch to the original loop. 588 if (PhiTy != RdxDesc.getRecurrenceType()) 589 ReducedPartRdx = RdxDesc.isSigned() 590 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 591 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 592 } 593 594 // If there were stores of the reduction value to a uniform memory address 595 // inside the loop, create the final store here. 596 if (StoreInst *SI = RdxDesc.IntermediateStore) { 597 auto *NewSI = Builder.CreateAlignedStore( 598 ReducedPartRdx, SI->getPointerOperand(), SI->getAlign()); 599 propagateMetadata(NewSI, SI); 600 } 601 602 return ReducedPartRdx; 603 } 604 case VPInstruction::ExtractFromEnd: { 605 if (Part != 0) 606 return State.get(this, 0, /*IsScalar*/ true); 607 608 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); 609 unsigned Offset = CI->getZExtValue(); 610 assert(Offset > 0 && "Offset from end must be positive"); 611 Value *Res; 612 if (State.VF.isVector()) { 613 assert(Offset <= State.VF.getKnownMinValue() && 614 "invalid offset to extract from"); 615 // Extract lane VF - Offset from the operand. 
616 Res = State.get( 617 getOperand(0), 618 VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset))); 619 } else { 620 assert(Offset <= State.UF && "invalid offset to extract from"); 621 // When loop is unrolled without vectorizing, retrieve UF - Offset. 622 Res = State.get(getOperand(0), State.UF - Offset); 623 } 624 if (isa<ExtractElementInst>(Res)) 625 Res->setName(Name); 626 return Res; 627 } 628 case VPInstruction::LogicalAnd: { 629 Value *A = State.get(getOperand(0), Part); 630 Value *B = State.get(getOperand(1), Part); 631 return Builder.CreateLogicalAnd(A, B, Name); 632 } 633 case VPInstruction::PtrAdd: { 634 assert(vputils::onlyFirstLaneUsed(this) && 635 "can only generate first lane for PtrAdd"); 636 Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true); 637 Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true); 638 return Builder.CreatePtrAdd(Ptr, Addend, Name); 639 } 640 default: 641 llvm_unreachable("Unsupported opcode for instruction"); 642 } 643 } 644 645 bool VPInstruction::isVectorToScalar() const { 646 return getOpcode() == VPInstruction::ExtractFromEnd || 647 getOpcode() == VPInstruction::ComputeReductionResult; 648 } 649 650 #if !defined(NDEBUG) 651 bool VPInstruction::isFPMathOp() const { 652 // Inspired by FPMathOperator::classof. Notable differences are that we don't 653 // support Call, PHI and Select opcodes here yet. 654 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 655 Opcode == Instruction::FNeg || Opcode == Instruction::FSub || 656 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 657 Opcode == Instruction::FCmp || Opcode == Instruction::Select; 658 } 659 #endif 660 661 void VPInstruction::execute(VPTransformState &State) { 662 assert(!State.Instance && "VPInstruction executing an Instance"); 663 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 664 assert((hasFastMathFlags() == isFPMathOp() || 665 getOpcode() == Instruction::Select) && 666 "Recipe not a FPMathOp but has fast-math flags?"); 667 if (hasFastMathFlags()) 668 State.Builder.setFastMathFlags(getFastMathFlags()); 669 State.setDebugLocFrom(getDebugLoc()); 670 bool GeneratesPerFirstLaneOnly = 671 canGenerateScalarForFirstLane() && 672 (vputils::onlyFirstLaneUsed(this) || isVectorToScalar()); 673 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 674 bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this); 675 for (unsigned Part = 0; Part < State.UF; ++Part) { 676 if (GeneratesPerAllLanes) { 677 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); 678 Lane != NumLanes; ++Lane) { 679 Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane)); 680 assert(GeneratedValue && "generatePerLane must produce a value"); 681 State.set(this, GeneratedValue, VPIteration(Part, Lane)); 682 } 683 continue; 684 } 685 686 if (Part != 0 && OnlyFirstPartUsed && hasResult()) { 687 Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly); 688 State.set(this, Part0, Part, 689 /*IsScalar*/ GeneratesPerFirstLaneOnly); 690 continue; 691 } 692 693 Value *GeneratedValue = generatePerPart(State, Part); 694 if (!hasResult()) 695 continue; 696 assert(GeneratedValue && "generatePerPart must produce a value"); 697 assert((GeneratedValue->getType()->isVectorTy() == 698 !GeneratesPerFirstLaneOnly || 699 State.VF.isScalar()) && 700 "scalar value but not only first lane defined"); 701 State.set(this, GeneratedValue, Part, 702 /*IsScalar*/ GeneratesPerFirstLaneOnly); 703 } 704 } 705 706 bool 
VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 707 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 708 if (Instruction::isBinaryOp(getOpcode())) 709 return vputils::onlyFirstLaneUsed(this); 710 711 switch (getOpcode()) { 712 default: 713 return false; 714 case Instruction::ICmp: 715 case VPInstruction::PtrAdd: 716 // TODO: Cover additional opcodes. 717 return vputils::onlyFirstLaneUsed(this); 718 case VPInstruction::ActiveLaneMask: 719 case VPInstruction::ExplicitVectorLength: 720 case VPInstruction::CalculateTripCountMinusVF: 721 case VPInstruction::CanonicalIVIncrementForPart: 722 case VPInstruction::BranchOnCount: 723 case VPInstruction::BranchOnCond: 724 return true; 725 }; 726 llvm_unreachable("switch should return"); 727 } 728 729 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 730 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 731 if (Instruction::isBinaryOp(getOpcode())) 732 return vputils::onlyFirstPartUsed(this); 733 734 switch (getOpcode()) { 735 default: 736 return false; 737 case Instruction::ICmp: 738 case Instruction::Select: 739 return vputils::onlyFirstPartUsed(this); 740 case VPInstruction::BranchOnCount: 741 case VPInstruction::BranchOnCond: 742 case VPInstruction::CanonicalIVIncrementForPart: 743 return true; 744 }; 745 llvm_unreachable("switch should return"); 746 } 747 748 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 749 void VPInstruction::dump() const { 750 VPSlotTracker SlotTracker(getParent()->getPlan()); 751 print(dbgs(), "", SlotTracker); 752 } 753 754 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 755 VPSlotTracker &SlotTracker) const { 756 O << Indent << "EMIT "; 757 758 if (hasResult()) { 759 printAsOperand(O, SlotTracker); 760 O << " = "; 761 } 762 763 switch (getOpcode()) { 764 case VPInstruction::Not: 765 O << "not"; 766 break; 767 case VPInstruction::SLPLoad: 768 O << "combined load"; 769 break; 770 case VPInstruction::SLPStore: 771 O << "combined store"; 772 break; 773 case VPInstruction::ActiveLaneMask: 774 O << "active lane mask"; 775 break; 776 case VPInstruction::ExplicitVectorLength: 777 O << "EXPLICIT-VECTOR-LENGTH"; 778 break; 779 case VPInstruction::FirstOrderRecurrenceSplice: 780 O << "first-order splice"; 781 break; 782 case VPInstruction::BranchOnCond: 783 O << "branch-on-cond"; 784 break; 785 case VPInstruction::CalculateTripCountMinusVF: 786 O << "TC > VF ? 
TC - VF : 0"; 787 break; 788 case VPInstruction::CanonicalIVIncrementForPart: 789 O << "VF * Part +"; 790 break; 791 case VPInstruction::BranchOnCount: 792 O << "branch-on-count"; 793 break; 794 case VPInstruction::ExtractFromEnd: 795 O << "extract-from-end"; 796 break; 797 case VPInstruction::ComputeReductionResult: 798 O << "compute-reduction-result"; 799 break; 800 case VPInstruction::LogicalAnd: 801 O << "logical-and"; 802 break; 803 case VPInstruction::PtrAdd: 804 O << "ptradd"; 805 break; 806 default: 807 O << Instruction::getOpcodeName(getOpcode()); 808 } 809 810 printFlags(O); 811 printOperands(O, SlotTracker); 812 813 if (auto DL = getDebugLoc()) { 814 O << ", !dbg "; 815 DL.print(O); 816 } 817 } 818 #endif 819 820 void VPWidenCallRecipe::execute(VPTransformState &State) { 821 assert(State.VF.isVector() && "not widening"); 822 Function *CalledScalarFn = getCalledScalarFunction(); 823 assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) && 824 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 825 State.setDebugLocFrom(getDebugLoc()); 826 827 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; 828 FunctionType *VFTy = nullptr; 829 if (Variant) 830 VFTy = Variant->getFunctionType(); 831 for (unsigned Part = 0; Part < State.UF; ++Part) { 832 SmallVector<Type *, 2> TysForDecl; 833 // Add return type if intrinsic is overloaded on it. 834 if (UseIntrinsic && 835 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) 836 TysForDecl.push_back(VectorType::get( 837 CalledScalarFn->getReturnType()->getScalarType(), State.VF)); 838 SmallVector<Value *, 4> Args; 839 for (const auto &I : enumerate(arg_operands())) { 840 // Some intrinsics have a scalar argument - don't replace it with a 841 // vector. 842 Value *Arg; 843 if (UseIntrinsic && 844 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) 845 Arg = State.get(I.value(), VPIteration(0, 0)); 846 // Some vectorized function variants may also take a scalar argument, 847 // e.g. linear parameters for pointers. This needs to be the scalar value 848 // from the start of the respective part when interleaving. 849 else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) 850 Arg = State.get(I.value(), VPIteration(Part, 0)); 851 else 852 Arg = State.get(I.value(), Part); 853 if (UseIntrinsic && 854 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) 855 TysForDecl.push_back(Arg->getType()); 856 Args.push_back(Arg); 857 } 858 859 Function *VectorF; 860 if (UseIntrinsic) { 861 // Use vector version of the intrinsic. 
862 Module *M = State.Builder.GetInsertBlock()->getModule(); 863 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); 864 assert(VectorF && "Can't retrieve vector intrinsic."); 865 } else { 866 #ifndef NDEBUG 867 assert(Variant != nullptr && "Can't create vector function."); 868 #endif 869 VectorF = Variant; 870 } 871 872 auto *CI = cast_or_null<CallInst>(getUnderlyingInstr()); 873 SmallVector<OperandBundleDef, 1> OpBundles; 874 if (CI) 875 CI->getOperandBundlesAsDefs(OpBundles); 876 877 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 878 879 if (isa<FPMathOperator>(V)) 880 V->copyFastMathFlags(CI); 881 882 if (!V->getType()->isVoidTy()) 883 State.set(this, V, Part); 884 State.addMetadata(V, CI); 885 } 886 } 887 888 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 889 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 890 VPSlotTracker &SlotTracker) const { 891 O << Indent << "WIDEN-CALL "; 892 893 Function *CalledFn = getCalledScalarFunction(); 894 if (CalledFn->getReturnType()->isVoidTy()) 895 O << "void "; 896 else { 897 printAsOperand(O, SlotTracker); 898 O << " = "; 899 } 900 901 O << "call @" << CalledFn->getName() << "("; 902 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 903 Op->printAsOperand(O, SlotTracker); 904 }); 905 O << ")"; 906 907 if (VectorIntrinsicID) 908 O << " (using vector intrinsic)"; 909 else { 910 O << " (using library function"; 911 if (Variant->hasName()) 912 O << ": " << Variant->getName(); 913 O << ")"; 914 } 915 } 916 917 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 918 VPSlotTracker &SlotTracker) const { 919 O << Indent << "WIDEN-SELECT "; 920 printAsOperand(O, SlotTracker); 921 O << " = select "; 922 getOperand(0)->printAsOperand(O, SlotTracker); 923 O << ", "; 924 getOperand(1)->printAsOperand(O, SlotTracker); 925 O << ", "; 926 getOperand(2)->printAsOperand(O, SlotTracker); 927 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 928 } 929 #endif 930 931 void VPWidenSelectRecipe::execute(VPTransformState &State) { 932 State.setDebugLocFrom(getDebugLoc()); 933 934 // The condition can be loop invariant but still defined inside the 935 // loop. This means that we can't just use the original 'cond' value. 936 // We have to take the 'vectorized' value and pick the first lane. 937 // Instcombine will make this a no-op. 938 auto *InvarCond = 939 isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; 940 941 for (unsigned Part = 0; Part < State.UF; ++Part) { 942 Value *Cond = InvarCond ? 
InvarCond : State.get(getCond(), Part); 943 Value *Op0 = State.get(getOperand(1), Part); 944 Value *Op1 = State.get(getOperand(2), Part); 945 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 946 State.set(this, Sel, Part); 947 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 948 } 949 } 950 951 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 952 const FastMathFlags &FMF) { 953 AllowReassoc = FMF.allowReassoc(); 954 NoNaNs = FMF.noNaNs(); 955 NoInfs = FMF.noInfs(); 956 NoSignedZeros = FMF.noSignedZeros(); 957 AllowReciprocal = FMF.allowReciprocal(); 958 AllowContract = FMF.allowContract(); 959 ApproxFunc = FMF.approxFunc(); 960 } 961 962 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 963 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 964 switch (OpType) { 965 case OperationType::Cmp: 966 O << " " << CmpInst::getPredicateName(getPredicate()); 967 break; 968 case OperationType::DisjointOp: 969 if (DisjointFlags.IsDisjoint) 970 O << " disjoint"; 971 break; 972 case OperationType::PossiblyExactOp: 973 if (ExactFlags.IsExact) 974 O << " exact"; 975 break; 976 case OperationType::OverflowingBinOp: 977 if (WrapFlags.HasNUW) 978 O << " nuw"; 979 if (WrapFlags.HasNSW) 980 O << " nsw"; 981 break; 982 case OperationType::FPMathOp: 983 getFastMathFlags().print(O); 984 break; 985 case OperationType::GEPOp: 986 if (GEPFlags.IsInBounds) 987 O << " inbounds"; 988 break; 989 case OperationType::NonNegOp: 990 if (NonNegFlags.NonNeg) 991 O << " nneg"; 992 break; 993 case OperationType::Other: 994 break; 995 } 996 if (getNumOperands() > 0) 997 O << " "; 998 } 999 #endif 1000 1001 void VPWidenRecipe::execute(VPTransformState &State) { 1002 State.setDebugLocFrom(getDebugLoc()); 1003 auto &Builder = State.Builder; 1004 switch (Opcode) { 1005 case Instruction::Call: 1006 case Instruction::Br: 1007 case Instruction::PHI: 1008 case Instruction::GetElementPtr: 1009 case Instruction::Select: 1010 llvm_unreachable("This instruction is handled by a different recipe."); 1011 case Instruction::UDiv: 1012 case Instruction::SDiv: 1013 case Instruction::SRem: 1014 case Instruction::URem: 1015 case Instruction::Add: 1016 case Instruction::FAdd: 1017 case Instruction::Sub: 1018 case Instruction::FSub: 1019 case Instruction::FNeg: 1020 case Instruction::Mul: 1021 case Instruction::FMul: 1022 case Instruction::FDiv: 1023 case Instruction::FRem: 1024 case Instruction::Shl: 1025 case Instruction::LShr: 1026 case Instruction::AShr: 1027 case Instruction::And: 1028 case Instruction::Or: 1029 case Instruction::Xor: { 1030 // Just widen unops and binops. 1031 for (unsigned Part = 0; Part < State.UF; ++Part) { 1032 SmallVector<Value *, 2> Ops; 1033 for (VPValue *VPOp : operands()) 1034 Ops.push_back(State.get(VPOp, Part)); 1035 1036 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1037 1038 if (auto *VecOp = dyn_cast<Instruction>(V)) 1039 setFlags(VecOp); 1040 1041 // Use this vector value for all users of the original instruction. 1042 State.set(this, V, Part); 1043 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1044 } 1045 1046 break; 1047 } 1048 case Instruction::Freeze: { 1049 for (unsigned Part = 0; Part < State.UF; ++Part) { 1050 Value *Op = State.get(getOperand(0), Part); 1051 1052 Value *Freeze = Builder.CreateFreeze(Op); 1053 State.set(this, Freeze, Part); 1054 } 1055 break; 1056 } 1057 case Instruction::ICmp: 1058 case Instruction::FCmp: { 1059 // Widen compares. Generate vector compares. 
1060 bool FCmp = Opcode == Instruction::FCmp; 1061 for (unsigned Part = 0; Part < State.UF; ++Part) { 1062 Value *A = State.get(getOperand(0), Part); 1063 Value *B = State.get(getOperand(1), Part); 1064 Value *C = nullptr; 1065 if (FCmp) { 1066 // Propagate fast math flags. 1067 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1068 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) 1069 Builder.setFastMathFlags(I->getFastMathFlags()); 1070 C = Builder.CreateFCmp(getPredicate(), A, B); 1071 } else { 1072 C = Builder.CreateICmp(getPredicate(), A, B); 1073 } 1074 State.set(this, C, Part); 1075 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1076 } 1077 1078 break; 1079 } 1080 default: 1081 // This instruction is not vectorized by simple widening. 1082 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1083 << Instruction::getOpcodeName(Opcode)); 1084 llvm_unreachable("Unhandled instruction!"); 1085 } // end of switch. 1086 1087 #if !defined(NDEBUG) 1088 // Verify that VPlan type inference results agree with the type of the 1089 // generated values. 1090 for (unsigned Part = 0; Part < State.UF; ++Part) { 1091 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), 1092 State.VF) == State.get(this, Part)->getType() && 1093 "inferred type and type from generated instructions do not match"); 1094 } 1095 #endif 1096 } 1097 1098 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1099 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 1100 VPSlotTracker &SlotTracker) const { 1101 O << Indent << "WIDEN "; 1102 printAsOperand(O, SlotTracker); 1103 O << " = " << Instruction::getOpcodeName(Opcode); 1104 printFlags(O); 1105 printOperands(O, SlotTracker); 1106 } 1107 #endif 1108 1109 void VPWidenCastRecipe::execute(VPTransformState &State) { 1110 State.setDebugLocFrom(getDebugLoc()); 1111 auto &Builder = State.Builder; 1112 /// Vectorize casts. 1113 assert(State.VF.isVector() && "Not vectorizing?"); 1114 Type *DestTy = VectorType::get(getResultType(), State.VF); 1115 VPValue *Op = getOperand(0); 1116 for (unsigned Part = 0; Part < State.UF; ++Part) { 1117 if (Part > 0 && Op->isLiveIn()) { 1118 // FIXME: Remove once explicit unrolling is implemented using VPlan. 1119 State.set(this, State.get(this, 0), Part); 1120 continue; 1121 } 1122 Value *A = State.get(Op, Part); 1123 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 1124 State.set(this, Cast, Part); 1125 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 1126 } 1127 } 1128 1129 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1130 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1131 VPSlotTracker &SlotTracker) const { 1132 O << Indent << "WIDEN-CAST "; 1133 printAsOperand(O, SlotTracker); 1134 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1135 printFlags(O); 1136 printOperands(O, SlotTracker); 1137 O << " to " << *getResultType(); 1138 } 1139 #endif 1140 1141 /// This function adds 1142 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 1143 /// to each vector element of Val. The sequence starts at StartIndex. 1144 /// \p Opcode is relevant for FP induction variable. 1145 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 1146 Instruction::BinaryOps BinOp, ElementCount VF, 1147 IRBuilderBase &Builder) { 1148 assert(VF.isVector() && "only vector VFs are supported"); 1149 1150 // Create and check the types. 
1151 auto *ValVTy = cast<VectorType>(Val->getType()); 1152 ElementCount VLen = ValVTy->getElementCount(); 1153 1154 Type *STy = Val->getType()->getScalarType(); 1155 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1156 "Induction Step must be an integer or FP"); 1157 assert(Step->getType() == STy && "Step has wrong type"); 1158 1159 SmallVector<Constant *, 8> Indices; 1160 1161 // Create a vector of consecutive numbers from zero to VF. 1162 VectorType *InitVecValVTy = ValVTy; 1163 if (STy->isFloatingPointTy()) { 1164 Type *InitVecValSTy = 1165 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 1166 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 1167 } 1168 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 1169 1170 // Splat the StartIdx 1171 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 1172 1173 if (STy->isIntegerTy()) { 1174 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 1175 Step = Builder.CreateVectorSplat(VLen, Step); 1176 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1177 // FIXME: The newly created binary instructions should contain nsw/nuw 1178 // flags, which can be found from the original scalar operations. 1179 Step = Builder.CreateMul(InitVec, Step); 1180 return Builder.CreateAdd(Val, Step, "induction"); 1181 } 1182 1183 // Floating point induction. 1184 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1185 "Binary Opcode should be specified for FP induction"); 1186 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 1187 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 1188 1189 Step = Builder.CreateVectorSplat(VLen, Step); 1190 Value *MulOp = Builder.CreateFMul(InitVec, Step); 1191 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1192 } 1193 1194 /// A helper function that returns an integer or floating-point constant with 1195 /// value C. 1196 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 1197 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 1198 : ConstantFP::get(Ty, C); 1199 } 1200 1201 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 1202 ElementCount VF) { 1203 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1204 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1205 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1206 return B.CreateUIToFP(RuntimeVF, FTy); 1207 } 1208 1209 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1210 assert(!State.Instance && "Int or FP induction being replicated."); 1211 1212 Value *Start = getStartValue()->getLiveInIRValue(); 1213 const InductionDescriptor &ID = getInductionDescriptor(); 1214 TruncInst *Trunc = getTruncInst(); 1215 IRBuilderBase &Builder = State.Builder; 1216 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1217 assert(State.VF.isVector() && "must have vector VF"); 1218 1219 // The value from the original loop to which we are mapping the new induction 1220 // variable. 1221 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1222 1223 // Fast-math-flags propagate from the original induction instruction. 1224 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1225 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1226 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1227 1228 // Now do the actual transformations, and start with fetching the step value. 
1229 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1230 1231 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1232 "Expected either an induction phi-node or a truncate of it!"); 1233 1234 // Construct the initial value of the vector IV in the vector loop preheader 1235 auto CurrIP = Builder.saveIP(); 1236 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1237 Builder.SetInsertPoint(VectorPH->getTerminator()); 1238 if (isa<TruncInst>(EntryVal)) { 1239 assert(Start->getType()->isIntegerTy() && 1240 "Truncation requires an integer type"); 1241 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1242 Step = Builder.CreateTrunc(Step, TruncType); 1243 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1244 } 1245 1246 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 1247 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1248 Value *SteppedStart = getStepVector( 1249 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 1250 1251 // We create vector phi nodes for both integer and floating-point induction 1252 // variables. Here, we determine the kind of arithmetic we will perform. 1253 Instruction::BinaryOps AddOp; 1254 Instruction::BinaryOps MulOp; 1255 if (Step->getType()->isIntegerTy()) { 1256 AddOp = Instruction::Add; 1257 MulOp = Instruction::Mul; 1258 } else { 1259 AddOp = ID.getInductionOpcode(); 1260 MulOp = Instruction::FMul; 1261 } 1262 1263 // Multiply the vectorization factor by the step using integer or 1264 // floating-point arithmetic as appropriate. 1265 Type *StepType = Step->getType(); 1266 Value *RuntimeVF; 1267 if (Step->getType()->isFloatingPointTy()) 1268 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 1269 else 1270 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 1271 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1272 1273 // Create a vector splat to use in the induction update. 1274 // 1275 // FIXME: If the step is non-constant, we create the vector splat with 1276 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1277 // handle a constant vector splat. 1278 Value *SplatVF = isa<Constant>(Mul) 1279 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 1280 : Builder.CreateVectorSplat(State.VF, Mul); 1281 Builder.restoreIP(CurrIP); 1282 1283 // We may need to add the step a number of times, depending on the unroll 1284 // factor. The last of those goes into the PHI. 1285 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1286 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1287 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1288 Instruction *LastInduction = VecInd; 1289 for (unsigned Part = 0; Part < State.UF; ++Part) { 1290 State.set(this, LastInduction, Part); 1291 1292 if (isa<TruncInst>(EntryVal)) 1293 State.addMetadata(LastInduction, EntryVal); 1294 1295 LastInduction = cast<Instruction>( 1296 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 1297 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1298 } 1299 1300 LastInduction->setName("vec.ind.next"); 1301 VecInd->addIncoming(SteppedStart, VectorPH); 1302 // Add induction update using an incorrect block temporarily. The phi node 1303 // will be fixed after VPlan execution. Note that at this point the latch 1304 // block cannot be used, as it does not exist yet. 1305 // TODO: Model increment value in VPlan, by turning the recipe into a 1306 // multi-def and a subclass of VPHeaderPHIRecipe. 
1307 VecInd->addIncoming(LastInduction, VectorPH); 1308 } 1309 1310 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1311 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1312 VPSlotTracker &SlotTracker) const { 1313 O << Indent << "WIDEN-INDUCTION"; 1314 if (getTruncInst()) { 1315 O << "\\l\""; 1316 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 1317 O << " +\n" << Indent << "\" "; 1318 getVPValue(0)->printAsOperand(O, SlotTracker); 1319 } else 1320 O << " " << VPlanIngredient(IV); 1321 1322 O << ", "; 1323 getStepValue()->printAsOperand(O, SlotTracker); 1324 } 1325 #endif 1326 1327 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1328 // The step may be defined by a recipe in the preheader (e.g. if it requires 1329 // SCEV expansion), but for the canonical induction the step is required to be 1330 // 1, which is represented as live-in. 1331 if (getStepValue()->getDefiningRecipe()) 1332 return false; 1333 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1334 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1335 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1336 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1337 getScalarType() == CanIV->getScalarType(); 1338 } 1339 1340 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1341 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1342 VPSlotTracker &SlotTracker) const { 1343 O << Indent; 1344 printAsOperand(O, SlotTracker); 1345 O << Indent << "= DERIVED-IV "; 1346 getStartValue()->printAsOperand(O, SlotTracker); 1347 O << " + "; 1348 getOperand(1)->printAsOperand(O, SlotTracker); 1349 O << " * "; 1350 getStepValue()->printAsOperand(O, SlotTracker); 1351 } 1352 #endif 1353 1354 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1355 // Fast-math-flags propagate from the original induction instruction. 1356 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1357 if (hasFastMathFlags()) 1358 State.Builder.setFastMathFlags(getFastMathFlags()); 1359 1360 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1361 /// variable on which to base the steps, \p Step is the size of the step. 1362 1363 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1364 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1365 IRBuilderBase &Builder = State.Builder; 1366 1367 // Ensure step has the same type as that of scalar IV. 1368 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1369 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1370 1371 // We build scalar steps for both integer and floating-point induction 1372 // variables. Here, we determine the kind of arithmetic we will perform. 1373 Instruction::BinaryOps AddOp; 1374 Instruction::BinaryOps MulOp; 1375 if (BaseIVTy->isIntegerTy()) { 1376 AddOp = Instruction::Add; 1377 MulOp = Instruction::Mul; 1378 } else { 1379 AddOp = InductionOpcode; 1380 MulOp = Instruction::FMul; 1381 } 1382 1383 // Determine the number of scalars we need to generate for each unroll 1384 // iteration. 1385 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1386 // Compute the scalar steps and save the results in State. 
1387 Type *IntStepTy = 1388 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1389 Type *VecIVTy = nullptr; 1390 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1391 if (!FirstLaneOnly && State.VF.isScalable()) { 1392 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1393 UnitStepVec = 1394 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1395 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1396 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1397 } 1398 1399 unsigned StartPart = 0; 1400 unsigned EndPart = State.UF; 1401 unsigned StartLane = 0; 1402 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1403 if (State.Instance) { 1404 StartPart = State.Instance->Part; 1405 EndPart = StartPart + 1; 1406 StartLane = State.Instance->Lane.getKnownLane(); 1407 EndLane = StartLane + 1; 1408 } 1409 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1410 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1411 1412 if (!FirstLaneOnly && State.VF.isScalable()) { 1413 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1414 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1415 if (BaseIVTy->isFloatingPointTy()) 1416 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1417 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1418 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1419 State.set(this, Add, Part); 1420 // It's useful to record the lane values too for the known minimum number 1421 // of elements so we do those below. This improves the code quality when 1422 // trying to extract the first element, for example. 1423 } 1424 1425 if (BaseIVTy->isFloatingPointTy()) 1426 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1427 1428 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1429 Value *StartIdx = Builder.CreateBinOp( 1430 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1431 // The step returned by `createStepForVF` is a runtime-evaluated value 1432 // when VF is scalable. Otherwise, it should be folded into a Constant. 1433 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1434 "Expected StartIdx to be folded to a constant when VF is not " 1435 "scalable"); 1436 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1437 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1438 State.set(this, Add, VPIteration(Part, Lane)); 1439 } 1440 } 1441 } 1442 1443 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1444 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1445 VPSlotTracker &SlotTracker) const { 1446 O << Indent; 1447 printAsOperand(O, SlotTracker); 1448 O << " = SCALAR-STEPS "; 1449 printOperands(O, SlotTracker); 1450 } 1451 #endif 1452 1453 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1454 assert(State.VF.isVector() && "not widening"); 1455 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1456 // Construct a vector GEP by widening the operands of the scalar GEP as 1457 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1458 // results in a vector of pointers when at least one operand of the GEP 1459 // is vector-typed. Thus, to keep the representation compact, we only use 1460 // vector-typed operands for loop-varying values. 
1461 1462 if (areAllOperandsInvariant()) { 1463 // If we are vectorizing, but the GEP has only loop-invariant operands, 1464 // the GEP we build (by only using vector-typed operands for 1465 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1466 // produce a vector of pointers, we need to either arbitrarily pick an 1467 // operand to broadcast, or broadcast a clone of the original GEP. 1468 // Here, we broadcast a clone of the original. 1469 // 1470 // TODO: If at some point we decide to scalarize instructions having 1471 // loop-invariant operands, this special case will no longer be 1472 // required. We would add the scalarization decision to 1473 // collectLoopScalars() and teach getVectorValue() to broadcast 1474 // the lane-zero scalar value. 1475 SmallVector<Value *> Ops; 1476 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1477 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1478 1479 auto *NewGEP = 1480 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1481 ArrayRef(Ops).drop_front(), "", isInBounds()); 1482 for (unsigned Part = 0; Part < State.UF; ++Part) { 1483 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1484 State.set(this, EntryPart, Part); 1485 State.addMetadata(EntryPart, GEP); 1486 } 1487 } else { 1488 // If the GEP has at least one loop-varying operand, we are sure to 1489 // produce a vector of pointers. But if we are only unrolling, we want 1490 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1491 // produce with the code below will be scalar (if VF == 1) or vector 1492 // (otherwise). Note that for the unroll-only case, we still maintain 1493 // values in the vector mapping with initVector, as we do for other 1494 // instructions. 1495 for (unsigned Part = 0; Part < State.UF; ++Part) { 1496 // The pointer operand of the new GEP. If it's loop-invariant, we 1497 // won't broadcast it. 1498 auto *Ptr = isPointerLoopInvariant() 1499 ? State.get(getOperand(0), VPIteration(0, 0)) 1500 : State.get(getOperand(0), Part); 1501 1502 // Collect all the indices for the new GEP. If any index is 1503 // loop-invariant, we won't broadcast it. 1504 SmallVector<Value *, 4> Indices; 1505 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1506 VPValue *Operand = getOperand(I); 1507 if (isIndexLoopInvariant(I - 1)) 1508 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1509 else 1510 Indices.push_back(State.get(Operand, Part)); 1511 } 1512 1513 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1514 // but it should be a vector, otherwise. 1515 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1516 Indices, "", isInBounds()); 1517 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1518 "NewGEP is not a pointer vector"); 1519 State.set(this, NewGEP, Part); 1520 State.addMetadata(NewGEP, GEP); 1521 } 1522 } 1523 } 1524 1525 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1526 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1527 VPSlotTracker &SlotTracker) const { 1528 O << Indent << "WIDEN-GEP "; 1529 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1530 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1531 O << "[" << (isIndexLoopInvariant(I) ? 
"Inv" : "Var") << "]"; 1532 1533 O << " "; 1534 printAsOperand(O, SlotTracker); 1535 O << " = getelementptr"; 1536 printFlags(O); 1537 printOperands(O, SlotTracker); 1538 } 1539 #endif 1540 1541 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1542 auto &Builder = State.Builder; 1543 State.setDebugLocFrom(getDebugLoc()); 1544 for (unsigned Part = 0; Part < State.UF; ++Part) { 1545 // Calculate the pointer for the specific unroll-part. 1546 Value *PartPtr = nullptr; 1547 // Use i32 for the gep index type when the value is constant, 1548 // or query DataLayout for a more suitable index type otherwise. 1549 const DataLayout &DL = 1550 Builder.GetInsertBlock()->getDataLayout(); 1551 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1552 ? DL.getIndexType(IndexedTy->getPointerTo()) 1553 : Builder.getInt32Ty(); 1554 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1555 bool InBounds = isInBounds(); 1556 if (IsReverse) { 1557 // If the address is consecutive but reversed, then the 1558 // wide store needs to start at the last vector element. 1559 // RunTimeVF = VScale * VF.getKnownMinValue() 1560 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1561 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1562 // NumElt = -Part * RunTimeVF 1563 Value *NumElt = Builder.CreateMul( 1564 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1565 // LastLane = 1 - RunTimeVF 1566 Value *LastLane = 1567 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1568 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1569 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1570 } else { 1571 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1572 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1573 } 1574 1575 State.set(this, PartPtr, Part, /*IsScalar*/ true); 1576 } 1577 } 1578 1579 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1580 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1581 VPSlotTracker &SlotTracker) const { 1582 O << Indent; 1583 printAsOperand(O, SlotTracker); 1584 O << " = vector-pointer "; 1585 if (IsReverse) 1586 O << "(reverse) "; 1587 1588 printOperands(O, SlotTracker); 1589 } 1590 #endif 1591 1592 void VPBlendRecipe::execute(VPTransformState &State) { 1593 State.setDebugLocFrom(getDebugLoc()); 1594 // We know that all PHIs in non-header blocks are converted into 1595 // selects, so we don't have to worry about the insertion order and we 1596 // can just use the builder. 1597 // At this point we generate the predication tree. There may be 1598 // duplications since this is a simple recursive scan, but future 1599 // optimizations will clean it up. 1600 1601 unsigned NumIncoming = getNumIncomingValues(); 1602 1603 // Generate a sequence of selects of the form: 1604 // SELECT(Mask3, In3, 1605 // SELECT(Mask2, In2, 1606 // SELECT(Mask1, In1, 1607 // In0))) 1608 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1609 // are essentially undef are taken from In0. 1610 VectorParts Entry(State.UF); 1611 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1612 for (unsigned In = 0; In < NumIncoming; ++In) { 1613 for (unsigned Part = 0; Part < State.UF; ++Part) { 1614 // We might have single edge PHIs (blocks) - use an identity 1615 // 'select' for the first PHI operand. 
      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
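      // Lanes whose condition is false must not change the reduction, so the
      // select below substitutes the neutral value for them, e.g. 0 for an
      // integer add reduction or 1 for a mul reduction (illustrative values).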
      Type *ElementTy =
          VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a
  // vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}
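
// Packing happens lane by lane: the scalar value produced for each lane is
// inserted back into a vector (see the insertelement handling in
// VPPredInstPHIRecipe::execute below) so that a widened user can consume the
// combined result. Summary comment only; the exact sequence depends on VF and
// predication.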
"CLONE " : "REPLICATE "); 1744 1745 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1746 printAsOperand(O, SlotTracker); 1747 O << " = "; 1748 } 1749 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1750 O << "call"; 1751 printFlags(O); 1752 O << "@" << CB->getCalledFunction()->getName() << "("; 1753 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1754 O, [&O, &SlotTracker](VPValue *Op) { 1755 Op->printAsOperand(O, SlotTracker); 1756 }); 1757 O << ")"; 1758 } else { 1759 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1760 printFlags(O); 1761 printOperands(O, SlotTracker); 1762 } 1763 1764 if (shouldPack()) 1765 O << " (S->V)"; 1766 } 1767 #endif 1768 1769 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 1770 /// if it is either defined outside the vector region or its operand is known to 1771 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 1772 /// TODO: Uniformity should be associated with a VPValue and there should be a 1773 /// generic way to check. 1774 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { 1775 return C->isDefinedOutsideVectorRegions() || 1776 isa<VPDerivedIVRecipe>(C->getOperand(0)) || 1777 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)); 1778 } 1779 1780 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { 1781 assert(vputils::onlyFirstLaneUsed(this) && 1782 "Codegen only implemented for first lane."); 1783 switch (Opcode) { 1784 case Instruction::SExt: 1785 case Instruction::ZExt: 1786 case Instruction::Trunc: { 1787 // Note: SExt/ZExt not used yet. 1788 Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); 1789 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 1790 } 1791 default: 1792 llvm_unreachable("opcode not implemented yet"); 1793 } 1794 } 1795 1796 void VPScalarCastRecipe ::execute(VPTransformState &State) { 1797 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); 1798 for (unsigned Part = 0; Part != State.UF; ++Part) { 1799 Value *Res; 1800 // Only generate a single instance, if the recipe is uniform across UFs and 1801 // VFs. 1802 if (Part > 0 && IsUniformAcrossVFsAndUFs) 1803 Res = State.get(this, VPIteration(0, 0)); 1804 else 1805 Res = generate(State, Part); 1806 State.set(this, Res, VPIteration(Part, 0)); 1807 } 1808 } 1809 1810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1811 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 1812 VPSlotTracker &SlotTracker) const { 1813 O << Indent << "SCALAR-CAST "; 1814 printAsOperand(O, SlotTracker); 1815 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1816 printOperands(O, SlotTracker); 1817 O << " to " << *ResultTy; 1818 } 1819 #endif 1820 1821 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1822 assert(State.Instance && "Branch on Mask works only on single instance."); 1823 1824 unsigned Part = State.Instance->Part; 1825 unsigned Lane = State.Instance->Lane.getKnownLane(); 1826 1827 Value *ConditionBit = nullptr; 1828 VPValue *BlockInMask = getMask(); 1829 if (BlockInMask) { 1830 ConditionBit = State.get(BlockInMask, Part); 1831 if (ConditionBit->getType()->isVectorTy()) 1832 ConditionBit = State.Builder.CreateExtractElement( 1833 ConditionBit, State.Builder.getInt32(Lane)); 1834 } else // Block in mask is all-one. 

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
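  // Only a live-in step that is the constant 1 can match the canonical
  // induction, so anything produced by a recipe can be rejected immediately.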
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif
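
// Illustration of the widened canonical IV above: with a fixed VF of 4 and
// UF of 2, part 0 is <iv, iv+1, iv+2, iv+3> and part 1 is
// <iv+4, iv+5, iv+6, iv+7>, where iv is the scalar canonical induction value
// for the current vector iteration (example values only).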

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll
  // use this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
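    // Consequently every lane of the (non-scalar) phi is seeded with the
    // start value below, so the final cross-lane reduction already
    // incorporates it; e.g. for an smin reduction each lane begins at the
    // start value rather than at a neutral constant (illustrative example).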
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
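// The emitted pattern is roughly (illustrative, for a single part; actual
// names, types and the producer of the entry mask depend on the plan):
//   vector.ph:
//     %entry.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask(...)
//   vector.body:
//     %active.lane.mask = phi <vscale x 4 x i1> [ %entry.mask, %vector.ph ],
//                                               [ %next.mask, %vector.body ]
// where %next.mask is added as the phi's second incoming value later.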
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif