//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I ||
            !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void
VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 229 assert(!Parent && "Recipe already in some VPBasicBlock"); 230 assert(InsertPos->getParent() && 231 "Insertion position not in any VPBasicBlock"); 232 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 233 } 234 235 void VPRecipeBase::removeFromParent() { 236 assert(getParent() && "Recipe not in any VPBasicBlock"); 237 getParent()->getRecipeList().remove(getIterator()); 238 Parent = nullptr; 239 } 240 241 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 242 assert(getParent() && "Recipe not in any VPBasicBlock"); 243 return getParent()->getRecipeList().erase(getIterator()); 244 } 245 246 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 247 removeFromParent(); 248 insertAfter(InsertPos); 249 } 250 251 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 252 iplist<VPRecipeBase>::iterator I) { 253 removeFromParent(); 254 insertBefore(BB, I); 255 } 256 257 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 258 assert(OpType == OperationType::FPMathOp && 259 "recipe doesn't have fast math flags"); 260 FastMathFlags Res; 261 Res.setAllowReassoc(FMFs.AllowReassoc); 262 Res.setNoNaNs(FMFs.NoNaNs); 263 Res.setNoInfs(FMFs.NoInfs); 264 Res.setNoSignedZeros(FMFs.NoSignedZeros); 265 Res.setAllowReciprocal(FMFs.AllowReciprocal); 266 Res.setAllowContract(FMFs.AllowContract); 267 Res.setApproxFunc(FMFs.ApproxFunc); 268 return Res; 269 } 270 271 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 272 VPValue *A, VPValue *B, DebugLoc DL, 273 const Twine &Name) 274 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 275 Pred, DL), 276 Opcode(Opcode), Name(Name.str()) { 277 assert(Opcode == Instruction::ICmp && 278 "only ICmp predicates supported at the moment"); 279 } 280 281 VPInstruction::VPInstruction(unsigned Opcode, 282 std::initializer_list<VPValue *> Operands, 283 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 284 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 285 Opcode(Opcode), Name(Name.str()) { 286 // Make sure the VPInstruction is a floating-point operation. 
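  // E.g. a VPInstruction created with Instruction::FAdd may carry fast-math
  // flags here; an integer opcode such as Instruction::Add is not an FP math
  // op and would trip the assertion below.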
287 assert(isFPMathOp() && "this op can't take fast-math flags"); 288 } 289 290 bool VPInstruction::doesGeneratePerAllLanes() const { 291 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 292 } 293 294 bool VPInstruction::canGenerateScalarForFirstLane() const { 295 if (Instruction::isBinaryOp(getOpcode())) 296 return true; 297 if (isVectorToScalar()) 298 return true; 299 switch (Opcode) { 300 case VPInstruction::BranchOnCond: 301 case VPInstruction::BranchOnCount: 302 case VPInstruction::CalculateTripCountMinusVF: 303 case VPInstruction::CanonicalIVIncrementForPart: 304 case VPInstruction::PtrAdd: 305 case VPInstruction::ExplicitVectorLength: 306 return true; 307 default: 308 return false; 309 } 310 } 311 312 Value *VPInstruction::generatePerLane(VPTransformState &State, 313 const VPIteration &Lane) { 314 IRBuilderBase &Builder = State.Builder; 315 316 assert(getOpcode() == VPInstruction::PtrAdd && 317 "only PtrAdd opcodes are supported for now"); 318 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 319 State.get(getOperand(1), Lane), Name); 320 } 321 322 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { 323 IRBuilderBase &Builder = State.Builder; 324 325 if (Instruction::isBinaryOp(getOpcode())) { 326 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 327 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); 328 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); 329 auto *Res = 330 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 331 if (auto *I = dyn_cast<Instruction>(Res)) 332 setFlags(I); 333 return Res; 334 } 335 336 switch (getOpcode()) { 337 case VPInstruction::Not: { 338 Value *A = State.get(getOperand(0), Part); 339 return Builder.CreateNot(A, Name); 340 } 341 case Instruction::ICmp: { 342 Value *A = State.get(getOperand(0), Part); 343 Value *B = State.get(getOperand(1), Part); 344 return Builder.CreateCmp(getPredicate(), A, B, Name); 345 } 346 case Instruction::Select: { 347 Value *Cond = State.get(getOperand(0), Part); 348 Value *Op1 = State.get(getOperand(1), Part); 349 Value *Op2 = State.get(getOperand(2), Part); 350 return Builder.CreateSelect(Cond, Op1, Op2, Name); 351 } 352 case VPInstruction::ActiveLaneMask: { 353 // Get first lane of vector induction variable. 354 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); 355 // Get the original loop tripcount. 356 Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); 357 358 // If this part of the active lane mask is scalar, generate the CMP directly 359 // to avoid unnecessary extracts. 360 if (State.VF.isScalar()) 361 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 362 Name); 363 364 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 365 auto *PredTy = VectorType::get(Int1Ty, State.VF); 366 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 367 {PredTy, ScalarTC->getType()}, 368 {VIVElem0, ScalarTC}, nullptr, Name); 369 } 370 case VPInstruction::FirstOrderRecurrenceSplice: { 371 // Generate code to combine the previous and current values in vector v3. 372 // 373 // vector.ph: 374 // v_init = vector(..., ..., ..., a[-1]) 375 // br vector.body 376 // 377 // vector.body 378 // i = phi [0, vector.ph], [i+4, vector.body] 379 // v1 = phi [v_init, vector.ph], [v2, vector.body] 380 // v2 = a[i, i+1, i+2, i+3]; 381 // v3 = vector(v1(3), v2(0, 1, 2)) 382 383 // For the first part, use the recurrence phi (v1), otherwise v2. 
384 auto *V1 = State.get(getOperand(0), 0); 385 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); 386 if (!PartMinus1->getType()->isVectorTy()) 387 return PartMinus1; 388 Value *V2 = State.get(getOperand(1), Part); 389 return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); 390 } 391 case VPInstruction::CalculateTripCountMinusVF: { 392 if (Part != 0) 393 return State.get(this, 0, /*IsScalar*/ true); 394 395 Value *ScalarTC = State.get(getOperand(0), {0, 0}); 396 Value *Step = 397 createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); 398 Value *Sub = Builder.CreateSub(ScalarTC, Step); 399 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 400 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 401 return Builder.CreateSelect(Cmp, Sub, Zero); 402 } 403 case VPInstruction::ExplicitVectorLength: { 404 // Compute EVL 405 auto GetEVL = [=](VPTransformState &State, Value *AVL) { 406 assert(AVL->getType()->isIntegerTy() && 407 "Requested vector length should be an integer."); 408 409 // TODO: Add support for MaxSafeDist for correct loop emission. 410 assert(State.VF.isScalable() && "Expected scalable vector factor."); 411 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 412 413 Value *EVL = State.Builder.CreateIntrinsic( 414 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 415 {AVL, VFArg, State.Builder.getTrue()}); 416 return EVL; 417 }; 418 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 419 // be outside of the main loop. 420 assert(Part == 0 && "No unrolling expected for predicated vectorization."); 421 // Compute VTC - IV as the AVL (requested vector length). 422 Value *Index = State.get(getOperand(0), VPIteration(0, 0)); 423 Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); 424 Value *AVL = State.Builder.CreateSub(TripCount, Index); 425 Value *EVL = GetEVL(State, AVL); 426 return EVL; 427 } 428 case VPInstruction::CanonicalIVIncrementForPart: { 429 auto *IV = State.get(getOperand(0), VPIteration(0, 0)); 430 if (Part == 0) 431 return IV; 432 433 // The canonical IV is incremented by the vectorization factor (num of SIMD 434 // elements) times the unroll part. 435 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 436 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 437 hasNoSignedWrap()); 438 } 439 case VPInstruction::BranchOnCond: { 440 if (Part != 0) 441 return nullptr; 442 443 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); 444 VPRegionBlock *ParentRegion = getParent()->getParent(); 445 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 446 447 // Replace the temporary unreachable terminator with a new conditional 448 // branch, hooking it up to backward destination for exiting blocks now and 449 // to forward destination(s) later when they are created. 450 BranchInst *CondBr = 451 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 452 453 if (getParent()->isExiting()) 454 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 455 456 CondBr->setSuccessor(0, nullptr); 457 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 458 return CondBr; 459 } 460 case VPInstruction::BranchOnCount: { 461 if (Part != 0) 462 return nullptr; 463 // First create the compare. 
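    // E.g. for an i64 canonical IV this produces something like
    //   %cmp = icmp eq i64 %index.next, %trip.count
    //   br i1 %cmp, <exit/middle block>, <vector loop header>
    // where the forward (exit) successor is only filled in later, once that
    // block has been created.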
464 Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); 465 Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); 466 Value *Cond = Builder.CreateICmpEQ(IV, TC); 467 468 // Now create the branch. 469 auto *Plan = getParent()->getPlan(); 470 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 471 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 472 473 // Replace the temporary unreachable terminator with a new conditional 474 // branch, hooking it up to backward destination (the header) now and to the 475 // forward destination (the exit/middle block) later when it is created. 476 // Note that CreateCondBr expects a valid BB as first argument, so we need 477 // to set it to nullptr later. 478 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 479 State.CFG.VPBB2IRBB[Header]); 480 CondBr->setSuccessor(0, nullptr); 481 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 482 return CondBr; 483 } 484 case VPInstruction::ComputeReductionResult: { 485 if (Part != 0) 486 return State.get(this, 0, /*IsScalar*/ true); 487 488 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 489 // and will be removed by breaking up the recipe further. 490 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 491 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 492 // Get its reduction variable descriptor. 493 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 494 495 RecurKind RK = RdxDesc.getRecurrenceKind(); 496 497 VPValue *LoopExitingDef = getOperand(1); 498 Type *PhiTy = OrigPhi->getType(); 499 VectorParts RdxParts(State.UF); 500 for (unsigned Part = 0; Part < State.UF; ++Part) 501 RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); 502 503 // If the vector reduction can be performed in a smaller type, we truncate 504 // then extend the loop exit value to enable InstCombine to evaluate the 505 // entire expression in the smaller type. 506 // TODO: Handle this in truncateToMinBW. 507 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 508 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 509 for (unsigned Part = 0; Part < State.UF; ++Part) 510 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 511 } 512 // Reduce all of the unrolled parts into a single vector. 513 Value *ReducedPartRdx = RdxParts[0]; 514 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 515 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 516 Op = Instruction::Or; 517 518 if (PhiR->isOrdered()) { 519 ReducedPartRdx = RdxParts[State.UF - 1]; 520 } else { 521 // Floating-point operations should have some FMF to enable the reduction. 522 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 523 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 524 for (unsigned Part = 1; Part < State.UF; ++Part) { 525 Value *RdxPart = RdxParts[Part]; 526 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 527 ReducedPartRdx = Builder.CreateBinOp( 528 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 529 else 530 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 531 } 532 } 533 534 // Create the reduction after the loop. Note that inloop reductions create 535 // the target reduction in the loop using a Reduction recipe. 
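    // E.g. a floating-point add reduction whose combined part value has type
    // <4 x float> is lowered here to a call to the llvm.vector.reduce.fadd
    // intrinsic; the resulting scalar then feeds the users outside the loop.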
536 if ((State.VF.isVector() || 537 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && 538 !PhiR->isInLoop()) { 539 ReducedPartRdx = 540 createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); 541 // If the reduction can be performed in a smaller type, we need to extend 542 // the reduction to the wider type before we branch to the original loop. 543 if (PhiTy != RdxDesc.getRecurrenceType()) 544 ReducedPartRdx = RdxDesc.isSigned() 545 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 546 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 547 } 548 549 // If there were stores of the reduction value to a uniform memory address 550 // inside the loop, create the final store here. 551 if (StoreInst *SI = RdxDesc.IntermediateStore) { 552 auto *NewSI = Builder.CreateAlignedStore( 553 ReducedPartRdx, SI->getPointerOperand(), SI->getAlign()); 554 propagateMetadata(NewSI, SI); 555 } 556 557 return ReducedPartRdx; 558 } 559 case VPInstruction::ExtractFromEnd: { 560 if (Part != 0) 561 return State.get(this, 0, /*IsScalar*/ true); 562 563 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); 564 unsigned Offset = CI->getZExtValue(); 565 assert(Offset > 0 && "Offset from end must be positive"); 566 Value *Res; 567 if (State.VF.isVector()) { 568 assert(Offset <= State.VF.getKnownMinValue() && 569 "invalid offset to extract from"); 570 // Extract lane VF - Offset from the operand. 571 Res = State.get( 572 getOperand(0), 573 VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset))); 574 } else { 575 assert(Offset <= State.UF && "invalid offset to extract from"); 576 // When loop is unrolled without vectorizing, retrieve UF - Offset. 577 Res = State.get(getOperand(0), State.UF - Offset); 578 } 579 Res->setName(Name); 580 return Res; 581 } 582 case VPInstruction::LogicalAnd: { 583 Value *A = State.get(getOperand(0), Part); 584 Value *B = State.get(getOperand(1), Part); 585 return Builder.CreateLogicalAnd(A, B, Name); 586 } 587 case VPInstruction::PtrAdd: { 588 assert(vputils::onlyFirstLaneUsed(this) && 589 "can only generate first lane for PtrAdd"); 590 Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true); 591 Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true); 592 return Builder.CreatePtrAdd(Ptr, Addend, Name); 593 } 594 default: 595 llvm_unreachable("Unsupported opcode for instruction"); 596 } 597 } 598 599 bool VPInstruction::isVectorToScalar() const { 600 return getOpcode() == VPInstruction::ExtractFromEnd || 601 getOpcode() == VPInstruction::ComputeReductionResult; 602 } 603 604 #if !defined(NDEBUG) 605 bool VPInstruction::isFPMathOp() const { 606 // Inspired by FPMathOperator::classof. Notable differences are that we don't 607 // support Call, PHI and Select opcodes here yet. 
608 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 609 Opcode == Instruction::FNeg || Opcode == Instruction::FSub || 610 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 611 Opcode == Instruction::FCmp || Opcode == Instruction::Select; 612 } 613 #endif 614 615 void VPInstruction::execute(VPTransformState &State) { 616 assert(!State.Instance && "VPInstruction executing an Instance"); 617 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 618 assert((hasFastMathFlags() == isFPMathOp() || 619 getOpcode() == Instruction::Select) && 620 "Recipe not a FPMathOp but has fast-math flags?"); 621 if (hasFastMathFlags()) 622 State.Builder.setFastMathFlags(getFastMathFlags()); 623 State.setDebugLocFrom(getDebugLoc()); 624 bool GeneratesPerFirstLaneOnly = 625 canGenerateScalarForFirstLane() && 626 (vputils::onlyFirstLaneUsed(this) || isVectorToScalar()); 627 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 628 bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this); 629 for (unsigned Part = 0; Part < State.UF; ++Part) { 630 if (GeneratesPerAllLanes) { 631 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); 632 Lane != NumLanes; ++Lane) { 633 Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane)); 634 assert(GeneratedValue && "generatePerLane must produce a value"); 635 State.set(this, GeneratedValue, VPIteration(Part, Lane)); 636 } 637 continue; 638 } 639 640 if (Part != 0 && OnlyFirstPartUsed && hasResult()) { 641 Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly); 642 State.set(this, Part0, Part, 643 /*IsScalar*/ GeneratesPerFirstLaneOnly); 644 continue; 645 } 646 647 Value *GeneratedValue = generatePerPart(State, Part); 648 if (!hasResult()) 649 continue; 650 assert(GeneratedValue && "generatePerPart must produce a value"); 651 assert((GeneratedValue->getType()->isVectorTy() == 652 !GeneratesPerFirstLaneOnly || 653 State.VF.isScalar()) && 654 "scalar value but not only first lane defined"); 655 State.set(this, GeneratedValue, Part, 656 /*IsScalar*/ GeneratesPerFirstLaneOnly); 657 } 658 } 659 660 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 661 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 662 if (Instruction::isBinaryOp(getOpcode())) 663 return vputils::onlyFirstLaneUsed(this); 664 665 switch (getOpcode()) { 666 default: 667 return false; 668 case Instruction::ICmp: 669 case VPInstruction::PtrAdd: 670 // TODO: Cover additional opcodes. 
671 return vputils::onlyFirstLaneUsed(this); 672 case VPInstruction::ActiveLaneMask: 673 case VPInstruction::ExplicitVectorLength: 674 case VPInstruction::CalculateTripCountMinusVF: 675 case VPInstruction::CanonicalIVIncrementForPart: 676 case VPInstruction::BranchOnCount: 677 return true; 678 }; 679 llvm_unreachable("switch should return"); 680 } 681 682 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 683 void VPInstruction::dump() const { 684 VPSlotTracker SlotTracker(getParent()->getPlan()); 685 print(dbgs(), "", SlotTracker); 686 } 687 688 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 689 VPSlotTracker &SlotTracker) const { 690 O << Indent << "EMIT "; 691 692 if (hasResult()) { 693 printAsOperand(O, SlotTracker); 694 O << " = "; 695 } 696 697 switch (getOpcode()) { 698 case VPInstruction::Not: 699 O << "not"; 700 break; 701 case VPInstruction::SLPLoad: 702 O << "combined load"; 703 break; 704 case VPInstruction::SLPStore: 705 O << "combined store"; 706 break; 707 case VPInstruction::ActiveLaneMask: 708 O << "active lane mask"; 709 break; 710 case VPInstruction::ExplicitVectorLength: 711 O << "EXPLICIT-VECTOR-LENGTH"; 712 break; 713 case VPInstruction::FirstOrderRecurrenceSplice: 714 O << "first-order splice"; 715 break; 716 case VPInstruction::BranchOnCond: 717 O << "branch-on-cond"; 718 break; 719 case VPInstruction::CalculateTripCountMinusVF: 720 O << "TC > VF ? TC - VF : 0"; 721 break; 722 case VPInstruction::CanonicalIVIncrementForPart: 723 O << "VF * Part +"; 724 break; 725 case VPInstruction::BranchOnCount: 726 O << "branch-on-count"; 727 break; 728 case VPInstruction::ExtractFromEnd: 729 O << "extract-from-end"; 730 break; 731 case VPInstruction::ComputeReductionResult: 732 O << "compute-reduction-result"; 733 break; 734 case VPInstruction::LogicalAnd: 735 O << "logical-and"; 736 break; 737 case VPInstruction::PtrAdd: 738 O << "ptradd"; 739 break; 740 default: 741 O << Instruction::getOpcodeName(getOpcode()); 742 } 743 744 printFlags(O); 745 printOperands(O, SlotTracker); 746 747 if (auto DL = getDebugLoc()) { 748 O << ", !dbg "; 749 DL.print(O); 750 } 751 } 752 #endif 753 754 void VPWidenCallRecipe::execute(VPTransformState &State) { 755 assert(State.VF.isVector() && "not widening"); 756 Function *CalledScalarFn = getCalledScalarFunction(); 757 assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) && 758 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 759 State.setDebugLocFrom(getDebugLoc()); 760 761 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; 762 FunctionType *VFTy = nullptr; 763 if (Variant) 764 VFTy = Variant->getFunctionType(); 765 for (unsigned Part = 0; Part < State.UF; ++Part) { 766 SmallVector<Type *, 2> TysForDecl; 767 // Add return type if intrinsic is overloaded on it. 768 if (UseIntrinsic && 769 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) 770 TysForDecl.push_back(VectorType::get( 771 CalledScalarFn->getReturnType()->getScalarType(), State.VF)); 772 SmallVector<Value *, 4> Args; 773 for (const auto &I : enumerate(arg_operands())) { 774 // Some intrinsics have a scalar argument - don't replace it with a 775 // vector. 776 Value *Arg; 777 if (UseIntrinsic && 778 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) 779 Arg = State.get(I.value(), VPIteration(0, 0)); 780 // Some vectorized function variants may also take a scalar argument, 781 // e.g. linear parameters for pointers. 
This needs to be the scalar value 782 // from the start of the respective part when interleaving. 783 else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) 784 Arg = State.get(I.value(), VPIteration(Part, 0)); 785 else 786 Arg = State.get(I.value(), Part); 787 if (UseIntrinsic && 788 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) 789 TysForDecl.push_back(Arg->getType()); 790 Args.push_back(Arg); 791 } 792 793 Function *VectorF; 794 if (UseIntrinsic) { 795 // Use vector version of the intrinsic. 796 Module *M = State.Builder.GetInsertBlock()->getModule(); 797 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); 798 assert(VectorF && "Can't retrieve vector intrinsic."); 799 } else { 800 #ifndef NDEBUG 801 assert(Variant != nullptr && "Can't create vector function."); 802 #endif 803 VectorF = Variant; 804 } 805 806 auto *CI = cast_or_null<CallInst>(getUnderlyingInstr()); 807 SmallVector<OperandBundleDef, 1> OpBundles; 808 if (CI) 809 CI->getOperandBundlesAsDefs(OpBundles); 810 811 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 812 813 if (isa<FPMathOperator>(V)) 814 V->copyFastMathFlags(CI); 815 816 if (!V->getType()->isVoidTy()) 817 State.set(this, V, Part); 818 State.addMetadata(V, CI); 819 } 820 } 821 822 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 823 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 824 VPSlotTracker &SlotTracker) const { 825 O << Indent << "WIDEN-CALL "; 826 827 Function *CalledFn = getCalledScalarFunction(); 828 if (CalledFn->getReturnType()->isVoidTy()) 829 O << "void "; 830 else { 831 printAsOperand(O, SlotTracker); 832 O << " = "; 833 } 834 835 O << "call @" << CalledFn->getName() << "("; 836 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 837 Op->printAsOperand(O, SlotTracker); 838 }); 839 O << ")"; 840 841 if (VectorIntrinsicID) 842 O << " (using vector intrinsic)"; 843 else { 844 O << " (using library function"; 845 if (Variant->hasName()) 846 O << ": " << Variant->getName(); 847 O << ")"; 848 } 849 } 850 851 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 852 VPSlotTracker &SlotTracker) const { 853 O << Indent << "WIDEN-SELECT "; 854 printAsOperand(O, SlotTracker); 855 O << " = select "; 856 getOperand(0)->printAsOperand(O, SlotTracker); 857 O << ", "; 858 getOperand(1)->printAsOperand(O, SlotTracker); 859 O << ", "; 860 getOperand(2)->printAsOperand(O, SlotTracker); 861 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 862 } 863 #endif 864 865 void VPWidenSelectRecipe::execute(VPTransformState &State) { 866 State.setDebugLocFrom(getDebugLoc()); 867 868 // The condition can be loop invariant but still defined inside the 869 // loop. This means that we can't just use the original 'cond' value. 870 // We have to take the 'vectorized' value and pick the first lane. 871 // Instcombine will make this a no-op. 872 auto *InvarCond = 873 isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; 874 875 for (unsigned Part = 0; Part < State.UF; ++Part) { 876 Value *Cond = InvarCond ? 
InvarCond : State.get(getCond(), Part); 877 Value *Op0 = State.get(getOperand(1), Part); 878 Value *Op1 = State.get(getOperand(2), Part); 879 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 880 State.set(this, Sel, Part); 881 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 882 } 883 } 884 885 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 886 const FastMathFlags &FMF) { 887 AllowReassoc = FMF.allowReassoc(); 888 NoNaNs = FMF.noNaNs(); 889 NoInfs = FMF.noInfs(); 890 NoSignedZeros = FMF.noSignedZeros(); 891 AllowReciprocal = FMF.allowReciprocal(); 892 AllowContract = FMF.allowContract(); 893 ApproxFunc = FMF.approxFunc(); 894 } 895 896 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 897 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 898 switch (OpType) { 899 case OperationType::Cmp: 900 O << " " << CmpInst::getPredicateName(getPredicate()); 901 break; 902 case OperationType::DisjointOp: 903 if (DisjointFlags.IsDisjoint) 904 O << " disjoint"; 905 break; 906 case OperationType::PossiblyExactOp: 907 if (ExactFlags.IsExact) 908 O << " exact"; 909 break; 910 case OperationType::OverflowingBinOp: 911 if (WrapFlags.HasNUW) 912 O << " nuw"; 913 if (WrapFlags.HasNSW) 914 O << " nsw"; 915 break; 916 case OperationType::FPMathOp: 917 getFastMathFlags().print(O); 918 break; 919 case OperationType::GEPOp: 920 if (GEPFlags.IsInBounds) 921 O << " inbounds"; 922 break; 923 case OperationType::NonNegOp: 924 if (NonNegFlags.NonNeg) 925 O << " nneg"; 926 break; 927 case OperationType::Other: 928 break; 929 } 930 if (getNumOperands() > 0) 931 O << " "; 932 } 933 #endif 934 935 void VPWidenRecipe::execute(VPTransformState &State) { 936 State.setDebugLocFrom(getDebugLoc()); 937 auto &Builder = State.Builder; 938 switch (Opcode) { 939 case Instruction::Call: 940 case Instruction::Br: 941 case Instruction::PHI: 942 case Instruction::GetElementPtr: 943 case Instruction::Select: 944 llvm_unreachable("This instruction is handled by a different recipe."); 945 case Instruction::UDiv: 946 case Instruction::SDiv: 947 case Instruction::SRem: 948 case Instruction::URem: 949 case Instruction::Add: 950 case Instruction::FAdd: 951 case Instruction::Sub: 952 case Instruction::FSub: 953 case Instruction::FNeg: 954 case Instruction::Mul: 955 case Instruction::FMul: 956 case Instruction::FDiv: 957 case Instruction::FRem: 958 case Instruction::Shl: 959 case Instruction::LShr: 960 case Instruction::AShr: 961 case Instruction::And: 962 case Instruction::Or: 963 case Instruction::Xor: { 964 // Just widen unops and binops. 965 for (unsigned Part = 0; Part < State.UF; ++Part) { 966 SmallVector<Value *, 2> Ops; 967 for (VPValue *VPOp : operands()) 968 Ops.push_back(State.get(VPOp, Part)); 969 970 Value *V = Builder.CreateNAryOp(Opcode, Ops); 971 972 if (auto *VecOp = dyn_cast<Instruction>(V)) 973 setFlags(VecOp); 974 975 // Use this vector value for all users of the original instruction. 976 State.set(this, V, Part); 977 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 978 } 979 980 break; 981 } 982 case Instruction::Freeze: { 983 for (unsigned Part = 0; Part < State.UF; ++Part) { 984 Value *Op = State.get(getOperand(0), Part); 985 986 Value *Freeze = Builder.CreateFreeze(Op); 987 State.set(this, Freeze, Part); 988 } 989 break; 990 } 991 case Instruction::ICmp: 992 case Instruction::FCmp: { 993 // Widen compares. Generate vector compares. 
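    // E.g. a scalar 'icmp slt i32 %a, %b' is widened to
    // 'icmp slt <VF x i32> %a.vec, %b.vec', producing a <VF x i1> mask.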
994 bool FCmp = Opcode == Instruction::FCmp; 995 for (unsigned Part = 0; Part < State.UF; ++Part) { 996 Value *A = State.get(getOperand(0), Part); 997 Value *B = State.get(getOperand(1), Part); 998 Value *C = nullptr; 999 if (FCmp) { 1000 // Propagate fast math flags. 1001 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1002 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) 1003 Builder.setFastMathFlags(I->getFastMathFlags()); 1004 C = Builder.CreateFCmp(getPredicate(), A, B); 1005 } else { 1006 C = Builder.CreateICmp(getPredicate(), A, B); 1007 } 1008 State.set(this, C, Part); 1009 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1010 } 1011 1012 break; 1013 } 1014 default: 1015 // This instruction is not vectorized by simple widening. 1016 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1017 << Instruction::getOpcodeName(Opcode)); 1018 llvm_unreachable("Unhandled instruction!"); 1019 } // end of switch. 1020 1021 #if !defined(NDEBUG) 1022 // Verify that VPlan type inference results agree with the type of the 1023 // generated values. 1024 for (unsigned Part = 0; Part < State.UF; ++Part) { 1025 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), 1026 State.VF) == State.get(this, Part)->getType() && 1027 "inferred type and type from generated instructions do not match"); 1028 } 1029 #endif 1030 } 1031 1032 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1033 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 1034 VPSlotTracker &SlotTracker) const { 1035 O << Indent << "WIDEN "; 1036 printAsOperand(O, SlotTracker); 1037 O << " = " << Instruction::getOpcodeName(Opcode); 1038 printFlags(O); 1039 printOperands(O, SlotTracker); 1040 } 1041 #endif 1042 1043 void VPWidenCastRecipe::execute(VPTransformState &State) { 1044 State.setDebugLocFrom(getDebugLoc()); 1045 auto &Builder = State.Builder; 1046 /// Vectorize casts. 1047 assert(State.VF.isVector() && "Not vectorizing?"); 1048 Type *DestTy = VectorType::get(getResultType(), State.VF); 1049 VPValue *Op = getOperand(0); 1050 for (unsigned Part = 0; Part < State.UF; ++Part) { 1051 if (Part > 0 && Op->isLiveIn()) { 1052 // FIXME: Remove once explicit unrolling is implemented using VPlan. 1053 State.set(this, State.get(this, 0), Part); 1054 continue; 1055 } 1056 Value *A = State.get(Op, Part); 1057 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 1058 State.set(this, Cast, Part); 1059 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 1060 } 1061 } 1062 1063 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1064 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1065 VPSlotTracker &SlotTracker) const { 1066 O << Indent << "WIDEN-CAST "; 1067 printAsOperand(O, SlotTracker); 1068 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1069 printFlags(O); 1070 printOperands(O, SlotTracker); 1071 O << " to " << *getResultType(); 1072 } 1073 #endif 1074 1075 /// This function adds 1076 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 1077 /// to each vector element of Val. The sequence starts at StartIndex. 1078 /// \p Opcode is relevant for FP induction variable. 1079 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 1080 Instruction::BinaryOps BinOp, ElementCount VF, 1081 IRBuilderBase &Builder) { 1082 assert(VF.isVector() && "only vector VFs are supported"); 1083 1084 // Create and check the types. 
1085 auto *ValVTy = cast<VectorType>(Val->getType()); 1086 ElementCount VLen = ValVTy->getElementCount(); 1087 1088 Type *STy = Val->getType()->getScalarType(); 1089 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1090 "Induction Step must be an integer or FP"); 1091 assert(Step->getType() == STy && "Step has wrong type"); 1092 1093 SmallVector<Constant *, 8> Indices; 1094 1095 // Create a vector of consecutive numbers from zero to VF. 1096 VectorType *InitVecValVTy = ValVTy; 1097 if (STy->isFloatingPointTy()) { 1098 Type *InitVecValSTy = 1099 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 1100 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 1101 } 1102 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 1103 1104 // Splat the StartIdx 1105 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 1106 1107 if (STy->isIntegerTy()) { 1108 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 1109 Step = Builder.CreateVectorSplat(VLen, Step); 1110 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1111 // FIXME: The newly created binary instructions should contain nsw/nuw 1112 // flags, which can be found from the original scalar operations. 1113 Step = Builder.CreateMul(InitVec, Step); 1114 return Builder.CreateAdd(Val, Step, "induction"); 1115 } 1116 1117 // Floating point induction. 1118 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1119 "Binary Opcode should be specified for FP induction"); 1120 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 1121 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 1122 1123 Step = Builder.CreateVectorSplat(VLen, Step); 1124 Value *MulOp = Builder.CreateFMul(InitVec, Step); 1125 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1126 } 1127 1128 /// A helper function that returns an integer or floating-point constant with 1129 /// value C. 1130 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 1131 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 1132 : ConstantFP::get(Ty, C); 1133 } 1134 1135 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 1136 ElementCount VF) { 1137 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1138 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1139 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1140 return B.CreateUIToFP(RuntimeVF, FTy); 1141 } 1142 1143 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1144 assert(!State.Instance && "Int or FP induction being replicated."); 1145 1146 Value *Start = getStartValue()->getLiveInIRValue(); 1147 const InductionDescriptor &ID = getInductionDescriptor(); 1148 TruncInst *Trunc = getTruncInst(); 1149 IRBuilderBase &Builder = State.Builder; 1150 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1151 assert(State.VF.isVector() && "must have vector VF"); 1152 1153 // The value from the original loop to which we are mapping the new induction 1154 // variable. 1155 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1156 1157 // Fast-math-flags propagate from the original induction instruction. 1158 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1159 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1160 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1161 1162 // Now do the actual transformations, and start with fetching the step value. 
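  // E.g. for an i32 IV with start 0, step 1 and VF = 4, the vector IV phi
  // starts at <0, 1, 2, 3> and every further unroll part adds a splat of
  // VF * step, i.e. <4, 4, 4, 4>.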
1163 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1164 1165 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1166 "Expected either an induction phi-node or a truncate of it!"); 1167 1168 // Construct the initial value of the vector IV in the vector loop preheader 1169 auto CurrIP = Builder.saveIP(); 1170 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1171 Builder.SetInsertPoint(VectorPH->getTerminator()); 1172 if (isa<TruncInst>(EntryVal)) { 1173 assert(Start->getType()->isIntegerTy() && 1174 "Truncation requires an integer type"); 1175 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1176 Step = Builder.CreateTrunc(Step, TruncType); 1177 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1178 } 1179 1180 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 1181 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1182 Value *SteppedStart = getStepVector( 1183 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 1184 1185 // We create vector phi nodes for both integer and floating-point induction 1186 // variables. Here, we determine the kind of arithmetic we will perform. 1187 Instruction::BinaryOps AddOp; 1188 Instruction::BinaryOps MulOp; 1189 if (Step->getType()->isIntegerTy()) { 1190 AddOp = Instruction::Add; 1191 MulOp = Instruction::Mul; 1192 } else { 1193 AddOp = ID.getInductionOpcode(); 1194 MulOp = Instruction::FMul; 1195 } 1196 1197 // Multiply the vectorization factor by the step using integer or 1198 // floating-point arithmetic as appropriate. 1199 Type *StepType = Step->getType(); 1200 Value *RuntimeVF; 1201 if (Step->getType()->isFloatingPointTy()) 1202 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 1203 else 1204 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 1205 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1206 1207 // Create a vector splat to use in the induction update. 1208 // 1209 // FIXME: If the step is non-constant, we create the vector splat with 1210 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1211 // handle a constant vector splat. 1212 Value *SplatVF = isa<Constant>(Mul) 1213 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 1214 : Builder.CreateVectorSplat(State.VF, Mul); 1215 Builder.restoreIP(CurrIP); 1216 1217 // We may need to add the step a number of times, depending on the unroll 1218 // factor. The last of those goes into the PHI. 1219 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1220 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1221 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1222 Instruction *LastInduction = VecInd; 1223 for (unsigned Part = 0; Part < State.UF; ++Part) { 1224 State.set(this, LastInduction, Part); 1225 1226 if (isa<TruncInst>(EntryVal)) 1227 State.addMetadata(LastInduction, EntryVal); 1228 1229 LastInduction = cast<Instruction>( 1230 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 1231 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1232 } 1233 1234 LastInduction->setName("vec.ind.next"); 1235 VecInd->addIncoming(SteppedStart, VectorPH); 1236 // Add induction update using an incorrect block temporarily. The phi node 1237 // will be fixed after VPlan execution. Note that at this point the latch 1238 // block cannot be used, as it does not exist yet. 1239 // TODO: Model increment value in VPlan, by turning the recipe into a 1240 // multi-def and a subclass of VPHeaderPHIRecipe. 
1241 VecInd->addIncoming(LastInduction, VectorPH); 1242 } 1243 1244 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1245 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1246 VPSlotTracker &SlotTracker) const { 1247 O << Indent << "WIDEN-INDUCTION"; 1248 if (getTruncInst()) { 1249 O << "\\l\""; 1250 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 1251 O << " +\n" << Indent << "\" "; 1252 getVPValue(0)->printAsOperand(O, SlotTracker); 1253 } else 1254 O << " " << VPlanIngredient(IV); 1255 1256 O << ", "; 1257 getStepValue()->printAsOperand(O, SlotTracker); 1258 } 1259 #endif 1260 1261 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1262 // The step may be defined by a recipe in the preheader (e.g. if it requires 1263 // SCEV expansion), but for the canonical induction the step is required to be 1264 // 1, which is represented as live-in. 1265 if (getStepValue()->getDefiningRecipe()) 1266 return false; 1267 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1268 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1269 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1270 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1271 getScalarType() == CanIV->getScalarType(); 1272 } 1273 1274 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1275 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1276 VPSlotTracker &SlotTracker) const { 1277 O << Indent; 1278 printAsOperand(O, SlotTracker); 1279 O << Indent << "= DERIVED-IV "; 1280 getStartValue()->printAsOperand(O, SlotTracker); 1281 O << " + "; 1282 getOperand(1)->printAsOperand(O, SlotTracker); 1283 O << " * "; 1284 getStepValue()->printAsOperand(O, SlotTracker); 1285 } 1286 #endif 1287 1288 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1289 // Fast-math-flags propagate from the original induction instruction. 1290 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1291 if (hasFastMathFlags()) 1292 State.Builder.setFastMathFlags(getFastMathFlags()); 1293 1294 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1295 /// variable on which to base the steps, \p Step is the size of the step. 1296 1297 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1298 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1299 IRBuilderBase &Builder = State.Builder; 1300 1301 // Ensure step has the same type as that of scalar IV. 1302 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1303 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1304 1305 // We build scalar steps for both integer and floating-point induction 1306 // variables. Here, we determine the kind of arithmetic we will perform. 1307 Instruction::BinaryOps AddOp; 1308 Instruction::BinaryOps MulOp; 1309 if (BaseIVTy->isIntegerTy()) { 1310 AddOp = Instruction::Add; 1311 MulOp = Instruction::Mul; 1312 } else { 1313 AddOp = InductionOpcode; 1314 MulOp = Instruction::FMul; 1315 } 1316 1317 // Determine the number of scalars we need to generate for each unroll 1318 // iteration. 1319 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1320 // Compute the scalar steps and save the results in State. 
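  // E.g. with an integer step of 2, VF = 4 and UF = 2, part 1 starts at
  // StartIdx0 = 4 and its lanes get BaseIV plus 8, 10, 12 and 14.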
1321 Type *IntStepTy = 1322 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1323 Type *VecIVTy = nullptr; 1324 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1325 if (!FirstLaneOnly && State.VF.isScalable()) { 1326 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1327 UnitStepVec = 1328 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1329 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1330 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1331 } 1332 1333 unsigned StartPart = 0; 1334 unsigned EndPart = State.UF; 1335 unsigned StartLane = 0; 1336 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1337 if (State.Instance) { 1338 StartPart = State.Instance->Part; 1339 EndPart = StartPart + 1; 1340 StartLane = State.Instance->Lane.getKnownLane(); 1341 EndLane = StartLane + 1; 1342 } 1343 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1344 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1345 1346 if (!FirstLaneOnly && State.VF.isScalable()) { 1347 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1348 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1349 if (BaseIVTy->isFloatingPointTy()) 1350 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1351 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1352 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1353 State.set(this, Add, Part); 1354 // It's useful to record the lane values too for the known minimum number 1355 // of elements so we do those below. This improves the code quality when 1356 // trying to extract the first element, for example. 1357 } 1358 1359 if (BaseIVTy->isFloatingPointTy()) 1360 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1361 1362 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1363 Value *StartIdx = Builder.CreateBinOp( 1364 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1365 // The step returned by `createStepForVF` is a runtime-evaluated value 1366 // when VF is scalable. Otherwise, it should be folded into a Constant. 1367 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1368 "Expected StartIdx to be folded to a constant when VF is not " 1369 "scalable"); 1370 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1371 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1372 State.set(this, Add, VPIteration(Part, Lane)); 1373 } 1374 } 1375 } 1376 1377 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1378 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1379 VPSlotTracker &SlotTracker) const { 1380 O << Indent; 1381 printAsOperand(O, SlotTracker); 1382 O << " = SCALAR-STEPS "; 1383 printOperands(O, SlotTracker); 1384 } 1385 #endif 1386 1387 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1388 assert(State.VF.isVector() && "not widening"); 1389 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1390 // Construct a vector GEP by widening the operands of the scalar GEP as 1391 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1392 // results in a vector of pointers when at least one operand of the GEP 1393 // is vector-typed. Thus, to keep the representation compact, we only use 1394 // vector-typed operands for loop-varying values. 
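  // E.g. a GEP whose pointer operand is loop-varying but whose index is
  // loop-invariant becomes 'getelementptr T, <VF x ptr> %ptr.vec, i64 %idx',
  // a single GEP per part that yields a vector of pointers.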
1395 1396 if (areAllOperandsInvariant()) { 1397 // If we are vectorizing, but the GEP has only loop-invariant operands, 1398 // the GEP we build (by only using vector-typed operands for 1399 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1400 // produce a vector of pointers, we need to either arbitrarily pick an 1401 // operand to broadcast, or broadcast a clone of the original GEP. 1402 // Here, we broadcast a clone of the original. 1403 // 1404 // TODO: If at some point we decide to scalarize instructions having 1405 // loop-invariant operands, this special case will no longer be 1406 // required. We would add the scalarization decision to 1407 // collectLoopScalars() and teach getVectorValue() to broadcast 1408 // the lane-zero scalar value. 1409 SmallVector<Value *> Ops; 1410 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1411 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1412 1413 auto *NewGEP = 1414 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1415 ArrayRef(Ops).drop_front(), "", isInBounds()); 1416 for (unsigned Part = 0; Part < State.UF; ++Part) { 1417 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1418 State.set(this, EntryPart, Part); 1419 State.addMetadata(EntryPart, GEP); 1420 } 1421 } else { 1422 // If the GEP has at least one loop-varying operand, we are sure to 1423 // produce a vector of pointers. But if we are only unrolling, we want 1424 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1425 // produce with the code below will be scalar (if VF == 1) or vector 1426 // (otherwise). Note that for the unroll-only case, we still maintain 1427 // values in the vector mapping with initVector, as we do for other 1428 // instructions. 1429 for (unsigned Part = 0; Part < State.UF; ++Part) { 1430 // The pointer operand of the new GEP. If it's loop-invariant, we 1431 // won't broadcast it. 1432 auto *Ptr = isPointerLoopInvariant() 1433 ? State.get(getOperand(0), VPIteration(0, 0)) 1434 : State.get(getOperand(0), Part); 1435 1436 // Collect all the indices for the new GEP. If any index is 1437 // loop-invariant, we won't broadcast it. 1438 SmallVector<Value *, 4> Indices; 1439 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1440 VPValue *Operand = getOperand(I); 1441 if (isIndexLoopInvariant(I - 1)) 1442 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1443 else 1444 Indices.push_back(State.get(Operand, Part)); 1445 } 1446 1447 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1448 // but it should be a vector, otherwise. 1449 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1450 Indices, "", isInBounds()); 1451 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1452 "NewGEP is not a pointer vector"); 1453 State.set(this, NewGEP, Part); 1454 State.addMetadata(NewGEP, GEP); 1455 } 1456 } 1457 } 1458 1459 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1460 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1461 VPSlotTracker &SlotTracker) const { 1462 O << Indent << "WIDEN-GEP "; 1463 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1464 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1465 O << "[" << (isIndexLoopInvariant(I) ? 
"Inv" : "Var") << "]"; 1466 1467 O << " "; 1468 printAsOperand(O, SlotTracker); 1469 O << " = getelementptr"; 1470 printFlags(O); 1471 printOperands(O, SlotTracker); 1472 } 1473 #endif 1474 1475 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1476 auto &Builder = State.Builder; 1477 State.setDebugLocFrom(getDebugLoc()); 1478 for (unsigned Part = 0; Part < State.UF; ++Part) { 1479 // Calculate the pointer for the specific unroll-part. 1480 Value *PartPtr = nullptr; 1481 // Use i32 for the gep index type when the value is constant, 1482 // or query DataLayout for a more suitable index type otherwise. 1483 const DataLayout &DL = 1484 Builder.GetInsertBlock()->getModule()->getDataLayout(); 1485 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1486 ? DL.getIndexType(IndexedTy->getPointerTo()) 1487 : Builder.getInt32Ty(); 1488 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1489 bool InBounds = isInBounds(); 1490 if (IsReverse) { 1491 // If the address is consecutive but reversed, then the 1492 // wide store needs to start at the last vector element. 1493 // RunTimeVF = VScale * VF.getKnownMinValue() 1494 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1495 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1496 // NumElt = -Part * RunTimeVF 1497 Value *NumElt = Builder.CreateMul( 1498 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1499 // LastLane = 1 - RunTimeVF 1500 Value *LastLane = 1501 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1502 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1503 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1504 } else { 1505 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1506 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1507 } 1508 1509 State.set(this, PartPtr, Part, /*IsScalar*/ true); 1510 } 1511 } 1512 1513 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1514 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1515 VPSlotTracker &SlotTracker) const { 1516 O << Indent; 1517 printAsOperand(O, SlotTracker); 1518 O << " = vector-pointer "; 1519 if (IsReverse) 1520 O << "(reverse) "; 1521 1522 printOperands(O, SlotTracker); 1523 } 1524 #endif 1525 1526 void VPBlendRecipe::execute(VPTransformState &State) { 1527 State.setDebugLocFrom(getDebugLoc()); 1528 // We know that all PHIs in non-header blocks are converted into 1529 // selects, so we don't have to worry about the insertion order and we 1530 // can just use the builder. 1531 // At this point we generate the predication tree. There may be 1532 // duplications since this is a simple recursive scan, but future 1533 // optimizations will clean it up. 1534 1535 unsigned NumIncoming = getNumIncomingValues(); 1536 1537 // Generate a sequence of selects of the form: 1538 // SELECT(Mask3, In3, 1539 // SELECT(Mask2, In2, 1540 // SELECT(Mask1, In1, 1541 // In0))) 1542 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1543 // are essentially undef are taken from In0. 1544 VectorParts Entry(State.UF); 1545 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1546 for (unsigned In = 0; In < NumIncoming; ++In) { 1547 for (unsigned Part = 0; Part < State.UF; ++Part) { 1548 // We might have single edge PHIs (blocks) - use an identity 1549 // 'select' for the first PHI operand. 

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif
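
// For example (illustrative, VF=4, three incoming values): the loop in
// VPBlendRecipe::execute above builds a select chain per unroll part, roughly:
//   %predphi   = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
//   %predphi.1 = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %predphi
// %mask0 is never consulted; lanes reached by no path simply keep %in0.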

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif
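
// For example (illustrative, unordered conditional integer-add reduction,
// VF=4): VPReductionRecipe::execute above masks inactive lanes with the
// identity before reducing, so each part emits roughly:
//   %sel  = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
//   %rdx  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
//   %next = add i32 %rdx, %prev.chain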
"CLONE " : "REPLICATE "); 1678 1679 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1680 printAsOperand(O, SlotTracker); 1681 O << " = "; 1682 } 1683 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1684 O << "call"; 1685 printFlags(O); 1686 O << "@" << CB->getCalledFunction()->getName() << "("; 1687 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1688 O, [&O, &SlotTracker](VPValue *Op) { 1689 Op->printAsOperand(O, SlotTracker); 1690 }); 1691 O << ")"; 1692 } else { 1693 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1694 printFlags(O); 1695 printOperands(O, SlotTracker); 1696 } 1697 1698 if (shouldPack()) 1699 O << " (S->V)"; 1700 } 1701 #endif 1702 1703 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 1704 /// if it is either defined outside the vector region or its operand is known to 1705 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 1706 /// TODO: Uniformity should be associated with a VPValue and there should be a 1707 /// generic way to check. 1708 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { 1709 return C->isDefinedOutsideVectorRegions() || 1710 isa<VPDerivedIVRecipe>(C->getOperand(0)) || 1711 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)); 1712 } 1713 1714 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { 1715 assert(vputils::onlyFirstLaneUsed(this) && 1716 "Codegen only implemented for first lane."); 1717 switch (Opcode) { 1718 case Instruction::SExt: 1719 case Instruction::ZExt: 1720 case Instruction::Trunc: { 1721 // Note: SExt/ZExt not used yet. 1722 Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); 1723 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 1724 } 1725 default: 1726 llvm_unreachable("opcode not implemented yet"); 1727 } 1728 } 1729 1730 void VPScalarCastRecipe ::execute(VPTransformState &State) { 1731 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); 1732 for (unsigned Part = 0; Part != State.UF; ++Part) { 1733 Value *Res; 1734 // Only generate a single instance, if the recipe is uniform across UFs and 1735 // VFs. 1736 if (Part > 0 && IsUniformAcrossVFsAndUFs) 1737 Res = State.get(this, VPIteration(0, 0)); 1738 else 1739 Res = generate(State, Part); 1740 State.set(this, Res, VPIteration(Part, 0)); 1741 } 1742 } 1743 1744 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1745 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 1746 VPSlotTracker &SlotTracker) const { 1747 O << Indent << "SCALAR-CAST "; 1748 printAsOperand(O, SlotTracker); 1749 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1750 printOperands(O, SlotTracker); 1751 O << " to " << *ResultTy; 1752 } 1753 #endif 1754 1755 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1756 assert(State.Instance && "Branch on Mask works only on single instance."); 1757 1758 unsigned Part = State.Instance->Part; 1759 unsigned Lane = State.Instance->Lane.getKnownLane(); 1760 1761 Value *ConditionBit = nullptr; 1762 VPValue *BlockInMask = getMask(); 1763 if (BlockInMask) { 1764 ConditionBit = State.get(BlockInMask, Part); 1765 if (ConditionBit->getType()->isVectorTy()) 1766 ConditionBit = State.Builder.CreateExtractElement( 1767 ConditionBit, State.Builder.getInt32(Lane)); 1768 } else // Block in mask is all-one. 

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
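
// For example (illustrative, lane 2 of part 0): the code above extracts the
// per-lane condition from the block mask and branches on it, roughly:
//   %c = extractelement <4 x i1> %block.mask, i32 2
//   br i1 %c, label %pred.if, label %pred.continue
// The successor labels shown are placeholders; both successors are still null
// here and are wired up once the predicated blocks exist.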

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif
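
// For example (illustrative, the vector-value case in
// VPPredInstPHIRecipe::execute above, VF=4): the phi merges the vector before
// and after the conditional insertelement, roughly:
//   %vphi = phi <4 x i32> [ %wide, %predicating.bb ],
//                         [ %wide.insert, %predicated.bb ]
// In the scalar case a poison value flows in from the predicating block
// instead.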

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif
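
// For example (illustrative, VF=4, UF=2): VPWidenCanonicalIVRecipe below
// broadcasts the scalar canonical IV and adds a per-part step, roughly:
//   part 0: %vec.iv   = %broadcast.index + <0, 1, 2, 3>
//   part 1: %vec.iv.1 = %broadcast.index + <4, 5, 6, 7>
// built from vector splats plus a step vector (a constant for fixed VFs).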

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
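
// For example (illustrative, VF=4): VPFirstOrderRecurrencePHIRecipe::execute
// above seeds the recurrence phi with the scalar initial value in the last
// lane, roughly:
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3
//   %vector.recur      = phi <4 x i32> [ %vector.recur.init, %vector.ph ], ...
// so that splicing it with the current iteration's vector puts the value of
// the previous scalar iteration into lane 0.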

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
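
// For example (illustrative, unordered integer-add reduction, VF=4, UF=2):
// VPReductionPHIRecipe::execute above splats the identity and inserts the
// scalar start value into lane 0 of part 0 only, roughly:
//   %start     = insertelement <4 x i32> zeroinitializer, i32 %rdx.start, i32 0
//   %vec.phi   = phi <4 x i32> [ %start, %vector.ph ], ...          ; part 0
//   %vec.phi.1 = phi <4 x i32> [ zeroinitializer, %vector.ph ], ... ; part 1
// The backedge values are added later, when the reduction itself is emitted.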

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
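
// For example (illustrative): VPEVLBasedIVPHIRecipe::execute above only
// creates the header phi with its preheader operand, e.g.
//   %evl.based.iv = phi i64 [ 0, %vector.ph ]
// The backedge value (the IV advanced by the explicit vector length) is added
// once the corresponding increment has been generated.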