1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "VPlan.h" 15 #include "VPlanAnalysis.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/Twine.h" 19 #include "llvm/Analysis/IVDescriptors.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/IRBuilder.h" 22 #include "llvm/IR/Instruction.h" 23 #include "llvm/IR/Instructions.h" 24 #include "llvm/IR/Type.h" 25 #include "llvm/IR/Value.h" 26 #include "llvm/Support/Casting.h" 27 #include "llvm/Support/CommandLine.h" 28 #include "llvm/Support/Debug.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 31 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 32 #include <cassert> 33 34 using namespace llvm; 35 36 using VectorParts = SmallVector<Value *, 2>; 37 38 namespace llvm { 39 extern cl::opt<bool> EnableVPlanNativePath; 40 } 41 42 #define LV_NAME "loop-vectorize" 43 #define DEBUG_TYPE LV_NAME 44 45 bool VPRecipeBase::mayWriteToMemory() const { 46 switch (getVPDefID()) { 47 case VPInterleaveSC: 48 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 49 case VPWidenMemoryInstructionSC: { 50 return cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); 51 } 52 case VPReplicateSC: 53 case VPWidenCallSC: 54 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 55 ->mayWriteToMemory(); 56 case VPBranchOnMaskSC: 57 case VPScalarIVStepsSC: 58 case VPPredInstPHISC: 59 return false; 60 case VPBlendSC: 61 case VPReductionSC: 62 case VPWidenCanonicalIVSC: 63 case VPWidenCastSC: 64 case VPWidenGEPSC: 65 case VPWidenIntOrFpInductionSC: 66 case VPWidenPHISC: 67 case VPWidenSC: 68 case VPWidenSelectSC: { 69 const Instruction *I = 70 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 71 (void)I; 72 assert((!I || !I->mayWriteToMemory()) && 73 "underlying instruction may write to memory"); 74 return false; 75 } 76 default: 77 return true; 78 } 79 } 80 81 bool VPRecipeBase::mayReadFromMemory() const { 82 switch (getVPDefID()) { 83 case VPWidenMemoryInstructionSC: { 84 return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); 85 } 86 case VPReplicateSC: 87 case VPWidenCallSC: 88 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 89 ->mayReadFromMemory(); 90 case VPBranchOnMaskSC: 91 case VPScalarIVStepsSC: 92 case VPPredInstPHISC: 93 return false; 94 case VPBlendSC: 95 case VPReductionSC: 96 case VPWidenCanonicalIVSC: 97 case VPWidenCastSC: 98 case VPWidenGEPSC: 99 case VPWidenIntOrFpInductionSC: 100 case VPWidenPHISC: 101 case VPWidenSC: 102 case VPWidenSelectSC: { 103 const Instruction *I = 104 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 105 (void)I; 106 assert((!I || !I->mayReadFromMemory()) && 107 "underlying instruction may read from memory"); 108 return false; 109 } 110 default: 111 return true; 112 } 113 } 114 115 bool VPRecipeBase::mayHaveSideEffects() const { 116 switch (getVPDefID()) { 117 case VPDerivedIVSC: 118 case VPPredInstPHISC: 119 return false; 120 case VPInstructionSC: 121 switch (cast<VPInstruction>(this)->getOpcode()) { 122 case Instruction::ICmp: 123 case VPInstruction::Not: 124 case VPInstruction::CalculateTripCountMinusVF: 125 case VPInstruction::CanonicalIVIncrementForPart: 126 return false; 127 default: 128 return true; 129 } 130 case VPWidenCallSC: 131 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 132 ->mayHaveSideEffects(); 133 case VPBlendSC: 134 case VPReductionSC: 135 case VPScalarIVStepsSC: 136 case VPWidenCanonicalIVSC: 137 case VPWidenCastSC: 138 case VPWidenGEPSC: 139 case VPWidenIntOrFpInductionSC: 140 case VPWidenPHISC: 141 case VPWidenPointerInductionSC: 142 case VPWidenSC: 143 case VPWidenSelectSC: { 144 const Instruction *I = 145 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 146 (void)I; 147 assert((!I || !I->mayHaveSideEffects()) && 148 "underlying instruction has side-effects"); 149 return false; 150 } 151 case VPInterleaveSC: 152 return mayWriteToMemory(); 153 case VPWidenMemoryInstructionSC: 154 assert(cast<VPWidenMemoryInstructionRecipe>(this) 155 ->getIngredient() 156 .mayHaveSideEffects() == mayWriteToMemory() && 157 "mayHaveSideffects result for ingredient differs from this " 158 "implementation"); 159 return mayWriteToMemory(); 160 case VPReplicateSC: { 161 auto *R = cast<VPReplicateRecipe>(this); 162 return R->getUnderlyingInstr()->mayHaveSideEffects(); 163 } 164 default: 165 return true; 166 } 167 } 168 169 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { 170 auto Lane = VPLane::getLastLaneForVF(State.VF); 171 VPValue *ExitValue = getOperand(0); 172 if (vputils::isUniformAfterVectorization(ExitValue)) 173 Lane = VPLane::getFirstLane(); 174 VPBasicBlock *MiddleVPBB = 175 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); 176 assert(MiddleVPBB->getNumSuccessors() == 0 && 177 "the middle block must not have any successors"); 178 BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; 179 Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), 180 MiddleBB); 181 } 182 183 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 184 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { 185 O << "Live-out "; 186 getPhi()->printAsOperand(O); 187 O << " = "; 188 getOperand(0)->printAsOperand(O, SlotTracker); 189 O << "\n"; 190 } 191 #endif 192 193 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 194 assert(!Parent && "Recipe already in some VPBasicBlock"); 195 assert(InsertPos->getParent() && 196 "Insertion position not in any VPBasicBlock"); 197 Parent = InsertPos->getParent(); 198 Parent->getRecipeList().insert(InsertPos->getIterator(), this); 199 } 200 201 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 202 iplist<VPRecipeBase>::iterator I) { 203 assert(!Parent && "Recipe already in some VPBasicBlock"); 204 assert(I == BB.end() || I->getParent() == &BB); 205 Parent = &BB; 206 BB.getRecipeList().insert(I, this); 207 } 208 209 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 210 assert(!Parent && "Recipe already in some VPBasicBlock"); 211 assert(InsertPos->getParent() && 212 "Insertion position not in any VPBasicBlock"); 213 Parent = InsertPos->getParent(); 214 Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); 215 } 216 217 void VPRecipeBase::removeFromParent() { 218 assert(getParent() && "Recipe not in any VPBasicBlock"); 219 getParent()->getRecipeList().remove(getIterator()); 220 Parent = nullptr; 221 } 222 223 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 224 assert(getParent() && "Recipe not in any VPBasicBlock"); 225 return getParent()->getRecipeList().erase(getIterator()); 226 } 227 228 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 229 removeFromParent(); 230 insertAfter(InsertPos); 231 } 232 233 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 234 iplist<VPRecipeBase>::iterator I) { 235 removeFromParent(); 236 insertBefore(BB, I); 237 } 238 239 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 240 assert(OpType == OperationType::FPMathOp && 241 "recipe doesn't have fast math flags"); 242 FastMathFlags Res; 243 Res.setAllowReassoc(FMFs.AllowReassoc); 244 Res.setNoNaNs(FMFs.NoNaNs); 245 Res.setNoInfs(FMFs.NoInfs); 246 Res.setNoSignedZeros(FMFs.NoSignedZeros); 247 Res.setAllowReciprocal(FMFs.AllowReciprocal); 248 Res.setAllowContract(FMFs.AllowContract); 249 Res.setApproxFunc(FMFs.ApproxFunc); 250 return Res; 251 } 252 253 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 254 VPValue *A, VPValue *B, DebugLoc DL, 255 const Twine &Name) 256 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 257 Pred, DL), 258 VPValue(this), Opcode(Opcode), Name(Name.str()) { 259 assert(Opcode == Instruction::ICmp && 260 "only ICmp predicates supported at the moment"); 261 } 262 263 VPInstruction::VPInstruction(unsigned Opcode, 264 std::initializer_list<VPValue *> Operands, 265 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 266 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 267 VPValue(this), Opcode(Opcode), Name(Name.str()) { 268 // Make sure the VPInstruction is a floating-point operation. 269 assert(isFPMathOp() && "this op can't take fast-math flags"); 270 } 271 272 Value *VPInstruction::generateInstruction(VPTransformState &State, 273 unsigned Part) { 274 IRBuilderBase &Builder = State.Builder; 275 Builder.SetCurrentDebugLocation(getDebugLoc()); 276 277 if (Instruction::isBinaryOp(getOpcode())) { 278 if (Part != 0 && vputils::onlyFirstPartUsed(this)) 279 return State.get(this, 0); 280 281 Value *A = State.get(getOperand(0), Part); 282 Value *B = State.get(getOperand(1), Part); 283 auto *Res = 284 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 285 if (auto *I = dyn_cast<Instruction>(Res)) 286 setFlags(I); 287 return Res; 288 } 289 290 switch (getOpcode()) { 291 case VPInstruction::Not: { 292 Value *A = State.get(getOperand(0), Part); 293 return Builder.CreateNot(A, Name); 294 } 295 case Instruction::ICmp: { 296 Value *A = State.get(getOperand(0), Part); 297 Value *B = State.get(getOperand(1), Part); 298 return Builder.CreateCmp(getPredicate(), A, B, Name); 299 } 300 case Instruction::Select: { 301 Value *Cond = State.get(getOperand(0), Part); 302 Value *Op1 = State.get(getOperand(1), Part); 303 Value *Op2 = State.get(getOperand(2), Part); 304 return Builder.CreateSelect(Cond, Op1, Op2, Name); 305 } 306 case VPInstruction::ActiveLaneMask: { 307 // Get first lane of vector induction variable. 308 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); 309 // Get the original loop tripcount. 310 Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); 311 312 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 313 auto *PredTy = VectorType::get(Int1Ty, State.VF); 314 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 315 {PredTy, ScalarTC->getType()}, 316 {VIVElem0, ScalarTC}, nullptr, Name); 317 } 318 case VPInstruction::FirstOrderRecurrenceSplice: { 319 // Generate code to combine the previous and current values in vector v3. 320 // 321 // vector.ph: 322 // v_init = vector(..., ..., ..., a[-1]) 323 // br vector.body 324 // 325 // vector.body 326 // i = phi [0, vector.ph], [i+4, vector.body] 327 // v1 = phi [v_init, vector.ph], [v2, vector.body] 328 // v2 = a[i, i+1, i+2, i+3]; 329 // v3 = vector(v1(3), v2(0, 1, 2)) 330 331 // For the first part, use the recurrence phi (v1), otherwise v2. 332 auto *V1 = State.get(getOperand(0), 0); 333 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); 334 if (!PartMinus1->getType()->isVectorTy()) 335 return PartMinus1; 336 Value *V2 = State.get(getOperand(1), Part); 337 return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); 338 } 339 case VPInstruction::CalculateTripCountMinusVF: { 340 Value *ScalarTC = State.get(getOperand(0), {0, 0}); 341 Value *Step = 342 createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); 343 Value *Sub = Builder.CreateSub(ScalarTC, Step); 344 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 345 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 346 return Builder.CreateSelect(Cmp, Sub, Zero); 347 } 348 case VPInstruction::CanonicalIVIncrementForPart: { 349 auto *IV = State.get(getOperand(0), VPIteration(0, 0)); 350 if (Part == 0) 351 return IV; 352 353 // The canonical IV is incremented by the vectorization factor (num of SIMD 354 // elements) times the unroll part. 355 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 356 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 357 hasNoSignedWrap()); 358 } 359 case VPInstruction::BranchOnCond: { 360 if (Part != 0) 361 return nullptr; 362 363 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); 364 VPRegionBlock *ParentRegion = getParent()->getParent(); 365 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 366 367 // Replace the temporary unreachable terminator with a new conditional 368 // branch, hooking it up to backward destination for exiting blocks now and 369 // to forward destination(s) later when they are created. 370 BranchInst *CondBr = 371 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 372 373 if (getParent()->isExiting()) 374 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 375 376 CondBr->setSuccessor(0, nullptr); 377 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 378 return CondBr; 379 } 380 case VPInstruction::BranchOnCount: { 381 if (Part != 0) 382 return nullptr; 383 // First create the compare. 384 Value *IV = State.get(getOperand(0), Part); 385 Value *TC = State.get(getOperand(1), Part); 386 Value *Cond = Builder.CreateICmpEQ(IV, TC); 387 388 // Now create the branch. 389 auto *Plan = getParent()->getPlan(); 390 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 391 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 392 393 // Replace the temporary unreachable terminator with a new conditional 394 // branch, hooking it up to backward destination (the header) now and to the 395 // forward destination (the exit/middle block) later when it is created. 396 // Note that CreateCondBr expects a valid BB as first argument, so we need 397 // to set it to nullptr later. 398 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 399 State.CFG.VPBB2IRBB[Header]); 400 CondBr->setSuccessor(0, nullptr); 401 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 402 return CondBr; 403 } 404 default: 405 llvm_unreachable("Unsupported opcode for instruction"); 406 } 407 } 408 409 #if !defined(NDEBUG) 410 bool VPInstruction::isFPMathOp() const { 411 // Inspired by FPMathOperator::classof. Notable differences are that we don't 412 // support Call, PHI and Select opcodes here yet. 413 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 414 Opcode == Instruction::FNeg || Opcode == Instruction::FSub || 415 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 416 Opcode == Instruction::FCmp || Opcode == Instruction::Select; 417 } 418 #endif 419 420 void VPInstruction::execute(VPTransformState &State) { 421 assert(!State.Instance && "VPInstruction executing an Instance"); 422 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 423 assert((hasFastMathFlags() == isFPMathOp() || 424 getOpcode() == Instruction::Select) && 425 "Recipe not a FPMathOp but has fast-math flags?"); 426 if (hasFastMathFlags()) 427 State.Builder.setFastMathFlags(getFastMathFlags()); 428 for (unsigned Part = 0; Part < State.UF; ++Part) { 429 Value *GeneratedValue = generateInstruction(State, Part); 430 if (!hasResult()) 431 continue; 432 assert(GeneratedValue && "generateInstruction must produce a value"); 433 State.set(this, GeneratedValue, Part); 434 } 435 } 436 437 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 438 void VPInstruction::dump() const { 439 VPSlotTracker SlotTracker(getParent()->getPlan()); 440 print(dbgs(), "", SlotTracker); 441 } 442 443 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 444 VPSlotTracker &SlotTracker) const { 445 O << Indent << "EMIT "; 446 447 if (hasResult()) { 448 printAsOperand(O, SlotTracker); 449 O << " = "; 450 } 451 452 switch (getOpcode()) { 453 case VPInstruction::Not: 454 O << "not"; 455 break; 456 case VPInstruction::SLPLoad: 457 O << "combined load"; 458 break; 459 case VPInstruction::SLPStore: 460 O << "combined store"; 461 break; 462 case VPInstruction::ActiveLaneMask: 463 O << "active lane mask"; 464 break; 465 case VPInstruction::FirstOrderRecurrenceSplice: 466 O << "first-order splice"; 467 break; 468 case VPInstruction::BranchOnCond: 469 O << "branch-on-cond"; 470 break; 471 case VPInstruction::CalculateTripCountMinusVF: 472 O << "TC > VF ? TC - VF : 0"; 473 break; 474 case VPInstruction::CanonicalIVIncrementForPart: 475 O << "VF * Part +"; 476 break; 477 case VPInstruction::BranchOnCount: 478 O << "branch-on-count"; 479 break; 480 default: 481 O << Instruction::getOpcodeName(getOpcode()); 482 } 483 484 printFlags(O); 485 printOperands(O, SlotTracker); 486 487 if (auto DL = getDebugLoc()) { 488 O << ", !dbg "; 489 DL.print(O); 490 } 491 } 492 #endif 493 494 void VPWidenCallRecipe::execute(VPTransformState &State) { 495 assert(State.VF.isVector() && "not widening"); 496 auto &CI = *cast<CallInst>(getUnderlyingInstr()); 497 assert(!isa<DbgInfoIntrinsic>(CI) && 498 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 499 State.setDebugLocFrom(CI.getDebugLoc()); 500 501 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; 502 FunctionType *VFTy = nullptr; 503 if (Variant) 504 VFTy = Variant->getFunctionType(); 505 for (unsigned Part = 0; Part < State.UF; ++Part) { 506 SmallVector<Type *, 2> TysForDecl; 507 // Add return type if intrinsic is overloaded on it. 508 if (UseIntrinsic && 509 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) 510 TysForDecl.push_back( 511 VectorType::get(CI.getType()->getScalarType(), State.VF)); 512 SmallVector<Value *, 4> Args; 513 for (const auto &I : enumerate(operands())) { 514 // Some intrinsics have a scalar argument - don't replace it with a 515 // vector. 516 // Some vectorized function variants may also take a scalar argument, 517 // e.g. linear parameters for pointers. 518 Value *Arg; 519 if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || 520 (UseIntrinsic && 521 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) 522 Arg = State.get(I.value(), VPIteration(0, 0)); 523 else 524 Arg = State.get(I.value(), Part); 525 if (UseIntrinsic && 526 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) 527 TysForDecl.push_back(Arg->getType()); 528 Args.push_back(Arg); 529 } 530 531 Function *VectorF; 532 if (UseIntrinsic) { 533 // Use vector version of the intrinsic. 534 Module *M = State.Builder.GetInsertBlock()->getModule(); 535 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); 536 assert(VectorF && "Can't retrieve vector intrinsic."); 537 } else { 538 #ifndef NDEBUG 539 assert(Variant != nullptr && "Can't create vector function."); 540 #endif 541 VectorF = Variant; 542 } 543 544 SmallVector<OperandBundleDef, 1> OpBundles; 545 CI.getOperandBundlesAsDefs(OpBundles); 546 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 547 548 if (isa<FPMathOperator>(V)) 549 V->copyFastMathFlags(&CI); 550 551 State.set(this, V, Part); 552 State.addMetadata(V, &CI); 553 } 554 } 555 556 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 557 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 558 VPSlotTracker &SlotTracker) const { 559 O << Indent << "WIDEN-CALL "; 560 561 auto *CI = cast<CallInst>(getUnderlyingInstr()); 562 if (CI->getType()->isVoidTy()) 563 O << "void "; 564 else { 565 printAsOperand(O, SlotTracker); 566 O << " = "; 567 } 568 569 O << "call @" << CI->getCalledFunction()->getName() << "("; 570 printOperands(O, SlotTracker); 571 O << ")"; 572 573 if (VectorIntrinsicID) 574 O << " (using vector intrinsic)"; 575 else { 576 O << " (using library function"; 577 if (Variant->hasName()) 578 O << ": " << Variant->getName(); 579 O << ")"; 580 } 581 } 582 583 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 584 VPSlotTracker &SlotTracker) const { 585 O << Indent << "WIDEN-SELECT "; 586 printAsOperand(O, SlotTracker); 587 O << " = select "; 588 getOperand(0)->printAsOperand(O, SlotTracker); 589 O << ", "; 590 getOperand(1)->printAsOperand(O, SlotTracker); 591 O << ", "; 592 getOperand(2)->printAsOperand(O, SlotTracker); 593 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 594 } 595 #endif 596 597 void VPWidenSelectRecipe::execute(VPTransformState &State) { 598 State.setDebugLocFrom(getDebugLoc()); 599 600 // The condition can be loop invariant but still defined inside the 601 // loop. This means that we can't just use the original 'cond' value. 602 // We have to take the 'vectorized' value and pick the first lane. 603 // Instcombine will make this a no-op. 604 auto *InvarCond = 605 isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; 606 607 for (unsigned Part = 0; Part < State.UF; ++Part) { 608 Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part); 609 Value *Op0 = State.get(getOperand(1), Part); 610 Value *Op1 = State.get(getOperand(2), Part); 611 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 612 State.set(this, Sel, Part); 613 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 614 } 615 } 616 617 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 618 const FastMathFlags &FMF) { 619 AllowReassoc = FMF.allowReassoc(); 620 NoNaNs = FMF.noNaNs(); 621 NoInfs = FMF.noInfs(); 622 NoSignedZeros = FMF.noSignedZeros(); 623 AllowReciprocal = FMF.allowReciprocal(); 624 AllowContract = FMF.allowContract(); 625 ApproxFunc = FMF.approxFunc(); 626 } 627 628 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 629 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 630 switch (OpType) { 631 case OperationType::Cmp: 632 O << " " << CmpInst::getPredicateName(getPredicate()); 633 break; 634 case OperationType::DisjointOp: 635 if (DisjointFlags.IsDisjoint) 636 O << " disjoint"; 637 break; 638 case OperationType::PossiblyExactOp: 639 if (ExactFlags.IsExact) 640 O << " exact"; 641 break; 642 case OperationType::OverflowingBinOp: 643 if (WrapFlags.HasNUW) 644 O << " nuw"; 645 if (WrapFlags.HasNSW) 646 O << " nsw"; 647 break; 648 case OperationType::FPMathOp: 649 getFastMathFlags().print(O); 650 break; 651 case OperationType::GEPOp: 652 if (GEPFlags.IsInBounds) 653 O << " inbounds"; 654 break; 655 case OperationType::NonNegOp: 656 if (NonNegFlags.NonNeg) 657 O << " nneg"; 658 break; 659 case OperationType::Other: 660 break; 661 } 662 if (getNumOperands() > 0) 663 O << " "; 664 } 665 #endif 666 667 void VPWidenRecipe::execute(VPTransformState &State) { 668 State.setDebugLocFrom(getDebugLoc()); 669 auto &Builder = State.Builder; 670 switch (Opcode) { 671 case Instruction::Call: 672 case Instruction::Br: 673 case Instruction::PHI: 674 case Instruction::GetElementPtr: 675 case Instruction::Select: 676 llvm_unreachable("This instruction is handled by a different recipe."); 677 case Instruction::UDiv: 678 case Instruction::SDiv: 679 case Instruction::SRem: 680 case Instruction::URem: 681 case Instruction::Add: 682 case Instruction::FAdd: 683 case Instruction::Sub: 684 case Instruction::FSub: 685 case Instruction::FNeg: 686 case Instruction::Mul: 687 case Instruction::FMul: 688 case Instruction::FDiv: 689 case Instruction::FRem: 690 case Instruction::Shl: 691 case Instruction::LShr: 692 case Instruction::AShr: 693 case Instruction::And: 694 case Instruction::Or: 695 case Instruction::Xor: { 696 // Just widen unops and binops. 697 for (unsigned Part = 0; Part < State.UF; ++Part) { 698 SmallVector<Value *, 2> Ops; 699 for (VPValue *VPOp : operands()) 700 Ops.push_back(State.get(VPOp, Part)); 701 702 Value *V = Builder.CreateNAryOp(Opcode, Ops); 703 704 if (auto *VecOp = dyn_cast<Instruction>(V)) 705 setFlags(VecOp); 706 707 // Use this vector value for all users of the original instruction. 708 State.set(this, V, Part); 709 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 710 } 711 712 break; 713 } 714 case Instruction::Freeze: { 715 for (unsigned Part = 0; Part < State.UF; ++Part) { 716 Value *Op = State.get(getOperand(0), Part); 717 718 Value *Freeze = Builder.CreateFreeze(Op); 719 State.set(this, Freeze, Part); 720 } 721 break; 722 } 723 case Instruction::ICmp: 724 case Instruction::FCmp: { 725 // Widen compares. Generate vector compares. 726 bool FCmp = Opcode == Instruction::FCmp; 727 for (unsigned Part = 0; Part < State.UF; ++Part) { 728 Value *A = State.get(getOperand(0), Part); 729 Value *B = State.get(getOperand(1), Part); 730 Value *C = nullptr; 731 if (FCmp) { 732 // Propagate fast math flags. 733 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 734 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) 735 Builder.setFastMathFlags(I->getFastMathFlags()); 736 C = Builder.CreateFCmp(getPredicate(), A, B); 737 } else { 738 C = Builder.CreateICmp(getPredicate(), A, B); 739 } 740 State.set(this, C, Part); 741 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 742 } 743 744 break; 745 } 746 default: 747 // This instruction is not vectorized by simple widening. 748 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 749 << Instruction::getOpcodeName(Opcode)); 750 llvm_unreachable("Unhandled instruction!"); 751 } // end of switch. 752 753 #if !defined(NDEBUG) 754 // Verify that VPlan type inference results agree with the type of the 755 // generated values. 756 for (unsigned Part = 0; Part < State.UF; ++Part) { 757 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), 758 State.VF) == State.get(this, Part)->getType() && 759 "inferred type and type from generated instructions do not match"); 760 } 761 #endif 762 } 763 764 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 765 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 766 VPSlotTracker &SlotTracker) const { 767 O << Indent << "WIDEN "; 768 printAsOperand(O, SlotTracker); 769 O << " = " << Instruction::getOpcodeName(Opcode); 770 printFlags(O); 771 printOperands(O, SlotTracker); 772 } 773 #endif 774 775 void VPWidenCastRecipe::execute(VPTransformState &State) { 776 State.setDebugLocFrom(getDebugLoc()); 777 auto &Builder = State.Builder; 778 /// Vectorize casts. 779 assert(State.VF.isVector() && "Not vectorizing?"); 780 Type *DestTy = VectorType::get(getResultType(), State.VF); 781 VPValue *Op = getOperand(0); 782 for (unsigned Part = 0; Part < State.UF; ++Part) { 783 if (Part > 0 && Op->isLiveIn()) { 784 // FIXME: Remove once explicit unrolling is implemented using VPlan. 785 State.set(this, State.get(this, 0), Part); 786 continue; 787 } 788 Value *A = State.get(Op, Part); 789 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 790 State.set(this, Cast, Part); 791 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 792 } 793 } 794 795 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 796 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 797 VPSlotTracker &SlotTracker) const { 798 O << Indent << "WIDEN-CAST "; 799 printAsOperand(O, SlotTracker); 800 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 801 printFlags(O); 802 printOperands(O, SlotTracker); 803 O << " to " << *getResultType(); 804 } 805 #endif 806 807 /// This function adds 808 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 809 /// to each vector element of Val. The sequence starts at StartIndex. 810 /// \p Opcode is relevant for FP induction variable. 811 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 812 Instruction::BinaryOps BinOp, ElementCount VF, 813 IRBuilderBase &Builder) { 814 assert(VF.isVector() && "only vector VFs are supported"); 815 816 // Create and check the types. 817 auto *ValVTy = cast<VectorType>(Val->getType()); 818 ElementCount VLen = ValVTy->getElementCount(); 819 820 Type *STy = Val->getType()->getScalarType(); 821 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 822 "Induction Step must be an integer or FP"); 823 assert(Step->getType() == STy && "Step has wrong type"); 824 825 SmallVector<Constant *, 8> Indices; 826 827 // Create a vector of consecutive numbers from zero to VF. 828 VectorType *InitVecValVTy = ValVTy; 829 if (STy->isFloatingPointTy()) { 830 Type *InitVecValSTy = 831 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 832 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 833 } 834 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 835 836 // Splat the StartIdx 837 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 838 839 if (STy->isIntegerTy()) { 840 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 841 Step = Builder.CreateVectorSplat(VLen, Step); 842 assert(Step->getType() == Val->getType() && "Invalid step vec"); 843 // FIXME: The newly created binary instructions should contain nsw/nuw 844 // flags, which can be found from the original scalar operations. 845 Step = Builder.CreateMul(InitVec, Step); 846 return Builder.CreateAdd(Val, Step, "induction"); 847 } 848 849 // Floating point induction. 850 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 851 "Binary Opcode should be specified for FP induction"); 852 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 853 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 854 855 Step = Builder.CreateVectorSplat(VLen, Step); 856 Value *MulOp = Builder.CreateFMul(InitVec, Step); 857 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 858 } 859 860 /// A helper function that returns an integer or floating-point constant with 861 /// value C. 862 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 863 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 864 : ConstantFP::get(Ty, C); 865 } 866 867 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 868 ElementCount VF) { 869 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 870 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 871 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 872 return B.CreateUIToFP(RuntimeVF, FTy); 873 } 874 875 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 876 assert(!State.Instance && "Int or FP induction being replicated."); 877 878 Value *Start = getStartValue()->getLiveInIRValue(); 879 const InductionDescriptor &ID = getInductionDescriptor(); 880 TruncInst *Trunc = getTruncInst(); 881 IRBuilderBase &Builder = State.Builder; 882 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 883 assert(State.VF.isVector() && "must have vector VF"); 884 885 // The value from the original loop to which we are mapping the new induction 886 // variable. 887 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 888 889 // Fast-math-flags propagate from the original induction instruction. 890 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 891 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 892 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 893 894 // Now do the actual transformations, and start with fetching the step value. 895 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 896 897 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 898 "Expected either an induction phi-node or a truncate of it!"); 899 900 // Construct the initial value of the vector IV in the vector loop preheader 901 auto CurrIP = Builder.saveIP(); 902 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 903 Builder.SetInsertPoint(VectorPH->getTerminator()); 904 if (isa<TruncInst>(EntryVal)) { 905 assert(Start->getType()->isIntegerTy() && 906 "Truncation requires an integer type"); 907 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 908 Step = Builder.CreateTrunc(Step, TruncType); 909 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 910 } 911 912 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 913 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 914 Value *SteppedStart = getStepVector( 915 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 916 917 // We create vector phi nodes for both integer and floating-point induction 918 // variables. Here, we determine the kind of arithmetic we will perform. 919 Instruction::BinaryOps AddOp; 920 Instruction::BinaryOps MulOp; 921 if (Step->getType()->isIntegerTy()) { 922 AddOp = Instruction::Add; 923 MulOp = Instruction::Mul; 924 } else { 925 AddOp = ID.getInductionOpcode(); 926 MulOp = Instruction::FMul; 927 } 928 929 // Multiply the vectorization factor by the step using integer or 930 // floating-point arithmetic as appropriate. 931 Type *StepType = Step->getType(); 932 Value *RuntimeVF; 933 if (Step->getType()->isFloatingPointTy()) 934 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 935 else 936 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 937 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 938 939 // Create a vector splat to use in the induction update. 940 // 941 // FIXME: If the step is non-constant, we create the vector splat with 942 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 943 // handle a constant vector splat. 944 Value *SplatVF = isa<Constant>(Mul) 945 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 946 : Builder.CreateVectorSplat(State.VF, Mul); 947 Builder.restoreIP(CurrIP); 948 949 // We may need to add the step a number of times, depending on the unroll 950 // factor. The last of those goes into the PHI. 951 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 952 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 953 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 954 Instruction *LastInduction = VecInd; 955 for (unsigned Part = 0; Part < State.UF; ++Part) { 956 State.set(this, LastInduction, Part); 957 958 if (isa<TruncInst>(EntryVal)) 959 State.addMetadata(LastInduction, EntryVal); 960 961 LastInduction = cast<Instruction>( 962 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 963 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 964 } 965 966 LastInduction->setName("vec.ind.next"); 967 VecInd->addIncoming(SteppedStart, VectorPH); 968 // Add induction update using an incorrect block temporarily. The phi node 969 // will be fixed after VPlan execution. Note that at this point the latch 970 // block cannot be used, as it does not exist yet. 971 // TODO: Model increment value in VPlan, by turning the recipe into a 972 // multi-def and a subclass of VPHeaderPHIRecipe. 973 VecInd->addIncoming(LastInduction, VectorPH); 974 } 975 976 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 977 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 978 VPSlotTracker &SlotTracker) const { 979 O << Indent << "WIDEN-INDUCTION"; 980 if (getTruncInst()) { 981 O << "\\l\""; 982 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 983 O << " +\n" << Indent << "\" "; 984 getVPValue(0)->printAsOperand(O, SlotTracker); 985 } else 986 O << " " << VPlanIngredient(IV); 987 988 O << ", "; 989 getStepValue()->printAsOperand(O, SlotTracker); 990 } 991 #endif 992 993 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 994 // The step may be defined by a recipe in the preheader (e.g. if it requires 995 // SCEV expansion), but for the canonical induction the step is required to be 996 // 1, which is represented as live-in. 997 if (getStepValue()->getDefiningRecipe()) 998 return false; 999 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1000 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1001 return StartC && StartC->isZero() && StepC && StepC->isOne(); 1002 } 1003 1004 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1005 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1006 VPSlotTracker &SlotTracker) const { 1007 O << Indent; 1008 printAsOperand(O, SlotTracker); 1009 O << Indent << "= DERIVED-IV "; 1010 getStartValue()->printAsOperand(O, SlotTracker); 1011 O << " + "; 1012 getCanonicalIV()->printAsOperand(O, SlotTracker); 1013 O << " * "; 1014 getStepValue()->printAsOperand(O, SlotTracker); 1015 1016 if (TruncResultTy) 1017 O << " (truncated to " << *TruncResultTy << ")"; 1018 } 1019 #endif 1020 1021 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1022 // Fast-math-flags propagate from the original induction instruction. 1023 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1024 if (hasFastMathFlags()) 1025 State.Builder.setFastMathFlags(getFastMathFlags()); 1026 1027 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1028 /// variable on which to base the steps, \p Step is the size of the step. 1029 1030 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1031 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1032 IRBuilderBase &Builder = State.Builder; 1033 1034 // Ensure step has the same type as that of scalar IV. 1035 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1036 if (BaseIVTy != Step->getType()) { 1037 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to 1038 // avoid separate truncate here. 1039 assert(Step->getType()->isIntegerTy() && 1040 "Truncation requires an integer step"); 1041 Step = State.Builder.CreateTrunc(Step, BaseIVTy); 1042 } 1043 1044 // We build scalar steps for both integer and floating-point induction 1045 // variables. Here, we determine the kind of arithmetic we will perform. 1046 Instruction::BinaryOps AddOp; 1047 Instruction::BinaryOps MulOp; 1048 if (BaseIVTy->isIntegerTy()) { 1049 AddOp = Instruction::Add; 1050 MulOp = Instruction::Mul; 1051 } else { 1052 AddOp = InductionOpcode; 1053 MulOp = Instruction::FMul; 1054 } 1055 1056 // Determine the number of scalars we need to generate for each unroll 1057 // iteration. 1058 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1059 // Compute the scalar steps and save the results in State. 1060 Type *IntStepTy = 1061 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1062 Type *VecIVTy = nullptr; 1063 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1064 if (!FirstLaneOnly && State.VF.isScalable()) { 1065 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1066 UnitStepVec = 1067 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1068 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1069 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1070 } 1071 1072 unsigned StartPart = 0; 1073 unsigned EndPart = State.UF; 1074 unsigned StartLane = 0; 1075 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1076 if (State.Instance) { 1077 StartPart = State.Instance->Part; 1078 EndPart = StartPart + 1; 1079 StartLane = State.Instance->Lane.getKnownLane(); 1080 EndLane = StartLane + 1; 1081 } 1082 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1083 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1084 1085 if (!FirstLaneOnly && State.VF.isScalable()) { 1086 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1087 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1088 if (BaseIVTy->isFloatingPointTy()) 1089 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1090 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1091 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1092 State.set(this, Add, Part); 1093 // It's useful to record the lane values too for the known minimum number 1094 // of elements so we do those below. This improves the code quality when 1095 // trying to extract the first element, for example. 1096 } 1097 1098 if (BaseIVTy->isFloatingPointTy()) 1099 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1100 1101 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1102 Value *StartIdx = Builder.CreateBinOp( 1103 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1104 // The step returned by `createStepForVF` is a runtime-evaluated value 1105 // when VF is scalable. Otherwise, it should be folded into a Constant. 1106 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1107 "Expected StartIdx to be folded to a constant when VF is not " 1108 "scalable"); 1109 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1110 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1111 State.set(this, Add, VPIteration(Part, Lane)); 1112 } 1113 } 1114 } 1115 1116 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1117 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1118 VPSlotTracker &SlotTracker) const { 1119 O << Indent; 1120 printAsOperand(O, SlotTracker); 1121 O << " = SCALAR-STEPS "; 1122 printOperands(O, SlotTracker); 1123 } 1124 #endif 1125 1126 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1127 assert(State.VF.isVector() && "not widening"); 1128 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1129 // Construct a vector GEP by widening the operands of the scalar GEP as 1130 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1131 // results in a vector of pointers when at least one operand of the GEP 1132 // is vector-typed. Thus, to keep the representation compact, we only use 1133 // vector-typed operands for loop-varying values. 1134 1135 if (areAllOperandsInvariant()) { 1136 // If we are vectorizing, but the GEP has only loop-invariant operands, 1137 // the GEP we build (by only using vector-typed operands for 1138 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1139 // produce a vector of pointers, we need to either arbitrarily pick an 1140 // operand to broadcast, or broadcast a clone of the original GEP. 1141 // Here, we broadcast a clone of the original. 1142 // 1143 // TODO: If at some point we decide to scalarize instructions having 1144 // loop-invariant operands, this special case will no longer be 1145 // required. We would add the scalarization decision to 1146 // collectLoopScalars() and teach getVectorValue() to broadcast 1147 // the lane-zero scalar value. 1148 SmallVector<Value *> Ops; 1149 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1150 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1151 1152 auto *NewGEP = 1153 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1154 ArrayRef(Ops).drop_front(), "", isInBounds()); 1155 for (unsigned Part = 0; Part < State.UF; ++Part) { 1156 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1157 State.set(this, EntryPart, Part); 1158 State.addMetadata(EntryPart, GEP); 1159 } 1160 } else { 1161 // If the GEP has at least one loop-varying operand, we are sure to 1162 // produce a vector of pointers. But if we are only unrolling, we want 1163 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1164 // produce with the code below will be scalar (if VF == 1) or vector 1165 // (otherwise). Note that for the unroll-only case, we still maintain 1166 // values in the vector mapping with initVector, as we do for other 1167 // instructions. 1168 for (unsigned Part = 0; Part < State.UF; ++Part) { 1169 // The pointer operand of the new GEP. If it's loop-invariant, we 1170 // won't broadcast it. 1171 auto *Ptr = isPointerLoopInvariant() 1172 ? State.get(getOperand(0), VPIteration(0, 0)) 1173 : State.get(getOperand(0), Part); 1174 1175 // Collect all the indices for the new GEP. If any index is 1176 // loop-invariant, we won't broadcast it. 1177 SmallVector<Value *, 4> Indices; 1178 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1179 VPValue *Operand = getOperand(I); 1180 if (isIndexLoopInvariant(I - 1)) 1181 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1182 else 1183 Indices.push_back(State.get(Operand, Part)); 1184 } 1185 1186 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1187 // but it should be a vector, otherwise. 1188 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1189 Indices, "", isInBounds()); 1190 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1191 "NewGEP is not a pointer vector"); 1192 State.set(this, NewGEP, Part); 1193 State.addMetadata(NewGEP, GEP); 1194 } 1195 } 1196 } 1197 1198 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1199 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1200 VPSlotTracker &SlotTracker) const { 1201 O << Indent << "WIDEN-GEP "; 1202 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1203 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1204 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 1205 1206 O << " "; 1207 printAsOperand(O, SlotTracker); 1208 O << " = getelementptr"; 1209 printFlags(O); 1210 printOperands(O, SlotTracker); 1211 } 1212 #endif 1213 1214 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1215 auto &Builder = State.Builder; 1216 State.setDebugLocFrom(getDebugLoc()); 1217 for (unsigned Part = 0; Part < State.UF; ++Part) { 1218 // Calculate the pointer for the specific unroll-part. 1219 Value *PartPtr = nullptr; 1220 // Use i32 for the gep index type when the value is constant, 1221 // or query DataLayout for a more suitable index type otherwise. 1222 const DataLayout &DL = 1223 Builder.GetInsertBlock()->getModule()->getDataLayout(); 1224 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1225 ? DL.getIndexType(IndexedTy->getPointerTo()) 1226 : Builder.getInt32Ty(); 1227 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1228 bool InBounds = false; 1229 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 1230 InBounds = GEP->isInBounds(); 1231 if (IsReverse) { 1232 // If the address is consecutive but reversed, then the 1233 // wide store needs to start at the last vector element. 1234 // RunTimeVF = VScale * VF.getKnownMinValue() 1235 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1236 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1237 // NumElt = -Part * RunTimeVF 1238 Value *NumElt = Builder.CreateMul( 1239 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1240 // LastLane = 1 - RunTimeVF 1241 Value *LastLane = 1242 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1243 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1244 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1245 } else { 1246 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1247 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1248 } 1249 1250 State.set(this, PartPtr, Part); 1251 } 1252 } 1253 1254 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1255 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1256 VPSlotTracker &SlotTracker) const { 1257 O << Indent; 1258 printAsOperand(O, SlotTracker); 1259 O << " = vector-pointer "; 1260 if (IsReverse) 1261 O << "(reverse) "; 1262 1263 printOperands(O, SlotTracker); 1264 } 1265 #endif 1266 1267 void VPBlendRecipe::execute(VPTransformState &State) { 1268 State.setDebugLocFrom(getDebugLoc()); 1269 // We know that all PHIs in non-header blocks are converted into 1270 // selects, so we don't have to worry about the insertion order and we 1271 // can just use the builder. 1272 // At this point we generate the predication tree. There may be 1273 // duplications since this is a simple recursive scan, but future 1274 // optimizations will clean it up. 1275 1276 unsigned NumIncoming = getNumIncomingValues(); 1277 1278 // Generate a sequence of selects of the form: 1279 // SELECT(Mask3, In3, 1280 // SELECT(Mask2, In2, 1281 // SELECT(Mask1, In1, 1282 // In0))) 1283 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1284 // are essentially undef are taken from In0. 1285 VectorParts Entry(State.UF); 1286 for (unsigned In = 0; In < NumIncoming; ++In) { 1287 for (unsigned Part = 0; Part < State.UF; ++Part) { 1288 // We might have single edge PHIs (blocks) - use an identity 1289 // 'select' for the first PHI operand. 1290 Value *In0 = State.get(getIncomingValue(In), Part); 1291 if (In == 0) 1292 Entry[Part] = In0; // Initialize with the first incoming value. 1293 else { 1294 // Select between the current value and the previous incoming edge 1295 // based on the incoming mask. 1296 Value *Cond = State.get(getMask(In), Part); 1297 Entry[Part] = 1298 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 1299 } 1300 } 1301 } 1302 for (unsigned Part = 0; Part < State.UF; ++Part) 1303 State.set(this, Entry[Part], Part); 1304 } 1305 1306 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1307 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1308 VPSlotTracker &SlotTracker) const { 1309 O << Indent << "BLEND "; 1310 printAsOperand(O, SlotTracker); 1311 O << " ="; 1312 if (getNumIncomingValues() == 1) { 1313 // Not a User of any mask: not really blending, this is a 1314 // single-predecessor phi. 1315 O << " "; 1316 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1317 } else { 1318 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1319 O << " "; 1320 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1321 O << "/"; 1322 getMask(I)->printAsOperand(O, SlotTracker); 1323 } 1324 } 1325 } 1326 1327 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 1328 VPSlotTracker &SlotTracker) const { 1329 O << Indent << "REDUCE "; 1330 printAsOperand(O, SlotTracker); 1331 O << " = "; 1332 getChainOp()->printAsOperand(O, SlotTracker); 1333 O << " +"; 1334 if (isa<FPMathOperator>(getUnderlyingInstr())) 1335 O << getUnderlyingInstr()->getFastMathFlags(); 1336 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1337 getVecOp()->printAsOperand(O, SlotTracker); 1338 if (getCondOp()) { 1339 O << ", "; 1340 getCondOp()->printAsOperand(O, SlotTracker); 1341 } 1342 O << ")"; 1343 if (RdxDesc.IntermediateStore) 1344 O << " (with final reduction value stored in invariant address sank " 1345 "outside of loop)"; 1346 } 1347 #endif 1348 1349 bool VPReplicateRecipe::shouldPack() const { 1350 // Find if the recipe is used by a widened recipe via an intervening 1351 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 1352 return any_of(users(), [](const VPUser *U) { 1353 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 1354 return any_of(PredR->users(), [PredR](const VPUser *U) { 1355 return !U->usesScalars(PredR); 1356 }); 1357 return false; 1358 }); 1359 } 1360 1361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1362 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 1363 VPSlotTracker &SlotTracker) const { 1364 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 1365 1366 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1367 printAsOperand(O, SlotTracker); 1368 O << " = "; 1369 } 1370 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1371 O << "call"; 1372 printFlags(O); 1373 O << "@" << CB->getCalledFunction()->getName() << "("; 1374 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1375 O, [&O, &SlotTracker](VPValue *Op) { 1376 Op->printAsOperand(O, SlotTracker); 1377 }); 1378 O << ")"; 1379 } else { 1380 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1381 printFlags(O); 1382 printOperands(O, SlotTracker); 1383 } 1384 1385 if (shouldPack()) 1386 O << " (S->V)"; 1387 } 1388 #endif 1389 1390 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1391 assert(State.Instance && "Branch on Mask works only on single instance."); 1392 1393 unsigned Part = State.Instance->Part; 1394 unsigned Lane = State.Instance->Lane.getKnownLane(); 1395 1396 Value *ConditionBit = nullptr; 1397 VPValue *BlockInMask = getMask(); 1398 if (BlockInMask) { 1399 ConditionBit = State.get(BlockInMask, Part); 1400 if (ConditionBit->getType()->isVectorTy()) 1401 ConditionBit = State.Builder.CreateExtractElement( 1402 ConditionBit, State.Builder.getInt32(Lane)); 1403 } else // Block in mask is all-one. 1404 ConditionBit = State.Builder.getTrue(); 1405 1406 // Replace the temporary unreachable terminator with a new conditional branch, 1407 // whose two destinations will be set later when they are created. 1408 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 1409 assert(isa<UnreachableInst>(CurrentTerminator) && 1410 "Expected to replace unreachable terminator with conditional branch."); 1411 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 1412 CondBr->setSuccessor(0, nullptr); 1413 ReplaceInstWithInst(CurrentTerminator, CondBr); 1414 } 1415 1416 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 1417 assert(State.Instance && "Predicated instruction PHI works per instance."); 1418 Instruction *ScalarPredInst = 1419 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 1420 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 1421 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 1422 assert(PredicatingBB && "Predicated block has no single predecessor."); 1423 assert(isa<VPReplicateRecipe>(getOperand(0)) && 1424 "operand must be VPReplicateRecipe"); 1425 1426 // By current pack/unpack logic we need to generate only a single phi node: if 1427 // a vector value for the predicated instruction exists at this point it means 1428 // the instruction has vector users only, and a phi for the vector value is 1429 // needed. In this case the recipe of the predicated instruction is marked to 1430 // also do that packing, thereby "hoisting" the insert-element sequence. 1431 // Otherwise, a phi node for the scalar value is needed. 1432 unsigned Part = State.Instance->Part; 1433 if (State.hasVectorValue(getOperand(0), Part)) { 1434 Value *VectorValue = State.get(getOperand(0), Part); 1435 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 1436 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 1437 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 1438 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 1439 if (State.hasVectorValue(this, Part)) 1440 State.reset(this, VPhi, Part); 1441 else 1442 State.set(this, VPhi, Part); 1443 // NOTE: Currently we need to update the value of the operand, so the next 1444 // predicated iteration inserts its generated value in the correct vector. 1445 State.reset(getOperand(0), VPhi, Part); 1446 } else { 1447 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 1448 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 1449 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 1450 PredicatingBB); 1451 Phi->addIncoming(ScalarPredInst, PredicatedBB); 1452 if (State.hasScalarValue(this, *State.Instance)) 1453 State.reset(this, Phi, *State.Instance); 1454 else 1455 State.set(this, Phi, *State.Instance); 1456 // NOTE: Currently we need to update the value of the operand, so the next 1457 // predicated iteration inserts its generated value in the correct vector. 1458 State.reset(getOperand(0), Phi, *State.Instance); 1459 } 1460 } 1461 1462 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1463 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 1464 VPSlotTracker &SlotTracker) const { 1465 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 1466 printAsOperand(O, SlotTracker); 1467 O << " = "; 1468 printOperands(O, SlotTracker); 1469 } 1470 1471 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, 1472 VPSlotTracker &SlotTracker) const { 1473 O << Indent << "WIDEN "; 1474 1475 if (!isStore()) { 1476 getVPSingleValue()->printAsOperand(O, SlotTracker); 1477 O << " = "; 1478 } 1479 O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; 1480 1481 printOperands(O, SlotTracker); 1482 } 1483 #endif 1484 1485 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 1486 Value *Start = getStartValue()->getLiveInIRValue(); 1487 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); 1488 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1489 1490 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1491 EntryPart->addIncoming(Start, VectorPH); 1492 EntryPart->setDebugLoc(getDebugLoc()); 1493 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 1494 State.set(this, EntryPart, Part); 1495 } 1496 1497 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1498 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 1499 VPSlotTracker &SlotTracker) const { 1500 O << Indent << "EMIT "; 1501 printAsOperand(O, SlotTracker); 1502 O << " = CANONICAL-INDUCTION "; 1503 printOperands(O, SlotTracker); 1504 } 1505 #endif 1506 1507 bool VPCanonicalIVPHIRecipe::isCanonical( 1508 InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step, 1509 Type *Ty) const { 1510 // The types must match and it must be an integer induction. 1511 if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction) 1512 return false; 1513 // Start must match the start value of this canonical induction. 1514 if (Start != getStartValue()) 1515 return false; 1516 1517 // If the step is defined by a recipe, it is not a ConstantInt. 1518 if (Step->getDefiningRecipe()) 1519 return false; 1520 1521 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 1522 return StepC && StepC->isOne(); 1523 } 1524 1525 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) { 1526 return IsScalarAfterVectorization && 1527 (!VF.isScalable() || vputils::onlyFirstLaneUsed(this)); 1528 } 1529 1530 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1531 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1532 VPSlotTracker &SlotTracker) const { 1533 O << Indent << "EMIT "; 1534 printAsOperand(O, SlotTracker); 1535 O << " = WIDEN-POINTER-INDUCTION "; 1536 getStartValue()->printAsOperand(O, SlotTracker); 1537 O << ", " << *IndDesc.getStep(); 1538 } 1539 #endif 1540 1541 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 1542 assert(!State.Instance && "cannot be used in per-lane"); 1543 const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); 1544 SCEVExpander Exp(SE, DL, "induction"); 1545 1546 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 1547 &*State.Builder.GetInsertPoint()); 1548 assert(!State.ExpandedSCEVs.contains(Expr) && 1549 "Same SCEV expanded multiple times"); 1550 State.ExpandedSCEVs[Expr] = Res; 1551 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 1552 State.set(this, Res, {Part, 0}); 1553 } 1554 1555 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1556 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 1557 VPSlotTracker &SlotTracker) const { 1558 O << Indent << "EMIT "; 1559 getVPSingleValue()->printAsOperand(O, SlotTracker); 1560 O << " = EXPAND SCEV " << *Expr; 1561 } 1562 #endif 1563 1564 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 1565 Value *CanonicalIV = State.get(getOperand(0), 0); 1566 Type *STy = CanonicalIV->getType(); 1567 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 1568 ElementCount VF = State.VF; 1569 Value *VStart = VF.isScalar() 1570 ? CanonicalIV 1571 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 1572 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 1573 Value *VStep = createStepForVF(Builder, STy, VF, Part); 1574 if (VF.isVector()) { 1575 VStep = Builder.CreateVectorSplat(VF, VStep); 1576 VStep = 1577 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 1578 } 1579 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 1580 State.set(this, CanonicalVectorIV, Part); 1581 } 1582 } 1583 1584 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1585 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 1586 VPSlotTracker &SlotTracker) const { 1587 O << Indent << "EMIT "; 1588 printAsOperand(O, SlotTracker); 1589 O << " = WIDEN-CANONICAL-INDUCTION "; 1590 printOperands(O, SlotTracker); 1591 } 1592 #endif 1593 1594 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 1595 auto &Builder = State.Builder; 1596 // Create a vector from the initial value. 1597 auto *VectorInit = getStartValue()->getLiveInIRValue(); 1598 1599 Type *VecTy = State.VF.isScalar() 1600 ? VectorInit->getType() 1601 : VectorType::get(VectorInit->getType(), State.VF); 1602 1603 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1604 if (State.VF.isVector()) { 1605 auto *IdxTy = Builder.getInt32Ty(); 1606 auto *One = ConstantInt::get(IdxTy, 1); 1607 IRBuilder<>::InsertPointGuard Guard(Builder); 1608 Builder.SetInsertPoint(VectorPH->getTerminator()); 1609 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 1610 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 1611 VectorInit = Builder.CreateInsertElement( 1612 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 1613 } 1614 1615 // Create a phi node for the new recurrence. 1616 PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); 1617 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1618 EntryPart->addIncoming(VectorInit, VectorPH); 1619 State.set(this, EntryPart, 0); 1620 } 1621 1622 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1623 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 1624 VPSlotTracker &SlotTracker) const { 1625 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 1626 printAsOperand(O, SlotTracker); 1627 O << " = phi "; 1628 printOperands(O, SlotTracker); 1629 } 1630 #endif 1631 1632 void VPReductionPHIRecipe::execute(VPTransformState &State) { 1633 PHINode *PN = cast<PHINode>(getUnderlyingValue()); 1634 auto &Builder = State.Builder; 1635 1636 // In order to support recurrences we need to be able to vectorize Phi nodes. 1637 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 1638 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 1639 // this value when we vectorize all of the instructions that use the PHI. 1640 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 1641 Type *VecTy = 1642 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); 1643 1644 BasicBlock *HeaderBB = State.CFG.PrevBB; 1645 assert(State.CurrentVectorLoop->getHeader() == HeaderBB && 1646 "recipe must be in the vector loop header"); 1647 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; 1648 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { 1649 Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); 1650 EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); 1651 State.set(this, EntryPart, Part); 1652 } 1653 1654 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1655 1656 // Reductions do not have to start at zero. They can start with 1657 // any loop invariant values. 1658 VPValue *StartVPV = getStartValue(); 1659 Value *StartV = StartVPV->getLiveInIRValue(); 1660 1661 Value *Iden = nullptr; 1662 RecurKind RK = RdxDesc.getRecurrenceKind(); 1663 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 1664 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 1665 // MinMax and AnyOf reductions have the start value as their identity. 1666 if (ScalarPHI) { 1667 Iden = StartV; 1668 } else { 1669 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 1670 Builder.SetInsertPoint(VectorPH->getTerminator()); 1671 StartV = Iden = 1672 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 1673 } 1674 } else { 1675 Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), 1676 RdxDesc.getFastMathFlags()); 1677 1678 if (!ScalarPHI) { 1679 Iden = Builder.CreateVectorSplat(State.VF, Iden); 1680 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 1681 Builder.SetInsertPoint(VectorPH->getTerminator()); 1682 Constant *Zero = Builder.getInt32(0); 1683 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 1684 } 1685 } 1686 1687 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { 1688 Value *EntryPart = State.get(this, Part); 1689 // Make sure to add the reduction start value only to the 1690 // first unroll part. 1691 Value *StartVal = (Part == 0) ? StartV : Iden; 1692 cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH); 1693 } 1694 } 1695 1696 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1697 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, 1698 VPSlotTracker &SlotTracker) const { 1699 O << Indent << "WIDEN-REDUCTION-PHI "; 1700 1701 printAsOperand(O, SlotTracker); 1702 O << " = phi "; 1703 printOperands(O, SlotTracker); 1704 } 1705 #endif 1706 1707 void VPWidenPHIRecipe::execute(VPTransformState &State) { 1708 assert(EnableVPlanNativePath && 1709 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 1710 1711 Value *Op0 = State.get(getOperand(0), 0); 1712 Type *VecTy = Op0->getType(); 1713 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); 1714 State.set(this, VecPhi, 0); 1715 } 1716 1717 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1718 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 1719 VPSlotTracker &SlotTracker) const { 1720 O << Indent << "WIDEN-PHI "; 1721 1722 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); 1723 // Unless all incoming values are modeled in VPlan print the original PHI 1724 // directly. 1725 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming 1726 // values as VPValues. 1727 if (getNumOperands() != OriginalPhi->getNumOperands()) { 1728 O << VPlanIngredient(OriginalPhi); 1729 return; 1730 } 1731 1732 printAsOperand(O, SlotTracker); 1733 O << " = phi "; 1734 printOperands(O, SlotTracker); 1735 } 1736 #endif 1737 1738 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and 1739 // remove VPActiveLaneMaskPHIRecipe. 1740 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { 1741 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1742 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 1743 Value *StartMask = State.get(getOperand(0), Part); 1744 PHINode *EntryPart = 1745 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); 1746 EntryPart->addIncoming(StartMask, VectorPH); 1747 EntryPart->setDebugLoc(getDebugLoc()); 1748 State.set(this, EntryPart, Part); 1749 } 1750 } 1751 1752 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1753 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, 1754 VPSlotTracker &SlotTracker) const { 1755 O << Indent << "ACTIVE-LANE-MASK-PHI "; 1756 1757 printAsOperand(O, SlotTracker); 1758 O << " = phi "; 1759 printOperands(O, SlotTracker); 1760 } 1761 #endif 1762