//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;

  switch (Opcode) {
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0, OnlyFirstLaneUsed);

    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
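  // The ActiveLaneMask case below materializes the mask with the
  // llvm.get.active.lane.mask intrinsic. Illustrative sketch only (names and
  // widths assumed): with a fixed VF of 4 and an i64 IV/trip count it emits
  // roughly
  //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %tc)
  // where lane L is active iff %iv + L < %tc.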
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
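  // The ExplicitVectorLength case below requests the effective vector length
  // from llvm.experimental.get.vector.length. Illustrative sketch only
  // (values assumed): for a scalable VF with known minimum 2 and an i64 AVL,
  //   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl,
  //                                                            i32 2, i1 true)
  // where i32 2 is the minimum VF and i1 true marks the VF as scalable.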
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL.
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(),
          Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    assert(!State.EVL && "multiple EVL recipes");
    State.EVL = this;
    return EVL;
  }
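  // The CanonicalIVIncrementForPart case below offsets the canonical IV by
  // VF * Part. Worked instance (values assumed): with a fixed VF of 4 and
  // Part == 1 it emits roughly
  //   %part.iv = add i64 %iv, 4
  // while Part == 0 reuses %iv unchanged.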
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
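  // The ComputeReductionResult case below first combines the per-part vectors
  // and then reduces the result to a scalar. Illustrative sketch only (names
  // assumed): for an integer add reduction with UF == 2 and fixed VF == 4,
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  //   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)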
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) ||
       getOpcode() == VPInstruction::ComputeReductionResult);
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}
bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif
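// VPWidenCallRecipe::execute below replaces a scalar call either with a call
// to a vector intrinsic or with a call to a vectorized library variant.
// Illustrative sketch only (example call assumed): with VF == 4, a scalar
// call float @llvm.sin.f32(float %x) would become roughly
//   %v = call <4 x float> @llvm.sin.v4f32(<4 x float> %vx)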
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif
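// VPWidenRecipe::execute below widens a scalar unary or binary operation,
// once per unroll part. Minimal sketch (values assumed): with VF == 4 and
// UF == 2, a scalar add i32 %a, %b yields
//   %v0 = add <4 x i32> %a.0, %b.0
//   %v1 = add <4 x i32> %a.1, %b.1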
void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
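// VPWidenCastRecipe::execute below applies the scalar cast opcode to whole
// vectors. Illustrative sketch only (types assumed): with VF == 4, a scalar
// zext i8 %x to i32 becomes
//   %v = zext <4 x i8> %vx to <4 x i32>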
void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at \p StartIdx.
/// \p Opcode is relevant for FP induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx.
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating-point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
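// Worked instance of getStepVector (values assumed): for a fixed VF of 4,
// an integer Val, StartIdx == 0 and Step == %s, the result is
//   Val + <0, 1, 2, 3> * splat(%s)
// i.e. lane L holds Val[L] + L * %s.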
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it
  //        doesn't handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}
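// The code above produces a vector IV of the following shape (illustrative
// sketch only; fixed VF == 4, UF == 2, i32 IV with constant step 1, names
// assumed):
//   %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                            [ %vec.ind.next, %vector.body ]
//   %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>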
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as a live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps, Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}
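// In the per-lane path above, lane L of part P receives
// BaseIV + (P * VF + L) * Step. Worked instance (values assumed): with a
// fixed VF of 4, Part == 1 and Lane == 2 yield BaseIV + 6 * Step.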
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
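// VPVectorPointerRecipe::execute below computes the per-part pointer. Worked
// instance of the reverse case (values assumed): with a fixed VF of 4 and
// Part == 1, NumElt == -4 and LastLane == -3, so the part pointer ends up at
// Ptr - 7, the lowest-addressed element of the reversed second part.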
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}
"Inv" : "Var") << "]"; 1404 1405 O << " "; 1406 printAsOperand(O, SlotTracker); 1407 O << " = getelementptr"; 1408 printFlags(O); 1409 printOperands(O, SlotTracker); 1410 } 1411 #endif 1412 1413 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1414 auto &Builder = State.Builder; 1415 State.setDebugLocFrom(getDebugLoc()); 1416 for (unsigned Part = 0; Part < State.UF; ++Part) { 1417 // Calculate the pointer for the specific unroll-part. 1418 Value *PartPtr = nullptr; 1419 // Use i32 for the gep index type when the value is constant, 1420 // or query DataLayout for a more suitable index type otherwise. 1421 const DataLayout &DL = 1422 Builder.GetInsertBlock()->getModule()->getDataLayout(); 1423 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1424 ? DL.getIndexType(IndexedTy->getPointerTo()) 1425 : Builder.getInt32Ty(); 1426 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1427 bool InBounds = isInBounds(); 1428 if (IsReverse) { 1429 // If the address is consecutive but reversed, then the 1430 // wide store needs to start at the last vector element. 1431 // RunTimeVF = VScale * VF.getKnownMinValue() 1432 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1433 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1434 // NumElt = -Part * RunTimeVF 1435 Value *NumElt = Builder.CreateMul( 1436 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1437 // LastLane = 1 - RunTimeVF 1438 Value *LastLane = 1439 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1440 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1441 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1442 } else { 1443 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1444 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1445 } 1446 1447 State.set(this, PartPtr, Part, /*IsScalar*/ true); 1448 } 1449 } 1450 1451 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1452 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1453 VPSlotTracker &SlotTracker) const { 1454 O << Indent; 1455 printAsOperand(O, SlotTracker); 1456 O << " = vector-pointer "; 1457 if (IsReverse) 1458 O << "(reverse) "; 1459 1460 printOperands(O, SlotTracker); 1461 } 1462 #endif 1463 1464 void VPBlendRecipe::execute(VPTransformState &State) { 1465 State.setDebugLocFrom(getDebugLoc()); 1466 // We know that all PHIs in non-header blocks are converted into 1467 // selects, so we don't have to worry about the insertion order and we 1468 // can just use the builder. 1469 // At this point we generate the predication tree. There may be 1470 // duplications since this is a simple recursive scan, but future 1471 // optimizations will clean it up. 1472 1473 unsigned NumIncoming = getNumIncomingValues(); 1474 1475 // Generate a sequence of selects of the form: 1476 // SELECT(Mask3, In3, 1477 // SELECT(Mask2, In2, 1478 // SELECT(Mask1, In1, 1479 // In0))) 1480 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1481 // are essentially undef are taken from In0. 1482 VectorParts Entry(State.UF); 1483 for (unsigned In = 0; In < NumIncoming; ++In) { 1484 for (unsigned Part = 0; Part < State.UF; ++Part) { 1485 // We might have single edge PHIs (blocks) - use an identity 1486 // 'select' for the first PHI operand. 1487 Value *In0 = State.get(getIncomingValue(In), Part); 1488 if (In == 0) 1489 Entry[Part] = In0; // Initialize with the first incoming value. 
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sunk "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}
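
// For illustration, the debug output produced below might look roughly like
// (names hypothetical):
//   REPLICATE ir<%call> = call @foo(ir<%a>, ir<%b>)
//   CLONE ir<%gep> = getelementptr ir<%base>, vp<%iv>
// with a trailing " (S->V)" when the scalar results also need to be packed
// into a vector.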
"CLONE " : "REPLICATE "); 1615 1616 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1617 printAsOperand(O, SlotTracker); 1618 O << " = "; 1619 } 1620 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1621 O << "call"; 1622 printFlags(O); 1623 O << "@" << CB->getCalledFunction()->getName() << "("; 1624 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1625 O, [&O, &SlotTracker](VPValue *Op) { 1626 Op->printAsOperand(O, SlotTracker); 1627 }); 1628 O << ")"; 1629 } else { 1630 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1631 printFlags(O); 1632 printOperands(O, SlotTracker); 1633 } 1634 1635 if (shouldPack()) 1636 O << " (S->V)"; 1637 } 1638 #endif 1639 1640 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 1641 /// if it is either defined outside the vector region or its operand is known to 1642 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 1643 /// TODO: Uniformity should be associated with a VPValue and there should be a 1644 /// generic way to check. 1645 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { 1646 return C->isDefinedOutsideVectorRegions() || 1647 isa<VPDerivedIVRecipe>(C->getOperand(0)) || 1648 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)); 1649 } 1650 1651 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { 1652 assert(vputils::onlyFirstLaneUsed(this) && 1653 "Codegen only implemented for first lane."); 1654 switch (Opcode) { 1655 case Instruction::SExt: 1656 case Instruction::ZExt: 1657 case Instruction::Trunc: { 1658 // Note: SExt/ZExt not used yet. 1659 Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); 1660 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 1661 } 1662 default: 1663 llvm_unreachable("opcode not implemented yet"); 1664 } 1665 } 1666 1667 void VPScalarCastRecipe ::execute(VPTransformState &State) { 1668 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); 1669 for (unsigned Part = 0; Part != State.UF; ++Part) { 1670 Value *Res; 1671 // Only generate a single instance, if the recipe is uniform across UFs and 1672 // VFs. 1673 if (Part > 0 && IsUniformAcrossVFsAndUFs) 1674 Res = State.get(this, VPIteration(0, 0)); 1675 else 1676 Res = generate(State, Part); 1677 State.set(this, Res, VPIteration(Part, 0)); 1678 } 1679 } 1680 1681 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1682 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 1683 VPSlotTracker &SlotTracker) const { 1684 O << Indent << "SCALAR-CAST "; 1685 printAsOperand(O, SlotTracker); 1686 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1687 printOperands(O, SlotTracker); 1688 O << " to " << *ResultTy; 1689 } 1690 #endif 1691 1692 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1693 assert(State.Instance && "Branch on Mask works only on single instance."); 1694 1695 unsigned Part = State.Instance->Part; 1696 unsigned Lane = State.Instance->Lane.getKnownLane(); 1697 1698 Value *ConditionBit = nullptr; 1699 VPValue *BlockInMask = getMask(); 1700 if (BlockInMask) { 1701 ConditionBit = State.get(BlockInMask, Part); 1702 if (ConditionBit->getType()->isVectorTy()) 1703 ConditionBit = State.Builder.CreateExtractElement( 1704 ConditionBit, State.Builder.getInt32(Lane)); 1705 } else // Block in mask is all-one. 
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif
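
// For illustration, assuming VF=4 and an i32 initial value %init, the
// recurrence phi below starts from a vector that carries the initial scalar
// in its last lane:
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3
// For scalable vectors the insert position is computed at runtime as
// RuntimeVF - 1.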
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
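
// For illustration, assuming an unordered i32 add reduction at VF=4 with
// start value %start, part 0 of the phi created below starts from
//   <i32 %start, i32 0, i32 0, i32 0>
// (the identity splat with the start value inserted at lane 0), while all
// other unroll parts start from the plain identity vector.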
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
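// Note that this recipe only creates the phi and adds the incoming value for
// the preheader; the mask itself and the phi's backedge value are expected to
// be produced elsewhere (typically by ActiveLaneMask VPInstructions) and
// wired up once the latch block exists.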
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif