1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "VPlan.h" 15 #include "VPlanAnalysis.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/Twine.h" 19 #include "llvm/Analysis/IVDescriptors.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/IRBuilder.h" 22 #include "llvm/IR/Instruction.h" 23 #include "llvm/IR/Instructions.h" 24 #include "llvm/IR/Type.h" 25 #include "llvm/IR/Value.h" 26 #include "llvm/Support/Casting.h" 27 #include "llvm/Support/CommandLine.h" 28 #include "llvm/Support/Debug.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 31 #include "llvm/Transforms/Utils/LoopUtils.h" 32 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 33 #include <cassert> 34 35 using namespace llvm; 36 37 using VectorParts = SmallVector<Value *, 2>; 38 39 namespace llvm { 40 extern cl::opt<bool> EnableVPlanNativePath; 41 } 42 43 #define LV_NAME "loop-vectorize" 44 #define DEBUG_TYPE LV_NAME 45 46 bool VPRecipeBase::mayWriteToMemory() const { 47 switch (getVPDefID()) { 48 case VPInterleaveSC: 49 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 50 case VPWidenStoreEVLSC: 51 case VPWidenStoreSC: 52 return true; 53 case VPReplicateSC: 54 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 55 ->mayWriteToMemory(); 56 case VPWidenCallSC: 57 return !cast<VPWidenCallRecipe>(this) 58 ->getCalledScalarFunction() 59 ->onlyReadsMemory(); 60 case VPBranchOnMaskSC: 61 case VPScalarIVStepsSC: 62 case VPPredInstPHISC: 63 return false; 64 case VPBlendSC: 65 case VPReductionSC: 66 case VPWidenCanonicalIVSC: 67 case VPWidenCastSC: 68 case VPWidenGEPSC: 69 case VPWidenIntOrFpInductionSC: 70 case VPWidenLoadEVLSC: 71 case VPWidenLoadSC: 72 case VPWidenPHISC: 73 case VPWidenSC: 74 case VPWidenSelectSC: { 75 const Instruction *I = 76 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 77 (void)I; 78 assert((!I || !I->mayWriteToMemory()) && 79 "underlying instruction may write to memory"); 80 return false; 81 } 82 default: 83 return true; 84 } 85 } 86 87 bool VPRecipeBase::mayReadFromMemory() const { 88 switch (getVPDefID()) { 89 case VPWidenLoadEVLSC: 90 case VPWidenLoadSC: 91 return true; 92 case VPReplicateSC: 93 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 94 ->mayReadFromMemory(); 95 case VPWidenCallSC: 96 return !cast<VPWidenCallRecipe>(this) 97 ->getCalledScalarFunction() 98 ->onlyWritesMemory(); 99 case VPBranchOnMaskSC: 100 case VPPredInstPHISC: 101 case VPScalarIVStepsSC: 102 case VPWidenStoreEVLSC: 103 case VPWidenStoreSC: 104 return false; 105 case VPBlendSC: 106 case VPReductionSC: 107 case VPWidenCanonicalIVSC: 108 case VPWidenCastSC: 109 case VPWidenGEPSC: 110 case VPWidenIntOrFpInductionSC: 111 case VPWidenPHISC: 112 case VPWidenSC: 113 case VPWidenSelectSC: { 114 const Instruction *I = 115 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 116 (void)I; 117 assert((!I || 
!I->mayReadFromMemory()) && 118 "underlying instruction may read from memory"); 119 return false; 120 } 121 default: 122 return true; 123 } 124 } 125 126 bool VPRecipeBase::mayHaveSideEffects() const { 127 switch (getVPDefID()) { 128 case VPDerivedIVSC: 129 case VPPredInstPHISC: 130 case VPScalarCastSC: 131 return false; 132 case VPInstructionSC: 133 switch (cast<VPInstruction>(this)->getOpcode()) { 134 case Instruction::Or: 135 case Instruction::ICmp: 136 case Instruction::Select: 137 case VPInstruction::Not: 138 case VPInstruction::CalculateTripCountMinusVF: 139 case VPInstruction::CanonicalIVIncrementForPart: 140 case VPInstruction::ExtractFromEnd: 141 case VPInstruction::FirstOrderRecurrenceSplice: 142 case VPInstruction::LogicalAnd: 143 case VPInstruction::PtrAdd: 144 return false; 145 default: 146 return true; 147 } 148 case VPWidenCallSC: { 149 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); 150 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); 151 } 152 case VPBlendSC: 153 case VPReductionSC: 154 case VPScalarIVStepsSC: 155 case VPWidenCanonicalIVSC: 156 case VPWidenCastSC: 157 case VPWidenGEPSC: 158 case VPWidenIntOrFpInductionSC: 159 case VPWidenPHISC: 160 case VPWidenPointerInductionSC: 161 case VPWidenSC: 162 case VPWidenSelectSC: { 163 const Instruction *I = 164 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 165 (void)I; 166 assert((!I || !I->mayHaveSideEffects()) && 167 "underlying instruction has side-effects"); 168 return false; 169 } 170 case VPInterleaveSC: 171 return mayWriteToMemory(); 172 case VPWidenLoadEVLSC: 173 case VPWidenLoadSC: 174 case VPWidenStoreEVLSC: 175 case VPWidenStoreSC: 176 assert( 177 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == 178 mayWriteToMemory() && 179 "mayHaveSideffects result for ingredient differs from this " 180 "implementation"); 181 return mayWriteToMemory(); 182 case VPReplicateSC: { 183 auto *R = cast<VPReplicateRecipe>(this); 184 return R->getUnderlyingInstr()->mayHaveSideEffects(); 185 } 186 default: 187 return true; 188 } 189 } 190 191 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { 192 auto Lane = VPLane::getLastLaneForVF(State.VF); 193 VPValue *ExitValue = getOperand(0); 194 if (vputils::isUniformAfterVectorization(ExitValue)) 195 Lane = VPLane::getFirstLane(); 196 VPBasicBlock *MiddleVPBB = 197 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); 198 assert(MiddleVPBB->getNumSuccessors() == 0 && 199 "the middle block must not have any successors"); 200 BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; 201 Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), 202 MiddleBB); 203 } 204 205 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 206 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { 207 O << "Live-out "; 208 getPhi()->printAsOperand(O); 209 O << " = "; 210 getOperand(0)->printAsOperand(O, SlotTracker); 211 O << "\n"; 212 } 213 #endif 214 215 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 216 assert(!Parent && "Recipe already in some VPBasicBlock"); 217 assert(InsertPos->getParent() && 218 "Insertion position not in any VPBasicBlock"); 219 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 220 } 221 222 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 223 iplist<VPRecipeBase>::iterator I) { 224 assert(!Parent && "Recipe already in some VPBasicBlock"); 225 assert(I == BB.end() || I->getParent() == &BB); 226 BB.insert(this, 
I); 227 } 228 229 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 230 assert(!Parent && "Recipe already in some VPBasicBlock"); 231 assert(InsertPos->getParent() && 232 "Insertion position not in any VPBasicBlock"); 233 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 234 } 235 236 void VPRecipeBase::removeFromParent() { 237 assert(getParent() && "Recipe not in any VPBasicBlock"); 238 getParent()->getRecipeList().remove(getIterator()); 239 Parent = nullptr; 240 } 241 242 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 243 assert(getParent() && "Recipe not in any VPBasicBlock"); 244 return getParent()->getRecipeList().erase(getIterator()); 245 } 246 247 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 248 removeFromParent(); 249 insertAfter(InsertPos); 250 } 251 252 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 253 iplist<VPRecipeBase>::iterator I) { 254 removeFromParent(); 255 insertBefore(BB, I); 256 } 257 258 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 259 assert(OpType == OperationType::FPMathOp && 260 "recipe doesn't have fast math flags"); 261 FastMathFlags Res; 262 Res.setAllowReassoc(FMFs.AllowReassoc); 263 Res.setNoNaNs(FMFs.NoNaNs); 264 Res.setNoInfs(FMFs.NoInfs); 265 Res.setNoSignedZeros(FMFs.NoSignedZeros); 266 Res.setAllowReciprocal(FMFs.AllowReciprocal); 267 Res.setAllowContract(FMFs.AllowContract); 268 Res.setApproxFunc(FMFs.ApproxFunc); 269 return Res; 270 } 271 272 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 273 VPValue *A, VPValue *B, DebugLoc DL, 274 const Twine &Name) 275 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 276 Pred, DL), 277 Opcode(Opcode), Name(Name.str()) { 278 assert(Opcode == Instruction::ICmp && 279 "only ICmp predicates supported at the moment"); 280 } 281 282 VPInstruction::VPInstruction(unsigned Opcode, 283 std::initializer_list<VPValue *> Operands, 284 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 285 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 286 Opcode(Opcode), Name(Name.str()) { 287 // Make sure the VPInstruction is a floating-point operation. 
288 assert(isFPMathOp() && "this op can't take fast-math flags"); 289 } 290 291 bool VPInstruction::doesGeneratePerAllLanes() const { 292 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 293 } 294 295 bool VPInstruction::canGenerateScalarForFirstLane() const { 296 if (Instruction::isBinaryOp(getOpcode())) 297 return true; 298 if (isVectorToScalar()) 299 return true; 300 switch (Opcode) { 301 case VPInstruction::BranchOnCond: 302 case VPInstruction::BranchOnCount: 303 case VPInstruction::CalculateTripCountMinusVF: 304 case VPInstruction::CanonicalIVIncrementForPart: 305 case VPInstruction::PtrAdd: 306 case VPInstruction::ExplicitVectorLength: 307 return true; 308 default: 309 return false; 310 } 311 } 312 313 Value *VPInstruction::generatePerLane(VPTransformState &State, 314 const VPIteration &Lane) { 315 IRBuilderBase &Builder = State.Builder; 316 317 assert(getOpcode() == VPInstruction::PtrAdd && 318 "only PtrAdd opcodes are supported for now"); 319 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 320 State.get(getOperand(1), Lane), Name); 321 } 322 323 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { 324 IRBuilderBase &Builder = State.Builder; 325 326 if (Instruction::isBinaryOp(getOpcode())) { 327 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 328 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); 329 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); 330 auto *Res = 331 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 332 if (auto *I = dyn_cast<Instruction>(Res)) 333 setFlags(I); 334 return Res; 335 } 336 337 switch (getOpcode()) { 338 case VPInstruction::Not: { 339 Value *A = State.get(getOperand(0), Part); 340 return Builder.CreateNot(A, Name); 341 } 342 case Instruction::ICmp: { 343 Value *A = State.get(getOperand(0), Part); 344 Value *B = State.get(getOperand(1), Part); 345 return Builder.CreateCmp(getPredicate(), A, B, Name); 346 } 347 case Instruction::Select: { 348 Value *Cond = State.get(getOperand(0), Part); 349 Value *Op1 = State.get(getOperand(1), Part); 350 Value *Op2 = State.get(getOperand(2), Part); 351 return Builder.CreateSelect(Cond, Op1, Op2, Name); 352 } 353 case VPInstruction::ActiveLaneMask: { 354 // Get first lane of vector induction variable. 355 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); 356 // Get the original loop tripcount. 357 Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); 358 359 // If this part of the active lane mask is scalar, generate the CMP directly 360 // to avoid unnecessary extracts. 361 if (State.VF.isScalar()) 362 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 363 Name); 364 365 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 366 auto *PredTy = VectorType::get(Int1Ty, State.VF); 367 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 368 {PredTy, ScalarTC->getType()}, 369 {VIVElem0, ScalarTC}, nullptr, Name); 370 } 371 case VPInstruction::FirstOrderRecurrenceSplice: { 372 // Generate code to combine the previous and current values in vector v3. 373 // 374 // vector.ph: 375 // v_init = vector(..., ..., ..., a[-1]) 376 // br vector.body 377 // 378 // vector.body 379 // i = phi [0, vector.ph], [i+4, vector.body] 380 // v1 = phi [v_init, vector.ph], [v2, vector.body] 381 // v2 = a[i, i+1, i+2, i+3]; 382 // v3 = vector(v1(3), v2(0, 1, 2)) 383 384 // For the first part, use the recurrence phi (v1), otherwise v2. 
385 auto *V1 = State.get(getOperand(0), 0); 386 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); 387 if (!PartMinus1->getType()->isVectorTy()) 388 return PartMinus1; 389 Value *V2 = State.get(getOperand(1), Part); 390 return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); 391 } 392 case VPInstruction::CalculateTripCountMinusVF: { 393 if (Part != 0) 394 return State.get(this, 0, /*IsScalar*/ true); 395 396 Value *ScalarTC = State.get(getOperand(0), {0, 0}); 397 Value *Step = 398 createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); 399 Value *Sub = Builder.CreateSub(ScalarTC, Step); 400 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 401 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 402 return Builder.CreateSelect(Cmp, Sub, Zero); 403 } 404 case VPInstruction::ExplicitVectorLength: { 405 // Compute EVL 406 auto GetEVL = [=](VPTransformState &State, Value *AVL) { 407 assert(AVL->getType()->isIntegerTy() && 408 "Requested vector length should be an integer."); 409 410 // TODO: Add support for MaxSafeDist for correct loop emission. 411 assert(State.VF.isScalable() && "Expected scalable vector factor."); 412 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 413 414 Value *EVL = State.Builder.CreateIntrinsic( 415 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 416 {AVL, VFArg, State.Builder.getTrue()}); 417 return EVL; 418 }; 419 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 420 // be outside of the main loop. 421 assert(Part == 0 && "No unrolling expected for predicated vectorization."); 422 // Compute VTC - IV as the AVL (requested vector length). 423 Value *Index = State.get(getOperand(0), VPIteration(0, 0)); 424 Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); 425 Value *AVL = State.Builder.CreateSub(TripCount, Index); 426 Value *EVL = GetEVL(State, AVL); 427 return EVL; 428 } 429 case VPInstruction::CanonicalIVIncrementForPart: { 430 auto *IV = State.get(getOperand(0), VPIteration(0, 0)); 431 if (Part == 0) 432 return IV; 433 434 // The canonical IV is incremented by the vectorization factor (num of SIMD 435 // elements) times the unroll part. 436 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 437 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 438 hasNoSignedWrap()); 439 } 440 case VPInstruction::BranchOnCond: { 441 if (Part != 0) 442 return nullptr; 443 444 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); 445 VPRegionBlock *ParentRegion = getParent()->getParent(); 446 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 447 448 // Replace the temporary unreachable terminator with a new conditional 449 // branch, hooking it up to backward destination for exiting blocks now and 450 // to forward destination(s) later when they are created. 451 BranchInst *CondBr = 452 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 453 454 if (getParent()->isExiting()) 455 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 456 457 CondBr->setSuccessor(0, nullptr); 458 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 459 return CondBr; 460 } 461 case VPInstruction::BranchOnCount: { 462 if (Part != 0) 463 return nullptr; 464 // First create the compare. 
465 Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); 466 Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); 467 Value *Cond = Builder.CreateICmpEQ(IV, TC); 468 469 // Now create the branch. 470 auto *Plan = getParent()->getPlan(); 471 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 472 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 473 474 // Replace the temporary unreachable terminator with a new conditional 475 // branch, hooking it up to backward destination (the header) now and to the 476 // forward destination (the exit/middle block) later when it is created. 477 // Note that CreateCondBr expects a valid BB as first argument, so we need 478 // to set it to nullptr later. 479 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 480 State.CFG.VPBB2IRBB[Header]); 481 CondBr->setSuccessor(0, nullptr); 482 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 483 return CondBr; 484 } 485 case VPInstruction::ComputeReductionResult: { 486 if (Part != 0) 487 return State.get(this, 0, /*IsScalar*/ true); 488 489 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 490 // and will be removed by breaking up the recipe further. 491 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 492 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 493 // Get its reduction variable descriptor. 494 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 495 496 RecurKind RK = RdxDesc.getRecurrenceKind(); 497 498 VPValue *LoopExitingDef = getOperand(1); 499 Type *PhiTy = OrigPhi->getType(); 500 VectorParts RdxParts(State.UF); 501 for (unsigned Part = 0; Part < State.UF; ++Part) 502 RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); 503 504 // If the vector reduction can be performed in a smaller type, we truncate 505 // then extend the loop exit value to enable InstCombine to evaluate the 506 // entire expression in the smaller type. 507 // TODO: Handle this in truncateToMinBW. 508 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 509 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 510 for (unsigned Part = 0; Part < State.UF; ++Part) 511 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 512 } 513 // Reduce all of the unrolled parts into a single vector. 514 Value *ReducedPartRdx = RdxParts[0]; 515 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 516 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 517 Op = Instruction::Or; 518 519 if (PhiR->isOrdered()) { 520 ReducedPartRdx = RdxParts[State.UF - 1]; 521 } else { 522 // Floating-point operations should have some FMF to enable the reduction. 523 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 524 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 525 for (unsigned Part = 1; Part < State.UF; ++Part) { 526 Value *RdxPart = RdxParts[Part]; 527 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 528 ReducedPartRdx = Builder.CreateBinOp( 529 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 530 else 531 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 532 } 533 } 534 535 // Create the reduction after the loop. Note that inloop reductions create 536 // the target reduction in the loop using a Reduction recipe. 
537 if ((State.VF.isVector() || 538 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && 539 !PhiR->isInLoop()) { 540 ReducedPartRdx = 541 createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); 542 // If the reduction can be performed in a smaller type, we need to extend 543 // the reduction to the wider type before we branch to the original loop. 544 if (PhiTy != RdxDesc.getRecurrenceType()) 545 ReducedPartRdx = RdxDesc.isSigned() 546 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 547 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 548 } 549 550 // If there were stores of the reduction value to a uniform memory address 551 // inside the loop, create the final store here. 552 if (StoreInst *SI = RdxDesc.IntermediateStore) { 553 auto *NewSI = Builder.CreateAlignedStore( 554 ReducedPartRdx, SI->getPointerOperand(), SI->getAlign()); 555 propagateMetadata(NewSI, SI); 556 } 557 558 return ReducedPartRdx; 559 } 560 case VPInstruction::ExtractFromEnd: { 561 if (Part != 0) 562 return State.get(this, 0, /*IsScalar*/ true); 563 564 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); 565 unsigned Offset = CI->getZExtValue(); 566 assert(Offset > 0 && "Offset from end must be positive"); 567 Value *Res; 568 if (State.VF.isVector()) { 569 assert(Offset <= State.VF.getKnownMinValue() && 570 "invalid offset to extract from"); 571 // Extract lane VF - Offset from the operand. 572 Res = State.get( 573 getOperand(0), 574 VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset))); 575 } else { 576 assert(Offset <= State.UF && "invalid offset to extract from"); 577 // When loop is unrolled without vectorizing, retrieve UF - Offset. 578 Res = State.get(getOperand(0), State.UF - Offset); 579 } 580 Res->setName(Name); 581 return Res; 582 } 583 case VPInstruction::LogicalAnd: { 584 Value *A = State.get(getOperand(0), Part); 585 Value *B = State.get(getOperand(1), Part); 586 return Builder.CreateLogicalAnd(A, B, Name); 587 } 588 case VPInstruction::PtrAdd: { 589 assert(vputils::onlyFirstLaneUsed(this) && 590 "can only generate first lane for PtrAdd"); 591 Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true); 592 Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true); 593 return Builder.CreatePtrAdd(Ptr, Addend, Name); 594 } 595 default: 596 llvm_unreachable("Unsupported opcode for instruction"); 597 } 598 } 599 600 bool VPInstruction::isVectorToScalar() const { 601 return getOpcode() == VPInstruction::ExtractFromEnd || 602 getOpcode() == VPInstruction::ComputeReductionResult; 603 } 604 605 #if !defined(NDEBUG) 606 bool VPInstruction::isFPMathOp() const { 607 // Inspired by FPMathOperator::classof. Notable differences are that we don't 608 // support Call, PHI and Select opcodes here yet. 
609 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 610 Opcode == Instruction::FNeg || Opcode == Instruction::FSub || 611 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 612 Opcode == Instruction::FCmp || Opcode == Instruction::Select; 613 } 614 #endif 615 616 void VPInstruction::execute(VPTransformState &State) { 617 assert(!State.Instance && "VPInstruction executing an Instance"); 618 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 619 assert((hasFastMathFlags() == isFPMathOp() || 620 getOpcode() == Instruction::Select) && 621 "Recipe not a FPMathOp but has fast-math flags?"); 622 if (hasFastMathFlags()) 623 State.Builder.setFastMathFlags(getFastMathFlags()); 624 State.setDebugLocFrom(getDebugLoc()); 625 bool GeneratesPerFirstLaneOnly = 626 canGenerateScalarForFirstLane() && 627 (vputils::onlyFirstLaneUsed(this) || isVectorToScalar()); 628 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 629 bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this); 630 for (unsigned Part = 0; Part < State.UF; ++Part) { 631 if (GeneratesPerAllLanes) { 632 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); 633 Lane != NumLanes; ++Lane) { 634 Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane)); 635 assert(GeneratedValue && "generatePerLane must produce a value"); 636 State.set(this, GeneratedValue, VPIteration(Part, Lane)); 637 } 638 continue; 639 } 640 641 if (Part != 0 && OnlyFirstPartUsed && hasResult()) { 642 Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly); 643 State.set(this, Part0, Part, 644 /*IsScalar*/ GeneratesPerFirstLaneOnly); 645 continue; 646 } 647 648 Value *GeneratedValue = generatePerPart(State, Part); 649 if (!hasResult()) 650 continue; 651 assert(GeneratedValue && "generatePerPart must produce a value"); 652 assert((GeneratedValue->getType()->isVectorTy() == 653 !GeneratesPerFirstLaneOnly || 654 State.VF.isScalar()) && 655 "scalar value but not only first lane defined"); 656 State.set(this, GeneratedValue, Part, 657 /*IsScalar*/ GeneratesPerFirstLaneOnly); 658 } 659 } 660 661 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 662 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 663 if (Instruction::isBinaryOp(getOpcode())) 664 return vputils::onlyFirstLaneUsed(this); 665 666 switch (getOpcode()) { 667 default: 668 return false; 669 case Instruction::ICmp: 670 case VPInstruction::PtrAdd: 671 // TODO: Cover additional opcodes. 
672 return vputils::onlyFirstLaneUsed(this); 673 case VPInstruction::ActiveLaneMask: 674 case VPInstruction::ExplicitVectorLength: 675 case VPInstruction::CalculateTripCountMinusVF: 676 case VPInstruction::CanonicalIVIncrementForPart: 677 case VPInstruction::BranchOnCount: 678 return true; 679 }; 680 llvm_unreachable("switch should return"); 681 } 682 683 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 684 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 685 if (Instruction::isBinaryOp(getOpcode())) 686 return vputils::onlyFirstPartUsed(this); 687 688 switch (getOpcode()) { 689 default: 690 return false; 691 case Instruction::ICmp: 692 case Instruction::Select: 693 return vputils::onlyFirstPartUsed(this); 694 case VPInstruction::BranchOnCount: 695 case VPInstruction::BranchOnCond: 696 case VPInstruction::CanonicalIVIncrementForPart: 697 return true; 698 }; 699 llvm_unreachable("switch should return"); 700 } 701 702 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 703 void VPInstruction::dump() const { 704 VPSlotTracker SlotTracker(getParent()->getPlan()); 705 print(dbgs(), "", SlotTracker); 706 } 707 708 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 709 VPSlotTracker &SlotTracker) const { 710 O << Indent << "EMIT "; 711 712 if (hasResult()) { 713 printAsOperand(O, SlotTracker); 714 O << " = "; 715 } 716 717 switch (getOpcode()) { 718 case VPInstruction::Not: 719 O << "not"; 720 break; 721 case VPInstruction::SLPLoad: 722 O << "combined load"; 723 break; 724 case VPInstruction::SLPStore: 725 O << "combined store"; 726 break; 727 case VPInstruction::ActiveLaneMask: 728 O << "active lane mask"; 729 break; 730 case VPInstruction::ExplicitVectorLength: 731 O << "EXPLICIT-VECTOR-LENGTH"; 732 break; 733 case VPInstruction::FirstOrderRecurrenceSplice: 734 O << "first-order splice"; 735 break; 736 case VPInstruction::BranchOnCond: 737 O << "branch-on-cond"; 738 break; 739 case VPInstruction::CalculateTripCountMinusVF: 740 O << "TC > VF ? TC - VF : 0"; 741 break; 742 case VPInstruction::CanonicalIVIncrementForPart: 743 O << "VF * Part +"; 744 break; 745 case VPInstruction::BranchOnCount: 746 O << "branch-on-count"; 747 break; 748 case VPInstruction::ExtractFromEnd: 749 O << "extract-from-end"; 750 break; 751 case VPInstruction::ComputeReductionResult: 752 O << "compute-reduction-result"; 753 break; 754 case VPInstruction::LogicalAnd: 755 O << "logical-and"; 756 break; 757 case VPInstruction::PtrAdd: 758 O << "ptradd"; 759 break; 760 default: 761 O << Instruction::getOpcodeName(getOpcode()); 762 } 763 764 printFlags(O); 765 printOperands(O, SlotTracker); 766 767 if (auto DL = getDebugLoc()) { 768 O << ", !dbg "; 769 DL.print(O); 770 } 771 } 772 #endif 773 774 void VPWidenCallRecipe::execute(VPTransformState &State) { 775 assert(State.VF.isVector() && "not widening"); 776 Function *CalledScalarFn = getCalledScalarFunction(); 777 assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) && 778 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 779 State.setDebugLocFrom(getDebugLoc()); 780 781 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; 782 FunctionType *VFTy = nullptr; 783 if (Variant) 784 VFTy = Variant->getFunctionType(); 785 for (unsigned Part = 0; Part < State.UF; ++Part) { 786 SmallVector<Type *, 2> TysForDecl; 787 // Add return type if intrinsic is overloaded on it. 
788 if (UseIntrinsic && 789 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) 790 TysForDecl.push_back(VectorType::get( 791 CalledScalarFn->getReturnType()->getScalarType(), State.VF)); 792 SmallVector<Value *, 4> Args; 793 for (const auto &I : enumerate(arg_operands())) { 794 // Some intrinsics have a scalar argument - don't replace it with a 795 // vector. 796 Value *Arg; 797 if (UseIntrinsic && 798 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) 799 Arg = State.get(I.value(), VPIteration(0, 0)); 800 // Some vectorized function variants may also take a scalar argument, 801 // e.g. linear parameters for pointers. This needs to be the scalar value 802 // from the start of the respective part when interleaving. 803 else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) 804 Arg = State.get(I.value(), VPIteration(Part, 0)); 805 else 806 Arg = State.get(I.value(), Part); 807 if (UseIntrinsic && 808 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) 809 TysForDecl.push_back(Arg->getType()); 810 Args.push_back(Arg); 811 } 812 813 Function *VectorF; 814 if (UseIntrinsic) { 815 // Use vector version of the intrinsic. 816 Module *M = State.Builder.GetInsertBlock()->getModule(); 817 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); 818 assert(VectorF && "Can't retrieve vector intrinsic."); 819 } else { 820 #ifndef NDEBUG 821 assert(Variant != nullptr && "Can't create vector function."); 822 #endif 823 VectorF = Variant; 824 } 825 826 auto *CI = cast_or_null<CallInst>(getUnderlyingInstr()); 827 SmallVector<OperandBundleDef, 1> OpBundles; 828 if (CI) 829 CI->getOperandBundlesAsDefs(OpBundles); 830 831 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 832 833 if (isa<FPMathOperator>(V)) 834 V->copyFastMathFlags(CI); 835 836 if (!V->getType()->isVoidTy()) 837 State.set(this, V, Part); 838 State.addMetadata(V, CI); 839 } 840 } 841 842 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 843 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 844 VPSlotTracker &SlotTracker) const { 845 O << Indent << "WIDEN-CALL "; 846 847 Function *CalledFn = getCalledScalarFunction(); 848 if (CalledFn->getReturnType()->isVoidTy()) 849 O << "void "; 850 else { 851 printAsOperand(O, SlotTracker); 852 O << " = "; 853 } 854 855 O << "call @" << CalledFn->getName() << "("; 856 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 857 Op->printAsOperand(O, SlotTracker); 858 }); 859 O << ")"; 860 861 if (VectorIntrinsicID) 862 O << " (using vector intrinsic)"; 863 else { 864 O << " (using library function"; 865 if (Variant->hasName()) 866 O << ": " << Variant->getName(); 867 O << ")"; 868 } 869 } 870 871 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 872 VPSlotTracker &SlotTracker) const { 873 O << Indent << "WIDEN-SELECT "; 874 printAsOperand(O, SlotTracker); 875 O << " = select "; 876 getOperand(0)->printAsOperand(O, SlotTracker); 877 O << ", "; 878 getOperand(1)->printAsOperand(O, SlotTracker); 879 O << ", "; 880 getOperand(2)->printAsOperand(O, SlotTracker); 881 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 882 } 883 #endif 884 885 void VPWidenSelectRecipe::execute(VPTransformState &State) { 886 State.setDebugLocFrom(getDebugLoc()); 887 888 // The condition can be loop invariant but still defined inside the 889 // loop. This means that we can't just use the original 'cond' value. 890 // We have to take the 'vectorized' value and pick the first lane. 
891 // Instcombine will make this a no-op. 892 auto *InvarCond = 893 isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; 894 895 for (unsigned Part = 0; Part < State.UF; ++Part) { 896 Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part); 897 Value *Op0 = State.get(getOperand(1), Part); 898 Value *Op1 = State.get(getOperand(2), Part); 899 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 900 State.set(this, Sel, Part); 901 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 902 } 903 } 904 905 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 906 const FastMathFlags &FMF) { 907 AllowReassoc = FMF.allowReassoc(); 908 NoNaNs = FMF.noNaNs(); 909 NoInfs = FMF.noInfs(); 910 NoSignedZeros = FMF.noSignedZeros(); 911 AllowReciprocal = FMF.allowReciprocal(); 912 AllowContract = FMF.allowContract(); 913 ApproxFunc = FMF.approxFunc(); 914 } 915 916 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 917 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 918 switch (OpType) { 919 case OperationType::Cmp: 920 O << " " << CmpInst::getPredicateName(getPredicate()); 921 break; 922 case OperationType::DisjointOp: 923 if (DisjointFlags.IsDisjoint) 924 O << " disjoint"; 925 break; 926 case OperationType::PossiblyExactOp: 927 if (ExactFlags.IsExact) 928 O << " exact"; 929 break; 930 case OperationType::OverflowingBinOp: 931 if (WrapFlags.HasNUW) 932 O << " nuw"; 933 if (WrapFlags.HasNSW) 934 O << " nsw"; 935 break; 936 case OperationType::FPMathOp: 937 getFastMathFlags().print(O); 938 break; 939 case OperationType::GEPOp: 940 if (GEPFlags.IsInBounds) 941 O << " inbounds"; 942 break; 943 case OperationType::NonNegOp: 944 if (NonNegFlags.NonNeg) 945 O << " nneg"; 946 break; 947 case OperationType::Other: 948 break; 949 } 950 if (getNumOperands() > 0) 951 O << " "; 952 } 953 #endif 954 955 void VPWidenRecipe::execute(VPTransformState &State) { 956 State.setDebugLocFrom(getDebugLoc()); 957 auto &Builder = State.Builder; 958 switch (Opcode) { 959 case Instruction::Call: 960 case Instruction::Br: 961 case Instruction::PHI: 962 case Instruction::GetElementPtr: 963 case Instruction::Select: 964 llvm_unreachable("This instruction is handled by a different recipe."); 965 case Instruction::UDiv: 966 case Instruction::SDiv: 967 case Instruction::SRem: 968 case Instruction::URem: 969 case Instruction::Add: 970 case Instruction::FAdd: 971 case Instruction::Sub: 972 case Instruction::FSub: 973 case Instruction::FNeg: 974 case Instruction::Mul: 975 case Instruction::FMul: 976 case Instruction::FDiv: 977 case Instruction::FRem: 978 case Instruction::Shl: 979 case Instruction::LShr: 980 case Instruction::AShr: 981 case Instruction::And: 982 case Instruction::Or: 983 case Instruction::Xor: { 984 // Just widen unops and binops. 985 for (unsigned Part = 0; Part < State.UF; ++Part) { 986 SmallVector<Value *, 2> Ops; 987 for (VPValue *VPOp : operands()) 988 Ops.push_back(State.get(VPOp, Part)); 989 990 Value *V = Builder.CreateNAryOp(Opcode, Ops); 991 992 if (auto *VecOp = dyn_cast<Instruction>(V)) 993 setFlags(VecOp); 994 995 // Use this vector value for all users of the original instruction. 
996 State.set(this, V, Part); 997 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 998 } 999 1000 break; 1001 } 1002 case Instruction::Freeze: { 1003 for (unsigned Part = 0; Part < State.UF; ++Part) { 1004 Value *Op = State.get(getOperand(0), Part); 1005 1006 Value *Freeze = Builder.CreateFreeze(Op); 1007 State.set(this, Freeze, Part); 1008 } 1009 break; 1010 } 1011 case Instruction::ICmp: 1012 case Instruction::FCmp: { 1013 // Widen compares. Generate vector compares. 1014 bool FCmp = Opcode == Instruction::FCmp; 1015 for (unsigned Part = 0; Part < State.UF; ++Part) { 1016 Value *A = State.get(getOperand(0), Part); 1017 Value *B = State.get(getOperand(1), Part); 1018 Value *C = nullptr; 1019 if (FCmp) { 1020 // Propagate fast math flags. 1021 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1022 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) 1023 Builder.setFastMathFlags(I->getFastMathFlags()); 1024 C = Builder.CreateFCmp(getPredicate(), A, B); 1025 } else { 1026 C = Builder.CreateICmp(getPredicate(), A, B); 1027 } 1028 State.set(this, C, Part); 1029 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1030 } 1031 1032 break; 1033 } 1034 default: 1035 // This instruction is not vectorized by simple widening. 1036 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1037 << Instruction::getOpcodeName(Opcode)); 1038 llvm_unreachable("Unhandled instruction!"); 1039 } // end of switch. 1040 1041 #if !defined(NDEBUG) 1042 // Verify that VPlan type inference results agree with the type of the 1043 // generated values. 1044 for (unsigned Part = 0; Part < State.UF; ++Part) { 1045 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), 1046 State.VF) == State.get(this, Part)->getType() && 1047 "inferred type and type from generated instructions do not match"); 1048 } 1049 #endif 1050 } 1051 1052 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1053 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 1054 VPSlotTracker &SlotTracker) const { 1055 O << Indent << "WIDEN "; 1056 printAsOperand(O, SlotTracker); 1057 O << " = " << Instruction::getOpcodeName(Opcode); 1058 printFlags(O); 1059 printOperands(O, SlotTracker); 1060 } 1061 #endif 1062 1063 void VPWidenCastRecipe::execute(VPTransformState &State) { 1064 State.setDebugLocFrom(getDebugLoc()); 1065 auto &Builder = State.Builder; 1066 /// Vectorize casts. 1067 assert(State.VF.isVector() && "Not vectorizing?"); 1068 Type *DestTy = VectorType::get(getResultType(), State.VF); 1069 VPValue *Op = getOperand(0); 1070 for (unsigned Part = 0; Part < State.UF; ++Part) { 1071 if (Part > 0 && Op->isLiveIn()) { 1072 // FIXME: Remove once explicit unrolling is implemented using VPlan. 
1073 State.set(this, State.get(this, 0), Part); 1074 continue; 1075 } 1076 Value *A = State.get(Op, Part); 1077 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 1078 State.set(this, Cast, Part); 1079 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); 1080 } 1081 } 1082 1083 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1084 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 1085 VPSlotTracker &SlotTracker) const { 1086 O << Indent << "WIDEN-CAST "; 1087 printAsOperand(O, SlotTracker); 1088 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1089 printFlags(O); 1090 printOperands(O, SlotTracker); 1091 O << " to " << *getResultType(); 1092 } 1093 #endif 1094 1095 /// This function adds 1096 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 1097 /// to each vector element of Val. The sequence starts at StartIndex. 1098 /// \p Opcode is relevant for FP induction variable. 1099 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 1100 Instruction::BinaryOps BinOp, ElementCount VF, 1101 IRBuilderBase &Builder) { 1102 assert(VF.isVector() && "only vector VFs are supported"); 1103 1104 // Create and check the types. 1105 auto *ValVTy = cast<VectorType>(Val->getType()); 1106 ElementCount VLen = ValVTy->getElementCount(); 1107 1108 Type *STy = Val->getType()->getScalarType(); 1109 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1110 "Induction Step must be an integer or FP"); 1111 assert(Step->getType() == STy && "Step has wrong type"); 1112 1113 SmallVector<Constant *, 8> Indices; 1114 1115 // Create a vector of consecutive numbers from zero to VF. 1116 VectorType *InitVecValVTy = ValVTy; 1117 if (STy->isFloatingPointTy()) { 1118 Type *InitVecValSTy = 1119 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 1120 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 1121 } 1122 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 1123 1124 // Splat the StartIdx 1125 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 1126 1127 if (STy->isIntegerTy()) { 1128 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 1129 Step = Builder.CreateVectorSplat(VLen, Step); 1130 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1131 // FIXME: The newly created binary instructions should contain nsw/nuw 1132 // flags, which can be found from the original scalar operations. 1133 Step = Builder.CreateMul(InitVec, Step); 1134 return Builder.CreateAdd(Val, Step, "induction"); 1135 } 1136 1137 // Floating point induction. 1138 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1139 "Binary Opcode should be specified for FP induction"); 1140 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 1141 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 1142 1143 Step = Builder.CreateVectorSplat(VLen, Step); 1144 Value *MulOp = Builder.CreateFMul(InitVec, Step); 1145 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1146 } 1147 1148 /// A helper function that returns an integer or floating-point constant with 1149 /// value C. 1150 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 1151 return Ty->isIntegerTy() ? 
ConstantInt::getSigned(Ty, C) 1152 : ConstantFP::get(Ty, C); 1153 } 1154 1155 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 1156 ElementCount VF) { 1157 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1158 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1159 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1160 return B.CreateUIToFP(RuntimeVF, FTy); 1161 } 1162 1163 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1164 assert(!State.Instance && "Int or FP induction being replicated."); 1165 1166 Value *Start = getStartValue()->getLiveInIRValue(); 1167 const InductionDescriptor &ID = getInductionDescriptor(); 1168 TruncInst *Trunc = getTruncInst(); 1169 IRBuilderBase &Builder = State.Builder; 1170 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1171 assert(State.VF.isVector() && "must have vector VF"); 1172 1173 // The value from the original loop to which we are mapping the new induction 1174 // variable. 1175 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1176 1177 // Fast-math-flags propagate from the original induction instruction. 1178 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1179 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1180 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1181 1182 // Now do the actual transformations, and start with fetching the step value. 1183 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1184 1185 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1186 "Expected either an induction phi-node or a truncate of it!"); 1187 1188 // Construct the initial value of the vector IV in the vector loop preheader 1189 auto CurrIP = Builder.saveIP(); 1190 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1191 Builder.SetInsertPoint(VectorPH->getTerminator()); 1192 if (isa<TruncInst>(EntryVal)) { 1193 assert(Start->getType()->isIntegerTy() && 1194 "Truncation requires an integer type"); 1195 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1196 Step = Builder.CreateTrunc(Step, TruncType); 1197 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1198 } 1199 1200 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 1201 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1202 Value *SteppedStart = getStepVector( 1203 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 1204 1205 // We create vector phi nodes for both integer and floating-point induction 1206 // variables. Here, we determine the kind of arithmetic we will perform. 1207 Instruction::BinaryOps AddOp; 1208 Instruction::BinaryOps MulOp; 1209 if (Step->getType()->isIntegerTy()) { 1210 AddOp = Instruction::Add; 1211 MulOp = Instruction::Mul; 1212 } else { 1213 AddOp = ID.getInductionOpcode(); 1214 MulOp = Instruction::FMul; 1215 } 1216 1217 // Multiply the vectorization factor by the step using integer or 1218 // floating-point arithmetic as appropriate. 1219 Type *StepType = Step->getType(); 1220 Value *RuntimeVF; 1221 if (Step->getType()->isFloatingPointTy()) 1222 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 1223 else 1224 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 1225 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1226 1227 // Create a vector splat to use in the induction update. 1228 // 1229 // FIXME: If the step is non-constant, we create the vector splat with 1230 // IRBuilder. 
IRBuilder can constant-fold the multiply, but it doesn't 1231 // handle a constant vector splat. 1232 Value *SplatVF = isa<Constant>(Mul) 1233 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 1234 : Builder.CreateVectorSplat(State.VF, Mul); 1235 Builder.restoreIP(CurrIP); 1236 1237 // We may need to add the step a number of times, depending on the unroll 1238 // factor. The last of those goes into the PHI. 1239 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1240 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1241 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1242 Instruction *LastInduction = VecInd; 1243 for (unsigned Part = 0; Part < State.UF; ++Part) { 1244 State.set(this, LastInduction, Part); 1245 1246 if (isa<TruncInst>(EntryVal)) 1247 State.addMetadata(LastInduction, EntryVal); 1248 1249 LastInduction = cast<Instruction>( 1250 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 1251 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1252 } 1253 1254 LastInduction->setName("vec.ind.next"); 1255 VecInd->addIncoming(SteppedStart, VectorPH); 1256 // Add induction update using an incorrect block temporarily. The phi node 1257 // will be fixed after VPlan execution. Note that at this point the latch 1258 // block cannot be used, as it does not exist yet. 1259 // TODO: Model increment value in VPlan, by turning the recipe into a 1260 // multi-def and a subclass of VPHeaderPHIRecipe. 1261 VecInd->addIncoming(LastInduction, VectorPH); 1262 } 1263 1264 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1265 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1266 VPSlotTracker &SlotTracker) const { 1267 O << Indent << "WIDEN-INDUCTION"; 1268 if (getTruncInst()) { 1269 O << "\\l\""; 1270 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 1271 O << " +\n" << Indent << "\" "; 1272 getVPValue(0)->printAsOperand(O, SlotTracker); 1273 } else 1274 O << " " << VPlanIngredient(IV); 1275 1276 O << ", "; 1277 getStepValue()->printAsOperand(O, SlotTracker); 1278 } 1279 #endif 1280 1281 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1282 // The step may be defined by a recipe in the preheader (e.g. if it requires 1283 // SCEV expansion), but for the canonical induction the step is required to be 1284 // 1, which is represented as live-in. 1285 if (getStepValue()->getDefiningRecipe()) 1286 return false; 1287 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1288 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1289 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1290 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1291 getScalarType() == CanIV->getScalarType(); 1292 } 1293 1294 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1295 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1296 VPSlotTracker &SlotTracker) const { 1297 O << Indent; 1298 printAsOperand(O, SlotTracker); 1299 O << Indent << "= DERIVED-IV "; 1300 getStartValue()->printAsOperand(O, SlotTracker); 1301 O << " + "; 1302 getOperand(1)->printAsOperand(O, SlotTracker); 1303 O << " * "; 1304 getStepValue()->printAsOperand(O, SlotTracker); 1305 } 1306 #endif 1307 1308 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1309 // Fast-math-flags propagate from the original induction instruction. 
1310 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1311 if (hasFastMathFlags()) 1312 State.Builder.setFastMathFlags(getFastMathFlags()); 1313 1314 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1315 /// variable on which to base the steps, \p Step is the size of the step. 1316 1317 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 1318 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 1319 IRBuilderBase &Builder = State.Builder; 1320 1321 // Ensure step has the same type as that of scalar IV. 1322 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1323 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1324 1325 // We build scalar steps for both integer and floating-point induction 1326 // variables. Here, we determine the kind of arithmetic we will perform. 1327 Instruction::BinaryOps AddOp; 1328 Instruction::BinaryOps MulOp; 1329 if (BaseIVTy->isIntegerTy()) { 1330 AddOp = Instruction::Add; 1331 MulOp = Instruction::Mul; 1332 } else { 1333 AddOp = InductionOpcode; 1334 MulOp = Instruction::FMul; 1335 } 1336 1337 // Determine the number of scalars we need to generate for each unroll 1338 // iteration. 1339 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1340 // Compute the scalar steps and save the results in State. 1341 Type *IntStepTy = 1342 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1343 Type *VecIVTy = nullptr; 1344 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1345 if (!FirstLaneOnly && State.VF.isScalable()) { 1346 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1347 UnitStepVec = 1348 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1349 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1350 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1351 } 1352 1353 unsigned StartPart = 0; 1354 unsigned EndPart = State.UF; 1355 unsigned StartLane = 0; 1356 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1357 if (State.Instance) { 1358 StartPart = State.Instance->Part; 1359 EndPart = StartPart + 1; 1360 StartLane = State.Instance->Lane.getKnownLane(); 1361 EndLane = StartLane + 1; 1362 } 1363 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1364 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1365 1366 if (!FirstLaneOnly && State.VF.isScalable()) { 1367 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1368 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1369 if (BaseIVTy->isFloatingPointTy()) 1370 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1371 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1372 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1373 State.set(this, Add, Part); 1374 // It's useful to record the lane values too for the known minimum number 1375 // of elements so we do those below. This improves the code quality when 1376 // trying to extract the first element, for example. 1377 } 1378 1379 if (BaseIVTy->isFloatingPointTy()) 1380 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1381 1382 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1383 Value *StartIdx = Builder.CreateBinOp( 1384 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1385 // The step returned by `createStepForVF` is a runtime-evaluated value 1386 // when VF is scalable. Otherwise, it should be folded into a Constant. 
1387 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1388 "Expected StartIdx to be folded to a constant when VF is not " 1389 "scalable"); 1390 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1391 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1392 State.set(this, Add, VPIteration(Part, Lane)); 1393 } 1394 } 1395 } 1396 1397 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1398 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1399 VPSlotTracker &SlotTracker) const { 1400 O << Indent; 1401 printAsOperand(O, SlotTracker); 1402 O << " = SCALAR-STEPS "; 1403 printOperands(O, SlotTracker); 1404 } 1405 #endif 1406 1407 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1408 assert(State.VF.isVector() && "not widening"); 1409 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1410 // Construct a vector GEP by widening the operands of the scalar GEP as 1411 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1412 // results in a vector of pointers when at least one operand of the GEP 1413 // is vector-typed. Thus, to keep the representation compact, we only use 1414 // vector-typed operands for loop-varying values. 1415 1416 if (areAllOperandsInvariant()) { 1417 // If we are vectorizing, but the GEP has only loop-invariant operands, 1418 // the GEP we build (by only using vector-typed operands for 1419 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1420 // produce a vector of pointers, we need to either arbitrarily pick an 1421 // operand to broadcast, or broadcast a clone of the original GEP. 1422 // Here, we broadcast a clone of the original. 1423 // 1424 // TODO: If at some point we decide to scalarize instructions having 1425 // loop-invariant operands, this special case will no longer be 1426 // required. We would add the scalarization decision to 1427 // collectLoopScalars() and teach getVectorValue() to broadcast 1428 // the lane-zero scalar value. 1429 SmallVector<Value *> Ops; 1430 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1431 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1432 1433 auto *NewGEP = 1434 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1435 ArrayRef(Ops).drop_front(), "", isInBounds()); 1436 for (unsigned Part = 0; Part < State.UF; ++Part) { 1437 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1438 State.set(this, EntryPart, Part); 1439 State.addMetadata(EntryPart, GEP); 1440 } 1441 } else { 1442 // If the GEP has at least one loop-varying operand, we are sure to 1443 // produce a vector of pointers. But if we are only unrolling, we want 1444 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1445 // produce with the code below will be scalar (if VF == 1) or vector 1446 // (otherwise). Note that for the unroll-only case, we still maintain 1447 // values in the vector mapping with initVector, as we do for other 1448 // instructions. 1449 for (unsigned Part = 0; Part < State.UF; ++Part) { 1450 // The pointer operand of the new GEP. If it's loop-invariant, we 1451 // won't broadcast it. 1452 auto *Ptr = isPointerLoopInvariant() 1453 ? State.get(getOperand(0), VPIteration(0, 0)) 1454 : State.get(getOperand(0), Part); 1455 1456 // Collect all the indices for the new GEP. If any index is 1457 // loop-invariant, we won't broadcast it. 
1458 SmallVector<Value *, 4> Indices; 1459 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1460 VPValue *Operand = getOperand(I); 1461 if (isIndexLoopInvariant(I - 1)) 1462 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1463 else 1464 Indices.push_back(State.get(Operand, Part)); 1465 } 1466 1467 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1468 // but it should be a vector, otherwise. 1469 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1470 Indices, "", isInBounds()); 1471 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1472 "NewGEP is not a pointer vector"); 1473 State.set(this, NewGEP, Part); 1474 State.addMetadata(NewGEP, GEP); 1475 } 1476 } 1477 } 1478 1479 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1480 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1481 VPSlotTracker &SlotTracker) const { 1482 O << Indent << "WIDEN-GEP "; 1483 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1484 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1485 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 1486 1487 O << " "; 1488 printAsOperand(O, SlotTracker); 1489 O << " = getelementptr"; 1490 printFlags(O); 1491 printOperands(O, SlotTracker); 1492 } 1493 #endif 1494 1495 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1496 auto &Builder = State.Builder; 1497 State.setDebugLocFrom(getDebugLoc()); 1498 for (unsigned Part = 0; Part < State.UF; ++Part) { 1499 // Calculate the pointer for the specific unroll-part. 1500 Value *PartPtr = nullptr; 1501 // Use i32 for the gep index type when the value is constant, 1502 // or query DataLayout for a more suitable index type otherwise. 1503 const DataLayout &DL = 1504 Builder.GetInsertBlock()->getModule()->getDataLayout(); 1505 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1506 ? DL.getIndexType(IndexedTy->getPointerTo()) 1507 : Builder.getInt32Ty(); 1508 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1509 bool InBounds = isInBounds(); 1510 if (IsReverse) { 1511 // If the address is consecutive but reversed, then the 1512 // wide store needs to start at the last vector element. 
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VFs, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are therefore essentially undef, are taken from In0.
  VectorParts Entry(State.UF);
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single-edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
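    // The printed form in this case is simply, e.g., "BLEND ir<%p> = ir<%inc>"
    // (operand names illustrative).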
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sunk "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
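  // For example (illustrative): a predicated, replicated load whose per-lane
  // results reach a widened add through a VPPredInstPHIRecipe must also
  // assemble those scalar results into a vector, since the widened user
  // consumes them as a single vector value.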
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
/// if it is either defined outside the vector region or its operand is known
/// to be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
/// TODO: Uniformity should be associated with a VPValue and there should be a
/// generic way to check.
static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
  return C->isDefinedOutsideVectorRegions() ||
         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
}

Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
  assert(vputils::onlyFirstLaneUsed(this) &&
         "Codegen only implemented for first lane.");
  switch (Opcode) {
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::Trunc: {
    // Note: SExt/ZExt not used yet.
    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

void VPScalarCastRecipe::execute(VPTransformState &State) {
  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
  for (unsigned Part = 0; Part != State.UF; ++Part) {
    Value *Res;
    // Only generate a single instance if the recipe is uniform across UFs and
    // VFs.
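    // E.g. a trunc of the canonical IV produces the same scalar for every
    // part, so parts other than 0 can reuse the part-0 value (illustrative;
    // see isUniformAcrossVFsAndUFs above).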
    if (Part > 0 && IsUniformAcrossVFsAndUFs)
      Res = State.get(this, VPIteration(0, 0));
    else
      Res = generate(State, Part);
    State.set(this, Res, VPIteration(Part, 0));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *ResultTy;
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
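    // The merge produced here has roughly this shape (names illustrative):
    //   %vphi = phi <VF x ty> [ %prev.vec, %predicating.bb ],
    //                         [ %inserted.vec, %predicated.bb ]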
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
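  // In SCEV-like notation, only inductions of the form {CanonicalStart,+,1}
  // are treated as canonical here.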
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

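  // When vectorizing, the start value is inserted into the last lane of a
  // poison vector below, e.g. <poison, poison, poison, %init> for VF = 4
  // (illustrative), so the recurrence splice sees the scalar initial value
  // in lane 0 of the first vector iteration.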
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
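    // E.g. for smin, min(StartV, StartV) == StartV, so seeding every lane
    // with the start value leaves the final reduced result unchanged.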
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif