//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live out phi's in the exiting block
  // via middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only calculates the instruction cost with
  // underlying instruction. Removing the WidenMem here will prevent
  // force-target-instruction-cost overwriting the cost of recipe with
  // underlying instruction which is inconsistent with the legacy model.
  // TODO: Remove WidenMem from this function when we don't need to compare to
  // the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model using
  // the underlying instruction. If there is no underlying instruction, returns
  // 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform, avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideVectorRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp, 0));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept FMF flags.
  // TODO: Enable other flags when support is added.
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst, 0);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-VP ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
1605 Type *IntStepTy = 1606 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1607 Type *VecIVTy = nullptr; 1608 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1609 if (!FirstLaneOnly && State.VF.isScalable()) { 1610 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1611 UnitStepVec = 1612 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1613 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1614 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1615 } 1616 1617 unsigned StartPart = 0; 1618 unsigned EndPart = State.UF; 1619 unsigned StartLane = 0; 1620 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1621 if (State.Instance) { 1622 StartPart = State.Instance->Part; 1623 EndPart = StartPart + 1; 1624 StartLane = State.Instance->Lane.getKnownLane(); 1625 EndLane = StartLane + 1; 1626 } 1627 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 1628 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 1629 1630 if (!FirstLaneOnly && State.VF.isScalable()) { 1631 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1632 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1633 if (BaseIVTy->isFloatingPointTy()) 1634 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1635 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1636 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1637 State.set(this, Add, Part); 1638 // It's useful to record the lane values too for the known minimum number 1639 // of elements so we do those below. This improves the code quality when 1640 // trying to extract the first element, for example. 1641 } 1642 1643 if (BaseIVTy->isFloatingPointTy()) 1644 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1645 1646 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1647 Value *StartIdx = Builder.CreateBinOp( 1648 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1649 // The step returned by `createStepForVF` is a runtime-evaluated value 1650 // when VF is scalable. Otherwise, it should be folded into a Constant. 1651 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1652 "Expected StartIdx to be folded to a constant when VF is not " 1653 "scalable"); 1654 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1655 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1656 State.set(this, Add, VPIteration(Part, Lane)); 1657 } 1658 } 1659 } 1660 1661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1662 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1663 VPSlotTracker &SlotTracker) const { 1664 O << Indent; 1665 printAsOperand(O, SlotTracker); 1666 O << " = SCALAR-STEPS "; 1667 printOperands(O, SlotTracker); 1668 } 1669 #endif 1670 1671 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1672 assert(State.VF.isVector() && "not widening"); 1673 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1674 // Construct a vector GEP by widening the operands of the scalar GEP as 1675 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1676 // results in a vector of pointers when at least one operand of the GEP 1677 // is vector-typed. Thus, to keep the representation compact, we only use 1678 // vector-typed operands for loop-varying values. 
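  // For example (illustrative only), a scalar GEP such as
  //   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
  // with a loop-varying %iv keeps %base as a scalar operand and uses the
  // widened <VF x i64> value of %iv as the index, producing a <VF x ptr>
  // result; if all operands are loop-invariant, the scalar GEP is instead
  // cloned and broadcast (first case below).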
1679 1680 if (areAllOperandsInvariant()) { 1681 // If we are vectorizing, but the GEP has only loop-invariant operands, 1682 // the GEP we build (by only using vector-typed operands for 1683 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1684 // produce a vector of pointers, we need to either arbitrarily pick an 1685 // operand to broadcast, or broadcast a clone of the original GEP. 1686 // Here, we broadcast a clone of the original. 1687 // 1688 // TODO: If at some point we decide to scalarize instructions having 1689 // loop-invariant operands, this special case will no longer be 1690 // required. We would add the scalarization decision to 1691 // collectLoopScalars() and teach getVectorValue() to broadcast 1692 // the lane-zero scalar value. 1693 SmallVector<Value *> Ops; 1694 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1695 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); 1696 1697 auto *NewGEP = 1698 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1699 ArrayRef(Ops).drop_front(), "", isInBounds()); 1700 for (unsigned Part = 0; Part < State.UF; ++Part) { 1701 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1702 State.set(this, EntryPart, Part); 1703 State.addMetadata(EntryPart, GEP); 1704 } 1705 } else { 1706 // If the GEP has at least one loop-varying operand, we are sure to 1707 // produce a vector of pointers. But if we are only unrolling, we want 1708 // to produce a scalar GEP for each unroll part. Thus, the GEP we 1709 // produce with the code below will be scalar (if VF == 1) or vector 1710 // (otherwise). Note that for the unroll-only case, we still maintain 1711 // values in the vector mapping with initVector, as we do for other 1712 // instructions. 1713 for (unsigned Part = 0; Part < State.UF; ++Part) { 1714 // The pointer operand of the new GEP. If it's loop-invariant, we 1715 // won't broadcast it. 1716 auto *Ptr = isPointerLoopInvariant() 1717 ? State.get(getOperand(0), VPIteration(0, 0)) 1718 : State.get(getOperand(0), Part); 1719 1720 // Collect all the indices for the new GEP. If any index is 1721 // loop-invariant, we won't broadcast it. 1722 SmallVector<Value *, 4> Indices; 1723 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1724 VPValue *Operand = getOperand(I); 1725 if (isIndexLoopInvariant(I - 1)) 1726 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 1727 else 1728 Indices.push_back(State.get(Operand, Part)); 1729 } 1730 1731 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1732 // but it should be a vector, otherwise. 1733 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1734 Indices, "", isInBounds()); 1735 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1736 "NewGEP is not a pointer vector"); 1737 State.set(this, NewGEP, Part); 1738 State.addMetadata(NewGEP, GEP); 1739 } 1740 } 1741 } 1742 1743 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1744 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1745 VPSlotTracker &SlotTracker) const { 1746 O << Indent << "WIDEN-GEP "; 1747 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1748 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1749 O << "[" << (isIndexLoopInvariant(I) ? 
"Inv" : "Var") << "]"; 1750 1751 O << " "; 1752 printAsOperand(O, SlotTracker); 1753 O << " = getelementptr"; 1754 printFlags(O); 1755 printOperands(O, SlotTracker); 1756 } 1757 #endif 1758 1759 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1760 auto &Builder = State.Builder; 1761 State.setDebugLocFrom(getDebugLoc()); 1762 for (unsigned Part = 0; Part < State.UF; ++Part) { 1763 // Calculate the pointer for the specific unroll-part. 1764 Value *PartPtr = nullptr; 1765 // Use i32 for the gep index type when the value is constant, 1766 // or query DataLayout for a more suitable index type otherwise. 1767 const DataLayout &DL = 1768 Builder.GetInsertBlock()->getDataLayout(); 1769 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1770 ? DL.getIndexType(IndexedTy->getPointerTo()) 1771 : Builder.getInt32Ty(); 1772 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1773 bool InBounds = isInBounds(); 1774 if (IsReverse) { 1775 // If the address is consecutive but reversed, then the 1776 // wide store needs to start at the last vector element. 1777 // RunTimeVF = VScale * VF.getKnownMinValue() 1778 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1779 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1780 // NumElt = -Part * RunTimeVF 1781 Value *NumElt = Builder.CreateMul( 1782 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1783 // LastLane = 1 - RunTimeVF 1784 Value *LastLane = 1785 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1786 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1787 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1788 } else { 1789 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1790 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1791 } 1792 1793 State.set(this, PartPtr, Part, /*IsScalar*/ true); 1794 } 1795 } 1796 1797 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1798 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1799 VPSlotTracker &SlotTracker) const { 1800 O << Indent; 1801 printAsOperand(O, SlotTracker); 1802 O << " = vector-pointer "; 1803 if (IsReverse) 1804 O << "(reverse) "; 1805 1806 printOperands(O, SlotTracker); 1807 } 1808 #endif 1809 1810 void VPBlendRecipe::execute(VPTransformState &State) { 1811 assert(isNormalized() && "Expected blend to be normalized!"); 1812 State.setDebugLocFrom(getDebugLoc()); 1813 // We know that all PHIs in non-header blocks are converted into 1814 // selects, so we don't have to worry about the insertion order and we 1815 // can just use the builder. 1816 // At this point we generate the predication tree. There may be 1817 // duplications since this is a simple recursive scan, but future 1818 // optimizations will clean it up. 1819 1820 unsigned NumIncoming = getNumIncomingValues(); 1821 1822 // Generate a sequence of selects of the form: 1823 // SELECT(Mask3, In3, 1824 // SELECT(Mask2, In2, 1825 // SELECT(Mask1, In1, 1826 // In0))) 1827 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1828 // are essentially undef are taken from In0. 1829 VectorParts Entry(State.UF); 1830 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1831 for (unsigned In = 0; In < NumIncoming; ++In) { 1832 for (unsigned Part = 0; Part < State.UF; ++Part) { 1833 // We might have single edge PHIs (blocks) - use an identity 1834 // 'select' for the first PHI operand. 
1835 Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed); 1836 if (In == 0) 1837 Entry[Part] = In0; // Initialize with the first incoming value. 1838 else { 1839 // Select between the current value and the previous incoming edge 1840 // based on the incoming mask. 1841 Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed); 1842 Entry[Part] = 1843 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 1844 } 1845 } 1846 } 1847 1848 for (unsigned Part = 0; Part < State.UF; ++Part) 1849 State.set(this, Entry[Part], Part, OnlyFirstLaneUsed); 1850 } 1851 1852 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1853 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1854 VPSlotTracker &SlotTracker) const { 1855 O << Indent << "BLEND "; 1856 printAsOperand(O, SlotTracker); 1857 O << " ="; 1858 if (getNumIncomingValues() == 1) { 1859 // Not a User of any mask: not really blending, this is a 1860 // single-predecessor phi. 1861 O << " "; 1862 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1863 } else { 1864 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1865 O << " "; 1866 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1867 if (I == 0) 1868 continue; 1869 O << "/"; 1870 getMask(I)->printAsOperand(O, SlotTracker); 1871 } 1872 } 1873 } 1874 #endif 1875 1876 void VPReductionRecipe::execute(VPTransformState &State) { 1877 assert(!State.Instance && "Reduction being replicated."); 1878 Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); 1879 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1880 // Propagate the fast-math flags carried by the underlying instruction. 1881 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1882 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1883 for (unsigned Part = 0; Part < State.UF; ++Part) { 1884 Value *NewVecOp = State.get(getVecOp(), Part); 1885 if (VPValue *Cond = getCondOp()) { 1886 Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); 1887 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 1888 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 1889 1890 Value *Start; 1891 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 1892 Start = RdxDesc.getRecurrenceStartValue(); 1893 else 1894 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 1895 RdxDesc.getFastMathFlags()); 1896 if (State.VF.isVector()) 1897 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), 1898 Start); 1899 1900 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 1901 NewVecOp = Select; 1902 } 1903 Value *NewRed; 1904 Value *NextInChain; 1905 if (IsOrdered) { 1906 if (State.VF.isVector()) 1907 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 1908 PrevInChain); 1909 else 1910 NewRed = State.Builder.CreateBinOp( 1911 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 1912 NewVecOp); 1913 PrevInChain = NewRed; 1914 NextInChain = NewRed; 1915 } else { 1916 PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); 1917 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 1918 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1919 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 1920 NewRed, PrevInChain); 1921 else 1922 NextInChain = State.Builder.CreateBinOp( 1923 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, 1924 PrevInChain); 1925 } 1926 State.set(this, NextInChain, Part, /*IsScalar*/ true); 1927 } 1928 } 1929 1930 void VPReductionEVLRecipe::execute(VPTransformState &State) { 1931 assert(!State.Instance && "Reduction being replicated."); 1932 assert(State.UF == 1 && 1933 "Expected only UF == 1 when vectorizing with explicit vector length."); 1934 1935 auto &Builder = State.Builder; 1936 // Propagate the fast-math flags carried by the underlying instruction. 1937 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 1938 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1939 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1940 1941 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1942 Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true); 1943 Value *VecOp = State.get(getVecOp(), 0); 1944 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 1945 1946 VectorBuilder VBuilder(Builder); 1947 VBuilder.setEVL(EVL); 1948 Value *Mask; 1949 // TODO: move the all-true mask generation into VectorBuilder. 1950 if (VPValue *CondOp = getCondOp()) 1951 Mask = State.get(CondOp, 0); 1952 else 1953 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 1954 VBuilder.setMask(Mask); 1955 1956 Value *NewRed; 1957 if (isOrdered()) { 1958 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 1959 } else { 1960 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 1961 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1962 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 1963 else 1964 NewRed = Builder.CreateBinOp( 1965 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); 1966 } 1967 State.set(this, NewRed, 0, /*IsScalar*/ true); 1968 } 1969 1970 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1971 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 1972 VPSlotTracker &SlotTracker) const { 1973 O << Indent << "REDUCE "; 1974 printAsOperand(O, SlotTracker); 1975 O << " = "; 1976 getChainOp()->printAsOperand(O, SlotTracker); 1977 O << " +"; 1978 if (isa<FPMathOperator>(getUnderlyingInstr())) 1979 O << getUnderlyingInstr()->getFastMathFlags(); 1980 O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1981 getVecOp()->printAsOperand(O, SlotTracker); 1982 if (isConditional()) { 1983 O << ", "; 1984 getCondOp()->printAsOperand(O, SlotTracker); 1985 } 1986 O << ")"; 1987 if (RdxDesc.IntermediateStore) 1988 O << " (with final reduction value stored in invariant address sank " 1989 "outside of loop)"; 1990 } 1991 1992 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1993 VPSlotTracker &SlotTracker) const { 1994 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1995 O << Indent << "REDUCE "; 1996 printAsOperand(O, SlotTracker); 1997 O << " = "; 1998 getChainOp()->printAsOperand(O, SlotTracker); 1999 O << " +"; 2000 if (isa<FPMathOperator>(getUnderlyingInstr())) 2001 O << getUnderlyingInstr()->getFastMathFlags(); 2002 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2003 getVecOp()->printAsOperand(O, SlotTracker); 2004 O << ", "; 2005 getEVL()->printAsOperand(O, SlotTracker); 2006 if (isConditional()) { 2007 O << ", "; 2008 getCondOp()->printAsOperand(O, SlotTracker); 2009 } 2010 O << ")"; 2011 if (RdxDesc.IntermediateStore) 2012 O << " (with final reduction value stored in invariant address sank " 2013 "outside of loop)"; 2014 } 2015 #endif 2016 2017 bool VPReplicateRecipe::shouldPack() const { 2018 // Find if the recipe is used by a widened recipe via an intervening 2019 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 2020 return any_of(users(), [](const VPUser *U) { 2021 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2022 return any_of(PredR->users(), [PredR](const VPUser *U) { 2023 return !U->usesScalars(PredR); 2024 }); 2025 return false; 2026 }); 2027 } 2028 2029 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2030 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2031 VPSlotTracker &SlotTracker) const { 2032 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2033 2034 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2035 printAsOperand(O, SlotTracker); 2036 O << " = "; 2037 } 2038 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2039 O << "call"; 2040 printFlags(O); 2041 O << "@" << CB->getCalledFunction()->getName() << "("; 2042 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2043 O, [&O, &SlotTracker](VPValue *Op) { 2044 Op->printAsOperand(O, SlotTracker); 2045 }); 2046 O << ")"; 2047 } else { 2048 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2049 printFlags(O); 2050 printOperands(O, SlotTracker); 2051 } 2052 2053 if (shouldPack()) 2054 O << " (S->V)"; 2055 } 2056 #endif 2057 2058 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 2059 /// if it is either defined outside the vector region or its operand is known to 2060 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 2061 /// TODO: Uniformity should be associated with a VPValue and there should be a 2062 /// generic way to check. 
2063 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
2064   return C->isDefinedOutsideVectorRegions() ||
2065          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
2066          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
2067 }
2068 
2069 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
2070   assert(vputils::onlyFirstLaneUsed(this) &&
2071          "Codegen only implemented for first lane.");
2072   switch (Opcode) {
2073   case Instruction::SExt:
2074   case Instruction::ZExt:
2075   case Instruction::Trunc: {
2076     // Note: SExt/ZExt not used yet.
2077     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
2078     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2079   }
2080   default:
2081     llvm_unreachable("opcode not implemented yet");
2082   }
2083 }
2084 
2085 void VPScalarCastRecipe::execute(VPTransformState &State) {
2086   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
2087   for (unsigned Part = 0; Part != State.UF; ++Part) {
2088     Value *Res;
2089     // Only generate a single instance if the recipe is uniform across UFs and
2090     // VFs.
2091     if (Part > 0 && IsUniformAcrossVFsAndUFs)
2092       Res = State.get(this, VPIteration(0, 0));
2093     else
2094       Res = generate(State, Part);
2095     State.set(this, Res, VPIteration(Part, 0));
2096   }
2097 }
2098 
2099 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2100 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2101                                VPSlotTracker &SlotTracker) const {
2102   O << Indent << "SCALAR-CAST ";
2103   printAsOperand(O, SlotTracker);
2104   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2105   printOperands(O, SlotTracker);
2106   O << " to " << *ResultTy;
2107 }
2108 #endif
2109 
2110 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2111   assert(State.Instance && "Branch on Mask works only on single instance.");
2112 
2113   unsigned Part = State.Instance->Part;
2114   unsigned Lane = State.Instance->Lane.getKnownLane();
2115 
2116   Value *ConditionBit = nullptr;
2117   VPValue *BlockInMask = getMask();
2118   if (BlockInMask) {
2119     ConditionBit = State.get(BlockInMask, Part);
2120     if (ConditionBit->getType()->isVectorTy())
2121       ConditionBit = State.Builder.CreateExtractElement(
2122           ConditionBit, State.Builder.getInt32(Lane));
2123   } else // Block in mask is all-one.
2124     ConditionBit = State.Builder.getTrue();
2125 
2126   // Replace the temporary unreachable terminator with a new conditional branch,
2127   // whose two destinations will be set later when they are created.
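  // (The successor blocks do not exist yet; they are created when the
  //  corresponding successor VPBasicBlocks are executed, and the branch
  //  destinations are filled in at that point.)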
2128 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2129 assert(isa<UnreachableInst>(CurrentTerminator) && 2130 "Expected to replace unreachable terminator with conditional branch."); 2131 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2132 CondBr->setSuccessor(0, nullptr); 2133 ReplaceInstWithInst(CurrentTerminator, CondBr); 2134 } 2135 2136 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2137 assert(State.Instance && "Predicated instruction PHI works per instance."); 2138 Instruction *ScalarPredInst = 2139 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 2140 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2141 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2142 assert(PredicatingBB && "Predicated block has no single predecessor."); 2143 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2144 "operand must be VPReplicateRecipe"); 2145 2146 // By current pack/unpack logic we need to generate only a single phi node: if 2147 // a vector value for the predicated instruction exists at this point it means 2148 // the instruction has vector users only, and a phi for the vector value is 2149 // needed. In this case the recipe of the predicated instruction is marked to 2150 // also do that packing, thereby "hoisting" the insert-element sequence. 2151 // Otherwise, a phi node for the scalar value is needed. 2152 unsigned Part = State.Instance->Part; 2153 if (State.hasVectorValue(getOperand(0), Part)) { 2154 Value *VectorValue = State.get(getOperand(0), Part); 2155 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2156 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2157 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2158 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2159 if (State.hasVectorValue(this, Part)) 2160 State.reset(this, VPhi, Part); 2161 else 2162 State.set(this, VPhi, Part); 2163 // NOTE: Currently we need to update the value of the operand, so the next 2164 // predicated iteration inserts its generated value in the correct vector. 2165 State.reset(getOperand(0), VPhi, Part); 2166 } else { 2167 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2168 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2169 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2170 PredicatingBB); 2171 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2172 if (State.hasScalarValue(this, *State.Instance)) 2173 State.reset(this, Phi, *State.Instance); 2174 else 2175 State.set(this, Phi, *State.Instance); 2176 // NOTE: Currently we need to update the value of the operand, so the next 2177 // predicated iteration inserts its generated value in the correct vector. 
2178     State.reset(getOperand(0), Phi, *State.Instance);
2179   }
2180 }
2181 
2182 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2183 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2184                                 VPSlotTracker &SlotTracker) const {
2185   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2186   printAsOperand(O, SlotTracker);
2187   O << " = ";
2188   printOperands(O, SlotTracker);
2189 }
2190 #endif
2191 
2192 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2193                                                  VPCostContext &Ctx) const {
2194   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2195   const Align Alignment =
2196       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2197   unsigned AS =
2198       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2199   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2200 
2201   if (!Consecutive) {
2202     // TODO: Using the original IR may not be accurate.
2203     // Currently, ARM will use the underlying IR to calculate gather/scatter
2204     // instruction cost.
2205     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2206     assert(!Reverse &&
2207            "Inconsecutive memory access should not be reversed.");
2208     return Ctx.TTI.getAddressComputationCost(Ty) +
2209            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2210                                           IsMasked, Alignment, CostKind,
2211                                           &Ingredient);
2212   }
2213 
2214   InstructionCost Cost = 0;
2215   if (IsMasked) {
2216     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2217                                           AS, CostKind);
2218   } else {
2219     TTI::OperandValueInfo OpInfo =
2220         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2221     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2222                                     CostKind, OpInfo, &Ingredient);
2223   }
2224   if (!Reverse)
2225     return Cost;
2226 
2227   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2228                                         cast<VectorType>(Ty), std::nullopt,
2229                                         CostKind, 0);
2230 }
2231 
2232 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2233   auto *LI = cast<LoadInst>(&Ingredient);
2234 
2235   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2236   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2237   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2238   bool CreateGather = !isConsecutive();
2239 
2240   auto &Builder = State.Builder;
2241   State.setDebugLocFrom(getDebugLoc());
2242   for (unsigned Part = 0; Part < State.UF; ++Part) {
2243     Value *NewLI;
2244     Value *Mask = nullptr;
2245     if (auto *VPMask = getMask()) {
2246       // Mask reversal is only needed when an actual mask is present; an
2247       // absent (null) mask stands for all-ones, and its reverse is all-ones.
2248       Mask = State.get(VPMask, Part);
2249       if (isReverse())
2250         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2251     }
2252 
2253     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
2254     if (CreateGather) {
2255       NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2256                                          "wide.masked.gather");
2257     } else if (Mask) {
2258       NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2259                                        PoisonValue::get(DataTy),
2260                                        "wide.masked.load");
2261     } else {
2262       NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2263     }
2264     // Add metadata to the load itself; for reversed accesses the value recorded in State is the reverse shuffle below.
2265 State.addMetadata(NewLI, LI); 2266 if (Reverse) 2267 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2268 State.set(this, NewLI, Part); 2269 } 2270 } 2271 2272 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2273 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2274 VPSlotTracker &SlotTracker) const { 2275 O << Indent << "WIDEN "; 2276 printAsOperand(O, SlotTracker); 2277 O << " = load "; 2278 printOperands(O, SlotTracker); 2279 } 2280 #endif 2281 2282 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2283 /// dependence w/o affecting the result. 2284 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2285 Value *EVL, const Twine &Name) { 2286 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2287 Value *AllTrueMask = 2288 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2289 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2290 {Operand, AllTrueMask, EVL}, nullptr, Name); 2291 } 2292 2293 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2294 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 2295 "explicit vector length."); 2296 auto *LI = cast<LoadInst>(&Ingredient); 2297 2298 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2299 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2300 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2301 bool CreateGather = !isConsecutive(); 2302 2303 auto &Builder = State.Builder; 2304 State.setDebugLocFrom(getDebugLoc()); 2305 CallInst *NewLI; 2306 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2307 Value *Addr = State.get(getAddr(), 0, !CreateGather); 2308 Value *Mask = nullptr; 2309 if (VPValue *VPMask = getMask()) { 2310 Mask = State.get(VPMask, 0); 2311 if (isReverse()) 2312 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2313 } else { 2314 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2315 } 2316 2317 if (CreateGather) { 2318 NewLI = 2319 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2320 nullptr, "wide.masked.gather"); 2321 } else { 2322 VectorBuilder VBuilder(Builder); 2323 VBuilder.setEVL(EVL).setMask(Mask); 2324 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2325 Instruction::Load, DataTy, Addr, "vp.op.load")); 2326 } 2327 NewLI->addParamAttr( 2328 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2329 State.addMetadata(NewLI, LI); 2330 Instruction *Res = NewLI; 2331 if (isReverse()) 2332 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2333 State.set(this, Res, 0); 2334 } 2335 2336 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2337 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2338 VPSlotTracker &SlotTracker) const { 2339 O << Indent << "WIDEN "; 2340 printAsOperand(O, SlotTracker); 2341 O << " = vp.load "; 2342 printOperands(O, SlotTracker); 2343 } 2344 #endif 2345 2346 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2347 auto *SI = cast<StoreInst>(&Ingredient); 2348 2349 VPValue *StoredVPValue = getStoredValue(); 2350 bool CreateScatter = !isConsecutive(); 2351 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2352 2353 auto &Builder = State.Builder; 2354 State.setDebugLocFrom(getDebugLoc()); 2355 2356 for (unsigned Part = 0; Part < State.UF; ++Part) { 2357 Instruction *NewSI = nullptr; 2358 Value *Mask = nullptr; 2359 if (auto *VPMask = getMask()) { 2360 // Mask reversal is only needed for non-all-one 
(null) masks, as reverse 2361 // of a null all-one mask is a null mask. 2362 Mask = State.get(VPMask, Part); 2363 if (isReverse()) 2364 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2365 } 2366 2367 Value *StoredVal = State.get(StoredVPValue, Part); 2368 if (isReverse()) { 2369 // If we store to reverse consecutive memory locations, then we need 2370 // to reverse the order of elements in the stored value. 2371 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2372 // We don't want to update the value in the map as it might be used in 2373 // another expression. So don't call resetVectorValue(StoredVal). 2374 } 2375 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); 2376 if (CreateScatter) 2377 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2378 else if (Mask) 2379 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2380 else 2381 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2382 State.addMetadata(NewSI, SI); 2383 } 2384 } 2385 2386 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2387 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2388 VPSlotTracker &SlotTracker) const { 2389 O << Indent << "WIDEN store "; 2390 printOperands(O, SlotTracker); 2391 } 2392 #endif 2393 2394 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2395 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 2396 "explicit vector length."); 2397 auto *SI = cast<StoreInst>(&Ingredient); 2398 2399 VPValue *StoredValue = getStoredValue(); 2400 bool CreateScatter = !isConsecutive(); 2401 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2402 2403 auto &Builder = State.Builder; 2404 State.setDebugLocFrom(getDebugLoc()); 2405 2406 CallInst *NewSI = nullptr; 2407 Value *StoredVal = State.get(StoredValue, 0); 2408 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 2409 if (isReverse()) 2410 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2411 Value *Mask = nullptr; 2412 if (VPValue *VPMask = getMask()) { 2413 Mask = State.get(VPMask, 0); 2414 if (isReverse()) 2415 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2416 } else { 2417 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2418 } 2419 Value *Addr = State.get(getAddr(), 0, !CreateScatter); 2420 if (CreateScatter) { 2421 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2422 Intrinsic::vp_scatter, 2423 {StoredVal, Addr, Mask, EVL}); 2424 } else { 2425 VectorBuilder VBuilder(Builder); 2426 VBuilder.setEVL(EVL).setMask(Mask); 2427 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2428 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2429 {StoredVal, Addr})); 2430 } 2431 NewSI->addParamAttr( 2432 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2433 State.addMetadata(NewSI, SI); 2434 } 2435 2436 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2437 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2438 VPSlotTracker &SlotTracker) const { 2439 O << Indent << "WIDEN vp.store "; 2440 printOperands(O, SlotTracker); 2441 } 2442 #endif 2443 2444 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2445 VectorType *DstVTy, const DataLayout &DL) { 2446 // Verify that V is a vector type with same number of elements as DstVTy. 
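  // (Illustrative example: with 64-bit pointers, casting <4 x double> to
  //  <4 x ptr> cannot be done with a single bitcast, so the code below goes
  //  through <4 x i64> as the intermediate integer vector.)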
2447 auto VF = DstVTy->getElementCount(); 2448 auto *SrcVecTy = cast<VectorType>(V->getType()); 2449 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2450 Type *SrcElemTy = SrcVecTy->getElementType(); 2451 Type *DstElemTy = DstVTy->getElementType(); 2452 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2453 "Vector elements must have same size"); 2454 2455 // Do a direct cast if element types are castable. 2456 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2457 return Builder.CreateBitOrPointerCast(V, DstVTy); 2458 } 2459 // V cannot be directly casted to desired vector type. 2460 // May happen when V is a floating point vector but DstVTy is a vector of 2461 // pointers or vice-versa. Handle this using a two-step bitcast using an 2462 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2463 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2464 "Only one type should be a pointer type"); 2465 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2466 "Only one type should be a floating point type"); 2467 Type *IntTy = 2468 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2469 auto *VecIntTy = VectorType::get(IntTy, VF); 2470 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2471 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2472 } 2473 2474 /// Return a vector containing interleaved elements from multiple 2475 /// smaller input vectors. 2476 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2477 const Twine &Name) { 2478 unsigned Factor = Vals.size(); 2479 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2480 2481 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2482 #ifndef NDEBUG 2483 for (Value *Val : Vals) 2484 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2485 #endif 2486 2487 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2488 // must use intrinsics to interleave. 2489 if (VecTy->isScalableTy()) { 2490 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2491 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2492 Vals, 2493 /*FMFSource=*/nullptr, Name); 2494 } 2495 2496 // Fixed length. Start by concatenating all vectors into a wide vector. 2497 Value *WideVec = concatenateVectors(Builder, Vals); 2498 2499 // Interleave the elements into the wide vector. 2500 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2501 return Builder.CreateShuffleVector( 2502 WideVec, createInterleaveMask(NumElts, Factor), Name); 2503 } 2504 2505 // Try to vectorize the interleave group that \p Instr belongs to. 2506 // 2507 // E.g. Translate following interleaved load group (factor = 3): 2508 // for (i = 0; i < N; i+=3) { 2509 // R = Pic[i]; // Member of index 0 2510 // G = Pic[i+1]; // Member of index 1 2511 // B = Pic[i+2]; // Member of index 2 2512 // ... // do something to R, G, B 2513 // } 2514 // To: 2515 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2516 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2517 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2518 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2519 // 2520 // Or translate following interleaved store group (factor = 3): 2521 // for (i = 0; i < N; i+=3) { 2522 // ... 
do something to R, G, B 2523 // Pic[i] = R; // Member of index 0 2524 // Pic[i+1] = G; // Member of index 1 2525 // Pic[i+2] = B; // Member of index 2 2526 // } 2527 // To: 2528 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2529 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2530 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2531 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2532 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2533 void VPInterleaveRecipe::execute(VPTransformState &State) { 2534 assert(!State.Instance && "Interleave group being replicated."); 2535 const InterleaveGroup<Instruction> *Group = IG; 2536 Instruction *Instr = Group->getInsertPos(); 2537 2538 // Prepare for the vector type of the interleaved load/store. 2539 Type *ScalarTy = getLoadStoreType(Instr); 2540 unsigned InterleaveFactor = Group->getFactor(); 2541 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2542 2543 // Prepare for the new pointers. 2544 SmallVector<Value *, 2> AddrParts; 2545 unsigned Index = Group->getIndex(Instr); 2546 2547 // TODO: extend the masked interleaved-group support to reversed access. 2548 VPValue *BlockInMask = getMask(); 2549 assert((!BlockInMask || !Group->isReverse()) && 2550 "Reversed masked interleave-group not supported."); 2551 2552 Value *Idx; 2553 // If the group is reverse, adjust the index to refer to the last vector lane 2554 // instead of the first. We adjust the index from the first vector lane, 2555 // rather than directly getting the pointer for lane VF - 1, because the 2556 // pointer operand of the interleaved access is supposed to be uniform. For 2557 // uniform instructions, we're only required to generate a value for the 2558 // first vector lane in each unroll iteration. 2559 if (Group->isReverse()) { 2560 Value *RuntimeVF = 2561 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2562 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2563 Idx = State.Builder.CreateMul(Idx, 2564 State.Builder.getInt32(Group->getFactor())); 2565 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); 2566 Idx = State.Builder.CreateNeg(Idx); 2567 } else 2568 Idx = State.Builder.getInt32(-Index); 2569 2570 VPValue *Addr = getAddr(); 2571 for (unsigned Part = 0; Part < State.UF; Part++) { 2572 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2573 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2574 State.setDebugLocFrom(I->getDebugLoc()); 2575 2576 // Notice current instruction could be any index. Need to adjust the address 2577 // to the member of index 0. 2578 // 2579 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2580 // b = A[i]; // Member of index 0 2581 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2582 // 2583 // E.g. A[i+1] = a; // Member of index 1 2584 // A[i] = b; // Member of index 0 2585 // A[i+2] = c; // Member of index 2 (Current instruction) 2586 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
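    // The adjustment below is a single GEP by Idx, which is -Index for forward
    // groups and, for reversed groups, additionally offset so the pointer
    // refers to the last vector lane (see the computation of Idx above).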
2587 2588 bool InBounds = false; 2589 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2590 InBounds = gep->isInBounds(); 2591 AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2592 AddrParts.push_back(AddrPart); 2593 } 2594 2595 State.setDebugLocFrom(Instr->getDebugLoc()); 2596 Value *PoisonVec = PoisonValue::get(VecTy); 2597 2598 auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor]( 2599 unsigned Part, Value *MaskForGaps) -> Value * { 2600 if (State.VF.isScalable()) { 2601 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2602 assert(InterleaveFactor == 2 && 2603 "Unsupported deinterleave factor for scalable vectors"); 2604 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2605 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2606 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2607 State.VF.getKnownMinValue() * 2, true); 2608 return State.Builder.CreateIntrinsic( 2609 MaskTy, Intrinsic::vector_interleave2, Ops, 2610 /*FMFSource=*/nullptr, "interleaved.mask"); 2611 } 2612 2613 if (!BlockInMask) 2614 return MaskForGaps; 2615 2616 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2617 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2618 BlockInMaskPart, 2619 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2620 "interleaved.mask"); 2621 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2622 ShuffledMask, MaskForGaps) 2623 : ShuffledMask; 2624 }; 2625 2626 const DataLayout &DL = Instr->getDataLayout(); 2627 // Vectorize the interleaved load group. 2628 if (isa<LoadInst>(Instr)) { 2629 Value *MaskForGaps = nullptr; 2630 if (NeedsMaskForGaps) { 2631 MaskForGaps = createBitMaskForGaps(State.Builder, 2632 State.VF.getKnownMinValue(), *Group); 2633 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2634 } 2635 2636 // For each unroll part, create a wide load for the group. 2637 SmallVector<Value *, 2> NewLoads; 2638 for (unsigned Part = 0; Part < State.UF; Part++) { 2639 Instruction *NewLoad; 2640 if (BlockInMask || MaskForGaps) { 2641 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2642 NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part], 2643 Group->getAlign(), GroupMask, 2644 PoisonVec, "wide.masked.vec"); 2645 } else 2646 NewLoad = State.Builder.CreateAlignedLoad( 2647 VecTy, AddrParts[Part], Group->getAlign(), "wide.vec"); 2648 Group->addMetadata(NewLoad); 2649 NewLoads.push_back(NewLoad); 2650 } 2651 2652 ArrayRef<VPValue *> VPDefs = definedValues(); 2653 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2654 if (VecTy->isScalableTy()) { 2655 assert(InterleaveFactor == 2 && 2656 "Unsupported deinterleave factor for scalable vectors"); 2657 2658 for (unsigned Part = 0; Part < State.UF; ++Part) { 2659 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2660 // so must use intrinsics to deinterleave. 2661 Value *DI = State.Builder.CreateIntrinsic( 2662 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part], 2663 /*FMFSource=*/nullptr, "strided.vec"); 2664 unsigned J = 0; 2665 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2666 Instruction *Member = Group->getMember(I); 2667 2668 if (!Member) 2669 continue; 2670 2671 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2672 // If this member has different type, cast the result type. 
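          // (This can happen e.g. for a group mixing i32 and float members of
          //  the same bit width; createBitOrPointerCast handles the cast.)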
2673 if (Member->getType() != ScalarTy) { 2674 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2675 StridedVec = 2676 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2677 } 2678 2679 if (Group->isReverse()) 2680 StridedVec = 2681 State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2682 2683 State.set(VPDefs[J], StridedVec, Part); 2684 ++J; 2685 } 2686 } 2687 2688 return; 2689 } 2690 2691 // For each member in the group, shuffle out the appropriate data from the 2692 // wide loads. 2693 unsigned J = 0; 2694 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2695 Instruction *Member = Group->getMember(I); 2696 2697 // Skip the gaps in the group. 2698 if (!Member) 2699 continue; 2700 2701 auto StrideMask = 2702 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 2703 for (unsigned Part = 0; Part < State.UF; Part++) { 2704 Value *StridedVec = State.Builder.CreateShuffleVector( 2705 NewLoads[Part], StrideMask, "strided.vec"); 2706 2707 // If this member has different type, cast the result type. 2708 if (Member->getType() != ScalarTy) { 2709 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 2710 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2711 StridedVec = 2712 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2713 } 2714 2715 if (Group->isReverse()) 2716 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2717 2718 State.set(VPDefs[J], StridedVec, Part); 2719 } 2720 ++J; 2721 } 2722 return; 2723 } 2724 2725 // The sub vector type for current instruction. 2726 auto *SubVT = VectorType::get(ScalarTy, State.VF); 2727 2728 // Vectorize the interleaved store group. 2729 Value *MaskForGaps = 2730 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 2731 assert((!MaskForGaps || !State.VF.isScalable()) && 2732 "masking gaps for scalable vectors is not yet supported."); 2733 ArrayRef<VPValue *> StoredValues = getStoredValues(); 2734 for (unsigned Part = 0; Part < State.UF; Part++) { 2735 // Collect the stored vector from each member. 2736 SmallVector<Value *, 4> StoredVecs; 2737 unsigned StoredIdx = 0; 2738 for (unsigned i = 0; i < InterleaveFactor; i++) { 2739 assert((Group->getMember(i) || MaskForGaps) && 2740 "Fail to get a member from an interleaved store group"); 2741 Instruction *Member = Group->getMember(i); 2742 2743 // Skip the gaps in the group. 2744 if (!Member) { 2745 Value *Undef = PoisonValue::get(SubVT); 2746 StoredVecs.push_back(Undef); 2747 continue; 2748 } 2749 2750 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2751 ++StoredIdx; 2752 2753 if (Group->isReverse()) 2754 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 2755 2756 // If this member has different type, cast it to a unified type. 2757 2758 if (StoredVec->getType() != SubVT) 2759 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 2760 2761 StoredVecs.push_back(StoredVec); 2762 } 2763 2764 // Interleave all the smaller vectors into one wider vector. 
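    // E.g. for factor 3 and VF=4, the vectors <R0..R3>, <G0..G3>, <B0..B3>
    // become <R0,G0,B0,R1,G1,B1,...>, as in the store example in the comment
    // above this function.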
2765 Value *IVec = 2766 interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 2767 Instruction *NewStoreInstr; 2768 if (BlockInMask || MaskForGaps) { 2769 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2770 NewStoreInstr = State.Builder.CreateMaskedStore( 2771 IVec, AddrParts[Part], Group->getAlign(), GroupMask); 2772 } else 2773 NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part], 2774 Group->getAlign()); 2775 2776 Group->addMetadata(NewStoreInstr); 2777 } 2778 } 2779 2780 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2781 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 2782 VPSlotTracker &SlotTracker) const { 2783 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 2784 IG->getInsertPos()->printAsOperand(O, false); 2785 O << ", "; 2786 getAddr()->printAsOperand(O, SlotTracker); 2787 VPValue *Mask = getMask(); 2788 if (Mask) { 2789 O << ", "; 2790 Mask->printAsOperand(O, SlotTracker); 2791 } 2792 2793 unsigned OpIdx = 0; 2794 for (unsigned i = 0; i < IG->getFactor(); ++i) { 2795 if (!IG->getMember(i)) 2796 continue; 2797 if (getNumStoreOperands() > 0) { 2798 O << "\n" << Indent << " store "; 2799 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 2800 O << " to index " << i; 2801 } else { 2802 O << "\n" << Indent << " "; 2803 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 2804 O << " = load from index " << i; 2805 } 2806 ++OpIdx; 2807 } 2808 } 2809 #endif 2810 2811 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 2812 Value *Start = getStartValue()->getLiveInIRValue(); 2813 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); 2814 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2815 2816 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2817 EntryPart->addIncoming(Start, VectorPH); 2818 EntryPart->setDebugLoc(getDebugLoc()); 2819 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2820 State.set(this, EntryPart, Part, /*IsScalar*/ true); 2821 } 2822 2823 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2824 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2825 VPSlotTracker &SlotTracker) const { 2826 O << Indent << "EMIT "; 2827 printAsOperand(O, SlotTracker); 2828 O << " = CANONICAL-INDUCTION "; 2829 printOperands(O, SlotTracker); 2830 } 2831 #endif 2832 2833 bool VPCanonicalIVPHIRecipe::isCanonical( 2834 InductionDescriptor::InductionKind Kind, VPValue *Start, 2835 VPValue *Step) const { 2836 // Must be an integer induction. 2837 if (Kind != InductionDescriptor::IK_IntInduction) 2838 return false; 2839 // Start must match the start value of this canonical induction. 2840 if (Start != getStartValue()) 2841 return false; 2842 2843 // If the step is defined by a recipe, it is not a ConstantInt. 
2844 if (Step->getDefiningRecipe()) 2845 return false; 2846 2847 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 2848 return StepC && StepC->isOne(); 2849 } 2850 2851 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 2852 return IsScalarAfterVectorization && 2853 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 2854 } 2855 2856 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 2857 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 2858 "Not a pointer induction according to InductionDescriptor!"); 2859 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 2860 "Unexpected type."); 2861 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 2862 "Recipe should have been replaced"); 2863 2864 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 2865 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true)); 2866 Type *PhiType = IndDesc.getStep()->getType(); 2867 2868 // Build a pointer phi 2869 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 2870 Type *ScStValueType = ScalarStartValue->getType(); 2871 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 2872 CanonicalIV->getIterator()); 2873 2874 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2875 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 2876 2877 // A pointer induction, performed by using a gep 2878 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 2879 2880 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 2881 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 2882 Value *NumUnrolledElems = 2883 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 2884 Value *InductionGEP = GetElementPtrInst::Create( 2885 State.Builder.getInt8Ty(), NewPointerPhi, 2886 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 2887 InductionLoc); 2888 // Add induction update using an incorrect block temporarily. The phi node 2889 // will be fixed after VPlan execution. Note that at this point the latch 2890 // block cannot be used, as it does not exist yet. 2891 // TODO: Model increment value in VPlan, by turning the recipe into a 2892 // multi-def and a subclass of VPHeaderPHIRecipe. 2893 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 2894 2895 // Create UF many actual address geps that use the pointer 2896 // phi as base and a vectorized version of the step value 2897 // (<step*0, ..., step*N>) as offset. 2898 for (unsigned Part = 0; Part < State.UF; ++Part) { 2899 Type *VecPhiType = VectorType::get(PhiType, State.VF); 2900 Value *StartOffsetScalar = 2901 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 2902 Value *StartOffset = 2903 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 2904 // Create a vector of consecutive numbers from zero to VF. 
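    // (i.e. StartOffset becomes <Part*VF+0, ..., Part*VF+VF-1>, which is then
    // scaled by the scalar step value in the GEP below.)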
2905 StartOffset = State.Builder.CreateAdd( 2906 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 2907 2908 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 2909 "scalar step must be the same across all parts"); 2910 Value *GEP = State.Builder.CreateGEP( 2911 State.Builder.getInt8Ty(), NewPointerPhi, 2912 State.Builder.CreateMul( 2913 StartOffset, 2914 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 2915 "vector.gep")); 2916 State.set(this, GEP, Part); 2917 } 2918 } 2919 2920 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2921 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2922 VPSlotTracker &SlotTracker) const { 2923 O << Indent << "EMIT "; 2924 printAsOperand(O, SlotTracker); 2925 O << " = WIDEN-POINTER-INDUCTION "; 2926 getStartValue()->printAsOperand(O, SlotTracker); 2927 O << ", " << *IndDesc.getStep(); 2928 } 2929 #endif 2930 2931 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 2932 assert(!State.Instance && "cannot be used in per-lane"); 2933 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2934 SCEVExpander Exp(SE, DL, "induction"); 2935 2936 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 2937 &*State.Builder.GetInsertPoint()); 2938 assert(!State.ExpandedSCEVs.contains(Expr) && 2939 "Same SCEV expanded multiple times"); 2940 State.ExpandedSCEVs[Expr] = Res; 2941 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2942 State.set(this, Res, {Part, 0}); 2943 } 2944 2945 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2946 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 2947 VPSlotTracker &SlotTracker) const { 2948 O << Indent << "EMIT "; 2949 getVPSingleValue()->printAsOperand(O, SlotTracker); 2950 O << " = EXPAND SCEV " << *Expr; 2951 } 2952 #endif 2953 2954 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 2955 Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); 2956 Type *STy = CanonicalIV->getType(); 2957 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 2958 ElementCount VF = State.VF; 2959 Value *VStart = VF.isScalar() 2960 ? CanonicalIV 2961 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 2962 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 2963 Value *VStep = createStepForVF(Builder, STy, VF, Part); 2964 if (VF.isVector()) { 2965 VStep = Builder.CreateVectorSplat(VF, VStep); 2966 VStep = 2967 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 2968 } 2969 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 2970 State.set(this, CanonicalVectorIV, Part); 2971 } 2972 } 2973 2974 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2975 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 2976 VPSlotTracker &SlotTracker) const { 2977 O << Indent << "EMIT "; 2978 printAsOperand(O, SlotTracker); 2979 O << " = WIDEN-CANONICAL-INDUCTION "; 2980 printOperands(O, SlotTracker); 2981 } 2982 #endif 2983 2984 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 2985 auto &Builder = State.Builder; 2986 // Create a vector from the initial value. 2987 auto *VectorInit = getStartValue()->getLiveInIRValue(); 2988 2989 Type *VecTy = State.VF.isScalar() 2990 ? 
VectorInit->getType() 2991 : VectorType::get(VectorInit->getType(), State.VF); 2992 2993 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2994 if (State.VF.isVector()) { 2995 auto *IdxTy = Builder.getInt32Ty(); 2996 auto *One = ConstantInt::get(IdxTy, 1); 2997 IRBuilder<>::InsertPointGuard Guard(Builder); 2998 Builder.SetInsertPoint(VectorPH->getTerminator()); 2999 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 3000 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3001 VectorInit = Builder.CreateInsertElement( 3002 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 3003 } 3004 3005 // Create a phi node for the new recurrence. 3006 PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); 3007 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 3008 EntryPart->addIncoming(VectorInit, VectorPH); 3009 State.set(this, EntryPart, 0); 3010 } 3011 3012 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3013 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 3014 VPSlotTracker &SlotTracker) const { 3015 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 3016 printAsOperand(O, SlotTracker); 3017 O << " = phi "; 3018 printOperands(O, SlotTracker); 3019 } 3020 #endif 3021 3022 void VPReductionPHIRecipe::execute(VPTransformState &State) { 3023 auto &Builder = State.Builder; 3024 3025 // Reductions do not have to start at zero. They can start with 3026 // any loop invariant values. 3027 VPValue *StartVPV = getStartValue(); 3028 Value *StartV = StartVPV->getLiveInIRValue(); 3029 3030 // In order to support recurrences we need to be able to vectorize Phi nodes. 3031 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3032 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3033 // this value when we vectorize all of the instructions that use the PHI. 3034 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 3035 Type *VecTy = ScalarPHI ? StartV->getType() 3036 : VectorType::get(StartV->getType(), State.VF); 3037 3038 BasicBlock *HeaderBB = State.CFG.PrevBB; 3039 assert(State.CurrentVectorLoop->getHeader() == HeaderBB && 3040 "recipe must be in the vector loop header"); 3041 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; 3042 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { 3043 Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); 3044 EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); 3045 State.set(this, EntryPart, Part, IsInLoop); 3046 } 3047 3048 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 3049 3050 Value *Iden = nullptr; 3051 RecurKind RK = RdxDesc.getRecurrenceKind(); 3052 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 3053 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 3054 // MinMax and AnyOf reductions have the start value as their identity. 
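    // For vector PHIs the start value is splatted below, so every lane already
    // holds the correct initial value; all other recurrence kinds use the
    // neutral element instead and insert the start value into lane 0 (see the
    // else-branch further down).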
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      // Splat the identity and insert the start value into lane 0, so the
      // remaining lanes do not affect the reduction result.
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  // Only the phi node is created here; its incoming values are filled in
  // later.
  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    // Only the start mask is added here; the backedge value is set after the
    // loop body has been generated.
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  // As above, only the start value is added here; the backedge value is set
  // after the loop body has been generated.
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
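
// Illustrative sketch only, not IR emitted verbatim by this file: with
// EVL-based tail folding and UF == 1, the "evl.based.iv" phi created above
// typically ends up in IR of roughly the following shape. The value names
// %avl, %trip.count, %evl.zext and %index.evl.next are placeholders, the
// vectorization factor of 4 is an example, and the backedge value is only
// added once the IV increment has been emitted:
//
//   vector.body:
//     %evl.based.iv = phi i64 [ 0, %vector.ph ],
//                             [ %index.evl.next, %vector.body ]
//     %avl = sub i64 %trip.count, %evl.based.iv
//     %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl,
//                                                              i32 4, i1 true)
//     ...
//     %evl.zext = zext i32 %evl to i64
//     %index.evl.next = add i64 %evl.based.iv, %evl.zext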