//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInstructionSC:
    if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
      return false;
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPLane(0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only computes the cost for recipes with an
  // underlying instruction. Removing the WidenMem case here would prevent
  // force-target-instruction-cost from overriding the cost of recipes with an
  // underlying instruction, which would be inconsistent with the legacy model.
  // TODO: Remove WidenMem from this function when we don't need to compare to
  // the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost of the recipe by falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing its cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPLane &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0));
    Value *Op1 = State.get(getOperand(1));
    Value *Op2 = State.get(getOperand(2));
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = State.Builder.CreateIntrinsic(
        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, State.Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPLane(0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    Type *PhiTy = OrigPhi->getType();
    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
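    // For these opcodes only the first lane of Op is demanded if the
    // VPInstruction itself only has its first lane used.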
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
         "Only PHINodes can have extra operands");
  if (getNumOperands() == 1) {
    VPValue *ExitValue = getOperand(0);
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPLane(Lane));
    auto *Phi = cast<PHINode>(&I);
    Phi->addIncoming(V, PredBB);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;

  if (getNumOperands() != 0) {
    assert(getNumOperands() == 1 && "can have at most 1 operand");
    O << " (extra operand: ";
    printOperands(O, SlotTracker);
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (UseIntrinsic &&
      isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
    TysForDecl.push_back(VectorType::get(
        CalledScalarFn->getReturnType()->getScalarType(), State.VF));
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(arg_operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (UseIntrinsic &&
        isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
      Arg = State.get(I.value(), VPLane(0));
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value());
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
      TysForDecl.push_back(Arg->getType());
    Args.push_back(Arg);
  }

  Function *VectorF;
  if (UseIntrinsic) {
    // Use vector version of the intrinsic.
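    // The overloaded types collected in TysForDecl above (return and/or
    // argument types) select the concrete declaration of the intrinsic.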
    Module *M = State.Builder.GetInsertBlock()->getModule();
    VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
    assert(VectorF && "Can't retrieve vector intrinsic.");
  } else {
#ifndef NDEBUG
    assert(Variant != nullptr && "Can't create vector function.");
#endif
    VectorF = Variant;
  }

  auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

  if (isa<FPMathOperator>(V))
    V->copyFastMathFlags(CI);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}
#endif

void VPHistogramRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(getOperand(0));
  Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
                                {VTy, IncAmt->getType()},
                                {Address, IncAmt, Mask});
}

InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
  VPValue *IncAmt = getOperand(1);
  Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
  VectorType *VTy = VectorType::get(IncTy, VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
  if (IncAmt->isLiveIn()) {
    ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());

    if (CI && CI->getZExtValue() == 1)
      MulCost = TTI::TCC_Free;
  }

  // Find the cost of the histogram operation itself.
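  // The operation is costed as a call to the histogram intrinsic, taking a
  // vector of bucket pointers, the increment and an i1 mask vector.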
  Type *PtrTy = VectorType::get(AddressTy, VF);
  Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(
             ICA, TargetTransformInfo::TCK_RecipThroughput) +
         MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;

  Value *Cond = InvarCond ? InvarCond : State.get(getCond());
  Value *Op0 = State.get(getOperand(1));
  Value *Op1 = State.get(getOperand(2));
  Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
  State.set(this, Sel);
  State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(State.get(VPOp));

    Value *V = Builder.CreateNAryOp(Opcode, Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V))
      setFlags(VecOp);

    // Use this vector value for all users of the original instruction.
    State.set(this, V);
    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));

    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
      if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
        Builder.setFastMathFlags(I->getFastMathFlags());
      C = Builder.CreateFCmp(getPredicate(), A, B);
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    State.set(this, C);
    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
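    // Operands defined outside the loop region are the same for every vector
    // lane and can therefore be costed as uniform values.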
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind,
                                      {TTI::OK_AnyValue, TTI::OP_None},
                                      {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());

  assert(State.get(getOperand(0))->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept FMF flags.
  // TODO: Enable other flags when support is added.
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp." << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at \p StartIdx.
/// \p BinOp is relevant for FP induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
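  // For floating-point inductions the step vector is first materialized as
  // integers and converted to FP via uitofp below.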
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPLane(0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  Value *SplatVF;
  if (VPValue *SplatVFOperand = getSplatVFValue()) {
    // The recipe has been unrolled. In that case, fetch the splat value for the
    // induction increment.
    SplatVF = State.get(SplatVFOperand);
  } else {
    // Multiply the vectorization factor by the step using integer or
    // floating-point arithmetic as appropriate.
    Type *StepType = Step->getType();
    Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
    if (Step->getType()->isFloatingPointTy())
      RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
    else
      RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

    // Create a vector splat to use in the induction update.
    //
    // FIXME: If the step is non-constant, we create the vector splat with
    //        IRBuilder. IRBuilder can constant-fold the multiply, but it
    //        doesn't handle a constant vector splat.
    SplatVF = isa<Constant>(Mul)
                  ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                  : Builder.CreateVectorSplat(State.VF, Mul);
  }

  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  State.set(this, VecInd);

  Instruction *LastInduction = cast<Instruction>(
      Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
  if (isa<TruncInst>(EntryVal))
    State.addMetadata(LastInduction, EntryVal);
  LastInduction->setDebugLoc(EntryVal->getDebugLoc());

  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
1603 // TODO: Model increment value in VPlan, by turning the recipe into a 1604 // multi-def and a subclass of VPHeaderPHIRecipe. 1605 VecInd->addIncoming(LastInduction, VectorPH); 1606 } 1607 1608 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1609 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1610 VPSlotTracker &SlotTracker) const { 1611 O << Indent << "WIDEN-INDUCTION"; 1612 if (getTruncInst()) { 1613 O << "\\l\""; 1614 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 1615 O << " +\n" << Indent << "\" "; 1616 getVPValue(0)->printAsOperand(O, SlotTracker); 1617 } else 1618 O << " " << VPlanIngredient(IV); 1619 1620 O << ", "; 1621 getStepValue()->printAsOperand(O, SlotTracker); 1622 1623 O << ", "; 1624 getVFValue()->printAsOperand(O, SlotTracker); 1625 } 1626 #endif 1627 1628 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1629 // The step may be defined by a recipe in the preheader (e.g. if it requires 1630 // SCEV expansion), but for the canonical induction the step is required to be 1631 // 1, which is represented as live-in. 1632 if (getStepValue()->getDefiningRecipe()) 1633 return false; 1634 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1635 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1636 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1637 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1638 getScalarType() == CanIV->getScalarType(); 1639 } 1640 1641 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1642 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1643 VPSlotTracker &SlotTracker) const { 1644 O << Indent; 1645 printAsOperand(O, SlotTracker); 1646 O << " = DERIVED-IV "; 1647 getStartValue()->printAsOperand(O, SlotTracker); 1648 O << " + "; 1649 getOperand(1)->printAsOperand(O, SlotTracker); 1650 O << " * "; 1651 getStepValue()->printAsOperand(O, SlotTracker); 1652 } 1653 #endif 1654 1655 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1656 // Fast-math-flags propagate from the original induction instruction. 1657 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1658 if (hasFastMathFlags()) 1659 State.Builder.setFastMathFlags(getFastMathFlags()); 1660 1661 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1662 /// variable on which to base the steps, \p Step is the size of the step. 1663 1664 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 1665 Value *Step = State.get(getStepValue(), VPLane(0)); 1666 IRBuilderBase &Builder = State.Builder; 1667 1668 // Ensure step has the same type as that of scalar IV. 1669 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1670 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1671 1672 // We build scalar steps for both integer and floating-point induction 1673 // variables. Here, we determine the kind of arithmetic we will perform. 1674 Instruction::BinaryOps AddOp; 1675 Instruction::BinaryOps MulOp; 1676 if (BaseIVTy->isIntegerTy()) { 1677 AddOp = Instruction::Add; 1678 MulOp = Instruction::Mul; 1679 } else { 1680 AddOp = InductionOpcode; 1681 MulOp = Instruction::FMul; 1682 } 1683 1684 // Determine the number of scalars we need to generate for each unroll 1685 // iteration. 1686 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1687 // Compute the scalar steps and save the results in State. 
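  // Illustrative sketch of what is generated below: with unroll part P,
  // step S and base IV B, lane L receives the scalar value
  //   B + (P * VF + L) * S
  // and, for scalable VF (when more than the first lane is used), a full
  // per-part vector is also formed as
  //   splat(B) + (splat(P * VF) + stepvector) * splat(S).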
1688 Type *IntStepTy = 1689 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1690 Type *VecIVTy = nullptr; 1691 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1692 if (!FirstLaneOnly && State.VF.isScalable()) { 1693 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1694 UnitStepVec = 1695 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1696 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1697 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1698 } 1699 1700 unsigned StartLane = 0; 1701 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1702 if (State.Lane) { 1703 StartLane = State.Lane->getKnownLane(); 1704 EndLane = StartLane + 1; 1705 } 1706 Value *StartIdx0 = 1707 createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); 1708 1709 if (!FirstLaneOnly && State.VF.isScalable()) { 1710 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1711 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1712 if (BaseIVTy->isFloatingPointTy()) 1713 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1714 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1715 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1716 State.set(this, Add); 1717 // It's useful to record the lane values too for the known minimum number 1718 // of elements so we do those below. This improves the code quality when 1719 // trying to extract the first element, for example. 1720 } 1721 1722 if (BaseIVTy->isFloatingPointTy()) 1723 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1724 1725 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1726 Value *StartIdx = Builder.CreateBinOp( 1727 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1728 // The step returned by `createStepForVF` is a runtime-evaluated value 1729 // when VF is scalable. Otherwise, it should be folded into a Constant. 1730 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1731 "Expected StartIdx to be folded to a constant when VF is not " 1732 "scalable"); 1733 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1734 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1735 State.set(this, Add, VPLane(Lane)); 1736 } 1737 } 1738 1739 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1740 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1741 VPSlotTracker &SlotTracker) const { 1742 O << Indent; 1743 printAsOperand(O, SlotTracker); 1744 O << " = SCALAR-STEPS "; 1745 printOperands(O, SlotTracker); 1746 } 1747 #endif 1748 1749 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1750 assert(State.VF.isVector() && "not widening"); 1751 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1752 // Construct a vector GEP by widening the operands of the scalar GEP as 1753 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1754 // results in a vector of pointers when at least one operand of the GEP 1755 // is vector-typed. Thus, to keep the representation compact, we only use 1756 // vector-typed operands for loop-varying values. 1757 1758 if (areAllOperandsInvariant()) { 1759 // If we are vectorizing, but the GEP has only loop-invariant operands, 1760 // the GEP we build (by only using vector-typed operands for 1761 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1762 // produce a vector of pointers, we need to either arbitrarily pick an 1763 // operand to broadcast, or broadcast a clone of the original GEP. 
1764 // Here, we broadcast a clone of the original. 1765 // 1766 // TODO: If at some point we decide to scalarize instructions having 1767 // loop-invariant operands, this special case will no longer be 1768 // required. We would add the scalarization decision to 1769 // collectLoopScalars() and teach getVectorValue() to broadcast 1770 // the lane-zero scalar value. 1771 SmallVector<Value *> Ops; 1772 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1773 Ops.push_back(State.get(getOperand(I), VPLane(0))); 1774 1775 auto *NewGEP = 1776 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1777 ArrayRef(Ops).drop_front(), "", isInBounds()); 1778 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1779 State.set(this, Splat); 1780 State.addMetadata(Splat, GEP); 1781 } else { 1782 // If the GEP has at least one loop-varying operand, we are sure to 1783 // produce a vector of pointers unless VF is scalar. 1784 // The pointer operand of the new GEP. If it's loop-invariant, we 1785 // won't broadcast it. 1786 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 1787 : State.get(getOperand(0)); 1788 1789 // Collect all the indices for the new GEP. If any index is 1790 // loop-invariant, we won't broadcast it. 1791 SmallVector<Value *, 4> Indices; 1792 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1793 VPValue *Operand = getOperand(I); 1794 if (isIndexLoopInvariant(I - 1)) 1795 Indices.push_back(State.get(Operand, VPLane(0))); 1796 else 1797 Indices.push_back(State.get(Operand)); 1798 } 1799 1800 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1801 // but it should be a vector, otherwise. 1802 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1803 Indices, "", isInBounds()); 1804 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1805 "NewGEP is not a pointer vector"); 1806 State.set(this, NewGEP); 1807 State.addMetadata(NewGEP, GEP); 1808 } 1809 } 1810 1811 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1812 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1813 VPSlotTracker &SlotTracker) const { 1814 O << Indent << "WIDEN-GEP "; 1815 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1816 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1817 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 1818 1819 O << " "; 1820 printAsOperand(O, SlotTracker); 1821 O << " = getelementptr"; 1822 printFlags(O); 1823 printOperands(O, SlotTracker); 1824 } 1825 #endif 1826 1827 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1828 auto &Builder = State.Builder; 1829 State.setDebugLocFrom(getDebugLoc()); 1830 unsigned CurrentPart = getUnrollPart(*this); 1831 // Use i32 for the gep index type when the value is constant, 1832 // or query DataLayout for a more suitable index type otherwise. 1833 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 1834 Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0) 1835 ? DL.getIndexType(Builder.getPtrTy(0)) 1836 : Builder.getInt32Ty(); 1837 Value *Ptr = State.get(getOperand(0), VPLane(0)); 1838 bool InBounds = isInBounds(); 1839 1840 Value *ResultPtr = nullptr; 1841 if (IsReverse) { 1842 // If the address is consecutive but reversed, then the 1843 // wide store needs to start at the last vector element. 
1844 // RunTimeVF = VScale * VF.getKnownMinValue() 1845 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1846 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1847 // NumElt = -CurrentPart * RunTimeVF 1848 Value *NumElt = Builder.CreateMul( 1849 ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); 1850 // LastLane = 1 - RunTimeVF 1851 Value *LastLane = 1852 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1853 ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1854 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds); 1855 } else { 1856 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 1857 ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1858 } 1859 1860 State.set(this, ResultPtr, /*IsScalar*/ true); 1861 } 1862 1863 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1864 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1865 VPSlotTracker &SlotTracker) const { 1866 O << Indent; 1867 printAsOperand(O, SlotTracker); 1868 O << " = vector-pointer "; 1869 if (IsReverse) 1870 O << "(reverse) "; 1871 1872 printOperands(O, SlotTracker); 1873 } 1874 #endif 1875 1876 void VPBlendRecipe::execute(VPTransformState &State) { 1877 assert(isNormalized() && "Expected blend to be normalized!"); 1878 State.setDebugLocFrom(getDebugLoc()); 1879 // We know that all PHIs in non-header blocks are converted into 1880 // selects, so we don't have to worry about the insertion order and we 1881 // can just use the builder. 1882 // At this point we generate the predication tree. There may be 1883 // duplications since this is a simple recursive scan, but future 1884 // optimizations will clean it up. 1885 1886 unsigned NumIncoming = getNumIncomingValues(); 1887 1888 // Generate a sequence of selects of the form: 1889 // SELECT(Mask3, In3, 1890 // SELECT(Mask2, In2, 1891 // SELECT(Mask1, In1, 1892 // In0))) 1893 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1894 // are essentially undef are taken from In0. 1895 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 1896 Value *Result = nullptr; 1897 for (unsigned In = 0; In < NumIncoming; ++In) { 1898 // We might have single edge PHIs (blocks) - use an identity 1899 // 'select' for the first PHI operand. 1900 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 1901 if (In == 0) 1902 Result = In0; // Initialize with the first incoming value. 1903 else { 1904 // Select between the current value and the previous incoming edge 1905 // based on the incoming mask. 1906 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 1907 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 1908 } 1909 } 1910 State.set(this, Result, OnlyFirstLaneUsed); 1911 } 1912 1913 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1914 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1915 VPSlotTracker &SlotTracker) const { 1916 O << Indent << "BLEND "; 1917 printAsOperand(O, SlotTracker); 1918 O << " ="; 1919 if (getNumIncomingValues() == 1) { 1920 // Not a User of any mask: not really blending, this is a 1921 // single-predecessor phi. 
1922 O << " "; 1923 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1924 } else { 1925 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1926 O << " "; 1927 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1928 if (I == 0) 1929 continue; 1930 O << "/"; 1931 getMask(I)->printAsOperand(O, SlotTracker); 1932 } 1933 } 1934 } 1935 #endif 1936 1937 void VPReductionRecipe::execute(VPTransformState &State) { 1938 assert(!State.Lane && "Reduction being replicated."); 1939 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 1940 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1941 // Propagate the fast-math flags carried by the underlying instruction. 1942 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1943 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1944 Value *NewVecOp = State.get(getVecOp()); 1945 if (VPValue *Cond = getCondOp()) { 1946 Value *NewCond = State.get(Cond, State.VF.isScalar()); 1947 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 1948 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 1949 1950 Value *Start; 1951 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 1952 Start = RdxDesc.getRecurrenceStartValue(); 1953 else 1954 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 1955 RdxDesc.getFastMathFlags()); 1956 if (State.VF.isVector()) 1957 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 1958 1959 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 1960 NewVecOp = Select; 1961 } 1962 Value *NewRed; 1963 Value *NextInChain; 1964 if (IsOrdered) { 1965 if (State.VF.isVector()) 1966 NewRed = 1967 createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); 1968 else 1969 NewRed = State.Builder.CreateBinOp( 1970 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 1971 NewVecOp); 1972 PrevInChain = NewRed; 1973 NextInChain = NewRed; 1974 } else { 1975 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 1976 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 1977 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1978 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 1979 NewRed, PrevInChain); 1980 else 1981 NextInChain = State.Builder.CreateBinOp( 1982 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 1983 } 1984 State.set(this, NextInChain, /*IsScalar*/ true); 1985 } 1986 1987 void VPReductionEVLRecipe::execute(VPTransformState &State) { 1988 assert(!State.Lane && "Reduction being replicated."); 1989 1990 auto &Builder = State.Builder; 1991 // Propagate the fast-math flags carried by the underlying instruction. 1992 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 1993 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1994 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1995 1996 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1997 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 1998 Value *VecOp = State.get(getVecOp()); 1999 Value *EVL = State.get(getEVL(), VPLane(0)); 2000 2001 VectorBuilder VBuilder(Builder); 2002 VBuilder.setEVL(EVL); 2003 Value *Mask; 2004 // TODO: move the all-true mask generation into VectorBuilder. 
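  // Illustrative sketch (unordered integer add reduction, names and exact
  // operands assumed for exposition): the code below emits roughly
  //   %red  = call i32 @llvm.vp.reduce.add(i32 0, <vscale x 4 x i32> %vec,
  //                                        <vscale x 4 x i1> %mask, i32 %evl)
  //   %next = add i32 %red, %prev
  // where %mask is the condition operand if present, or an all-true splat.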
2005 if (VPValue *CondOp = getCondOp()) 2006 Mask = State.get(CondOp); 2007 else 2008 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2009 VBuilder.setMask(Mask); 2010 2011 Value *NewRed; 2012 if (isOrdered()) { 2013 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 2014 } else { 2015 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 2016 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2017 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2018 else 2019 NewRed = Builder.CreateBinOp( 2020 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); 2021 } 2022 State.set(this, NewRed, /*IsScalar*/ true); 2023 } 2024 2025 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2026 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2027 VPSlotTracker &SlotTracker) const { 2028 O << Indent << "REDUCE "; 2029 printAsOperand(O, SlotTracker); 2030 O << " = "; 2031 getChainOp()->printAsOperand(O, SlotTracker); 2032 O << " +"; 2033 if (isa<FPMathOperator>(getUnderlyingInstr())) 2034 O << getUnderlyingInstr()->getFastMathFlags(); 2035 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2036 getVecOp()->printAsOperand(O, SlotTracker); 2037 if (isConditional()) { 2038 O << ", "; 2039 getCondOp()->printAsOperand(O, SlotTracker); 2040 } 2041 O << ")"; 2042 if (RdxDesc.IntermediateStore) 2043 O << " (with final reduction value stored in invariant address sank " 2044 "outside of loop)"; 2045 } 2046 2047 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2048 VPSlotTracker &SlotTracker) const { 2049 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2050 O << Indent << "REDUCE "; 2051 printAsOperand(O, SlotTracker); 2052 O << " = "; 2053 getChainOp()->printAsOperand(O, SlotTracker); 2054 O << " +"; 2055 if (isa<FPMathOperator>(getUnderlyingInstr())) 2056 O << getUnderlyingInstr()->getFastMathFlags(); 2057 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2058 getVecOp()->printAsOperand(O, SlotTracker); 2059 O << ", "; 2060 getEVL()->printAsOperand(O, SlotTracker); 2061 if (isConditional()) { 2062 O << ", "; 2063 getCondOp()->printAsOperand(O, SlotTracker); 2064 } 2065 O << ")"; 2066 if (RdxDesc.IntermediateStore) 2067 O << " (with final reduction value stored in invariant address sank " 2068 "outside of loop)"; 2069 } 2070 #endif 2071 2072 bool VPReplicateRecipe::shouldPack() const { 2073 // Find if the recipe is used by a widened recipe via an intervening 2074 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 2075 return any_of(users(), [](const VPUser *U) { 2076 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2077 return any_of(PredR->users(), [PredR](const VPUser *U) { 2078 return !U->usesScalars(PredR); 2079 }); 2080 return false; 2081 }); 2082 } 2083 2084 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2085 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2086 VPSlotTracker &SlotTracker) const { 2087 O << Indent << (IsUniform ? 
"CLONE " : "REPLICATE "); 2088 2089 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2090 printAsOperand(O, SlotTracker); 2091 O << " = "; 2092 } 2093 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2094 O << "call"; 2095 printFlags(O); 2096 O << "@" << CB->getCalledFunction()->getName() << "("; 2097 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2098 O, [&O, &SlotTracker](VPValue *Op) { 2099 Op->printAsOperand(O, SlotTracker); 2100 }); 2101 O << ")"; 2102 } else { 2103 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2104 printFlags(O); 2105 printOperands(O, SlotTracker); 2106 } 2107 2108 if (shouldPack()) 2109 O << " (S->V)"; 2110 } 2111 #endif 2112 2113 Value *VPScalarCastRecipe ::generate(VPTransformState &State) { 2114 assert(vputils::onlyFirstLaneUsed(this) && 2115 "Codegen only implemented for first lane."); 2116 switch (Opcode) { 2117 case Instruction::SExt: 2118 case Instruction::ZExt: 2119 case Instruction::Trunc: { 2120 // Note: SExt/ZExt not used yet. 2121 Value *Op = State.get(getOperand(0), VPLane(0)); 2122 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2123 } 2124 default: 2125 llvm_unreachable("opcode not implemented yet"); 2126 } 2127 } 2128 2129 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2130 State.set(this, generate(State), VPLane(0)); 2131 } 2132 2133 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2134 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2135 VPSlotTracker &SlotTracker) const { 2136 O << Indent << "SCALAR-CAST "; 2137 printAsOperand(O, SlotTracker); 2138 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2139 printOperands(O, SlotTracker); 2140 O << " to " << *ResultTy; 2141 } 2142 #endif 2143 2144 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2145 assert(State.Lane && "Branch on Mask works only on single instance."); 2146 2147 unsigned Lane = State.Lane->getKnownLane(); 2148 2149 Value *ConditionBit = nullptr; 2150 VPValue *BlockInMask = getMask(); 2151 if (BlockInMask) { 2152 ConditionBit = State.get(BlockInMask); 2153 if (ConditionBit->getType()->isVectorTy()) 2154 ConditionBit = State.Builder.CreateExtractElement( 2155 ConditionBit, State.Builder.getInt32(Lane)); 2156 } else // Block in mask is all-one. 2157 ConditionBit = State.Builder.getTrue(); 2158 2159 // Replace the temporary unreachable terminator with a new conditional branch, 2160 // whose two destinations will be set later when they are created. 
2161 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2162 assert(isa<UnreachableInst>(CurrentTerminator) && 2163 "Expected to replace unreachable terminator with conditional branch."); 2164 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2165 CondBr->setSuccessor(0, nullptr); 2166 ReplaceInstWithInst(CurrentTerminator, CondBr); 2167 } 2168 2169 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2170 assert(State.Lane && "Predicated instruction PHI works per instance."); 2171 Instruction *ScalarPredInst = 2172 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 2173 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2174 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2175 assert(PredicatingBB && "Predicated block has no single predecessor."); 2176 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2177 "operand must be VPReplicateRecipe"); 2178 2179 // By current pack/unpack logic we need to generate only a single phi node: if 2180 // a vector value for the predicated instruction exists at this point it means 2181 // the instruction has vector users only, and a phi for the vector value is 2182 // needed. In this case the recipe of the predicated instruction is marked to 2183 // also do that packing, thereby "hoisting" the insert-element sequence. 2184 // Otherwise, a phi node for the scalar value is needed. 2185 if (State.hasVectorValue(getOperand(0))) { 2186 Value *VectorValue = State.get(getOperand(0)); 2187 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2188 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2189 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2190 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2191 if (State.hasVectorValue(this)) 2192 State.reset(this, VPhi); 2193 else 2194 State.set(this, VPhi); 2195 // NOTE: Currently we need to update the value of the operand, so the next 2196 // predicated iteration inserts its generated value in the correct vector. 2197 State.reset(getOperand(0), VPhi); 2198 } else { 2199 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2200 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2201 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2202 PredicatingBB); 2203 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2204 if (State.hasScalarValue(this, *State.Lane)) 2205 State.reset(this, Phi, *State.Lane); 2206 else 2207 State.set(this, Phi, *State.Lane); 2208 // NOTE: Currently we need to update the value of the operand, so the next 2209 // predicated iteration inserts its generated value in the correct vector. 
2210 State.reset(getOperand(0), Phi, *State.Lane); 2211 } 2212 } 2213 2214 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2215 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2216 VPSlotTracker &SlotTracker) const { 2217 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2218 printAsOperand(O, SlotTracker); 2219 O << " = "; 2220 printOperands(O, SlotTracker); 2221 } 2222 #endif 2223 2224 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2225 VPCostContext &Ctx) const { 2226 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2227 const Align Alignment = 2228 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2229 unsigned AS = 2230 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2231 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2232 2233 if (!Consecutive) { 2234 // TODO: Using the original IR may not be accurate. 2235 // Currently, ARM will use the underlying IR to calculate gather/scatter 2236 // instruction cost. 2237 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2238 assert(!Reverse && 2239 "Inconsecutive memory access should not have the order."); 2240 return Ctx.TTI.getAddressComputationCost(Ty) + 2241 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2242 IsMasked, Alignment, CostKind, 2243 &Ingredient); 2244 } 2245 2246 InstructionCost Cost = 0; 2247 if (IsMasked) { 2248 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2249 AS, CostKind); 2250 } else { 2251 TTI::OperandValueInfo OpInfo = 2252 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2253 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2254 CostKind, OpInfo, &Ingredient); 2255 } 2256 if (!Reverse) 2257 return Cost; 2258 2259 return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2260 cast<VectorType>(Ty), {}, CostKind, 0); 2261 } 2262 2263 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2264 auto *LI = cast<LoadInst>(&Ingredient); 2265 2266 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2267 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2268 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2269 bool CreateGather = !isConsecutive(); 2270 2271 auto &Builder = State.Builder; 2272 State.setDebugLocFrom(getDebugLoc()); 2273 Value *Mask = nullptr; 2274 if (auto *VPMask = getMask()) { 2275 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2276 // of a null all-one mask is a null mask. 2277 Mask = State.get(VPMask); 2278 if (isReverse()) 2279 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2280 } 2281 2282 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); 2283 Value *NewLI; 2284 if (CreateGather) { 2285 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2286 "wide.masked.gather"); 2287 } else if (Mask) { 2288 NewLI = 2289 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2290 PoisonValue::get(DataTy), "wide.masked.load"); 2291 } else { 2292 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2293 } 2294 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
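  // For a reverse consecutive access the load itself is a forward wide load at
  // the already-adjusted address; only its result is permuted. Illustrative
  // sketch for fixed VF = 4:
  //   %wide.load = load <4 x i32>, ptr %addr
  //   %reverse   = shufflevector <4 x i32> %wide.load, poison, <3, 2, 1, 0>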
2295 State.addMetadata(NewLI, LI); 2296 if (Reverse) 2297 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 2298 State.set(this, NewLI); 2299 } 2300 2301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2302 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2303 VPSlotTracker &SlotTracker) const { 2304 O << Indent << "WIDEN "; 2305 printAsOperand(O, SlotTracker); 2306 O << " = load "; 2307 printOperands(O, SlotTracker); 2308 } 2309 #endif 2310 2311 /// Use all-true mask for reverse rather than actual mask, as it avoids a 2312 /// dependence w/o affecting the result. 2313 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 2314 Value *EVL, const Twine &Name) { 2315 VectorType *ValTy = cast<VectorType>(Operand->getType()); 2316 Value *AllTrueMask = 2317 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 2318 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 2319 {Operand, AllTrueMask, EVL}, nullptr, Name); 2320 } 2321 2322 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 2323 auto *LI = cast<LoadInst>(&Ingredient); 2324 2325 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2326 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2327 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2328 bool CreateGather = !isConsecutive(); 2329 2330 auto &Builder = State.Builder; 2331 State.setDebugLocFrom(getDebugLoc()); 2332 CallInst *NewLI; 2333 Value *EVL = State.get(getEVL(), VPLane(0)); 2334 Value *Addr = State.get(getAddr(), !CreateGather); 2335 Value *Mask = nullptr; 2336 if (VPValue *VPMask = getMask()) { 2337 Mask = State.get(VPMask); 2338 if (isReverse()) 2339 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2340 } else { 2341 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2342 } 2343 2344 if (CreateGather) { 2345 NewLI = 2346 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 2347 nullptr, "wide.masked.gather"); 2348 } else { 2349 VectorBuilder VBuilder(Builder); 2350 VBuilder.setEVL(EVL).setMask(Mask); 2351 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 2352 Instruction::Load, DataTy, Addr, "vp.op.load")); 2353 } 2354 NewLI->addParamAttr( 2355 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 2356 State.addMetadata(NewLI, LI); 2357 Instruction *Res = NewLI; 2358 if (isReverse()) 2359 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 2360 State.set(this, Res); 2361 } 2362 2363 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, 2364 VPCostContext &Ctx) const { 2365 if (!Consecutive || IsMasked) 2366 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2367 2368 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2369 // here because the EVL recipes using EVL to replace the tail mask. But in the 2370 // legacy model, it will always calculate the cost of mask. 2371 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2372 // don't need to compare to the legacy cost model. 
2373 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2374 const Align Alignment = 2375 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2376 unsigned AS = 2377 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2378 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2379 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2380 Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); 2381 if (!Reverse) 2382 return Cost; 2383 2384 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2385 cast<VectorType>(Ty), {}, CostKind, 0); 2386 } 2387 2388 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2389 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2390 VPSlotTracker &SlotTracker) const { 2391 O << Indent << "WIDEN "; 2392 printAsOperand(O, SlotTracker); 2393 O << " = vp.load "; 2394 printOperands(O, SlotTracker); 2395 } 2396 #endif 2397 2398 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2399 auto *SI = cast<StoreInst>(&Ingredient); 2400 2401 VPValue *StoredVPValue = getStoredValue(); 2402 bool CreateScatter = !isConsecutive(); 2403 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2404 2405 auto &Builder = State.Builder; 2406 State.setDebugLocFrom(getDebugLoc()); 2407 2408 Value *Mask = nullptr; 2409 if (auto *VPMask = getMask()) { 2410 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2411 // of a null all-one mask is a null mask. 2412 Mask = State.get(VPMask); 2413 if (isReverse()) 2414 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2415 } 2416 2417 Value *StoredVal = State.get(StoredVPValue); 2418 if (isReverse()) { 2419 // If we store to reverse consecutive memory locations, then we need 2420 // to reverse the order of elements in the stored value. 2421 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2422 // We don't want to update the value in the map as it might be used in 2423 // another expression. So don't call resetVectorValue(StoredVal). 
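    // Illustrative sketch for fixed VF = 4 (names assumed): the stored value is
    // permuted here and then written with the wide store emitted below, e.g.
    //   %reverse = shufflevector <4 x i32> %val, <4 x i32> poison, <3, 2, 1, 0>
    //   store <4 x i32> %reverse, ptr %addr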
2424 } 2425 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); 2426 Instruction *NewSI = nullptr; 2427 if (CreateScatter) 2428 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 2429 else if (Mask) 2430 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 2431 else 2432 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 2433 State.addMetadata(NewSI, SI); 2434 } 2435 2436 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2437 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2438 VPSlotTracker &SlotTracker) const { 2439 O << Indent << "WIDEN store "; 2440 printOperands(O, SlotTracker); 2441 } 2442 #endif 2443 2444 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 2445 auto *SI = cast<StoreInst>(&Ingredient); 2446 2447 VPValue *StoredValue = getStoredValue(); 2448 bool CreateScatter = !isConsecutive(); 2449 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2450 2451 auto &Builder = State.Builder; 2452 State.setDebugLocFrom(getDebugLoc()); 2453 2454 CallInst *NewSI = nullptr; 2455 Value *StoredVal = State.get(StoredValue); 2456 Value *EVL = State.get(getEVL(), VPLane(0)); 2457 if (isReverse()) 2458 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 2459 Value *Mask = nullptr; 2460 if (VPValue *VPMask = getMask()) { 2461 Mask = State.get(VPMask); 2462 if (isReverse()) 2463 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 2464 } else { 2465 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2466 } 2467 Value *Addr = State.get(getAddr(), !CreateScatter); 2468 if (CreateScatter) { 2469 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 2470 Intrinsic::vp_scatter, 2471 {StoredVal, Addr, Mask, EVL}); 2472 } else { 2473 VectorBuilder VBuilder(Builder); 2474 VBuilder.setEVL(EVL).setMask(Mask); 2475 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 2476 Instruction::Store, Type::getVoidTy(EVL->getContext()), 2477 {StoredVal, Addr})); 2478 } 2479 NewSI->addParamAttr( 2480 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 2481 State.addMetadata(NewSI, SI); 2482 } 2483 2484 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, 2485 VPCostContext &Ctx) const { 2486 if (!Consecutive || IsMasked) 2487 return VPWidenMemoryRecipe::computeCost(VF, Ctx); 2488 2489 // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost() 2490 // here because the EVL recipes using EVL to replace the tail mask. But in the 2491 // legacy model, it will always calculate the cost of mask. 2492 // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we 2493 // don't need to compare to the legacy cost model. 
2494 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2495 const Align Alignment = 2496 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2497 unsigned AS = 2498 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2499 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2500 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2501 Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); 2502 if (!Reverse) 2503 return Cost; 2504 2505 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2506 cast<VectorType>(Ty), {}, CostKind, 0); 2507 } 2508 2509 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2510 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2511 VPSlotTracker &SlotTracker) const { 2512 O << Indent << "WIDEN vp.store "; 2513 printOperands(O, SlotTracker); 2514 } 2515 #endif 2516 2517 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2518 VectorType *DstVTy, const DataLayout &DL) { 2519 // Verify that V is a vector type with same number of elements as DstVTy. 2520 auto VF = DstVTy->getElementCount(); 2521 auto *SrcVecTy = cast<VectorType>(V->getType()); 2522 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2523 Type *SrcElemTy = SrcVecTy->getElementType(); 2524 Type *DstElemTy = DstVTy->getElementType(); 2525 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2526 "Vector elements must have same size"); 2527 2528 // Do a direct cast if element types are castable. 2529 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2530 return Builder.CreateBitOrPointerCast(V, DstVTy); 2531 } 2532 // V cannot be directly casted to desired vector type. 2533 // May happen when V is a floating point vector but DstVTy is a vector of 2534 // pointers or vice-versa. Handle this using a two-step bitcast using an 2535 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2536 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2537 "Only one type should be a pointer type"); 2538 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2539 "Only one type should be a floating point type"); 2540 Type *IntTy = 2541 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2542 auto *VecIntTy = VectorType::get(IntTy, VF); 2543 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2544 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2545 } 2546 2547 /// Return a vector containing interleaved elements from multiple 2548 /// smaller input vectors. 2549 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2550 const Twine &Name) { 2551 unsigned Factor = Vals.size(); 2552 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2553 2554 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2555 #ifndef NDEBUG 2556 for (Value *Val : Vals) 2557 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2558 #endif 2559 2560 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2561 // must use intrinsics to interleave. 2562 if (VecTy->isScalableTy()) { 2563 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2564 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2565 Vals, 2566 /*FMFSource=*/nullptr, Name); 2567 } 2568 2569 // Fixed length. Start by concatenating all vectors into a wide vector. 
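  // E.g. (illustrative) for Factor = 2 and fixed VF = 4, inputs
  // A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3> are concatenated into
  // <a0,a1,a2,a3,b0,b1,b2,b3> and then shuffled with the interleave mask
  // <0,4,1,5,2,6,3,7> to yield <a0,b0,a1,b1,a2,b2,a3,b3>.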
2570 Value *WideVec = concatenateVectors(Builder, Vals); 2571 2572 // Interleave the elements into the wide vector. 2573 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2574 return Builder.CreateShuffleVector( 2575 WideVec, createInterleaveMask(NumElts, Factor), Name); 2576 } 2577 2578 // Try to vectorize the interleave group that \p Instr belongs to. 2579 // 2580 // E.g. Translate following interleaved load group (factor = 3): 2581 // for (i = 0; i < N; i+=3) { 2582 // R = Pic[i]; // Member of index 0 2583 // G = Pic[i+1]; // Member of index 1 2584 // B = Pic[i+2]; // Member of index 2 2585 // ... // do something to R, G, B 2586 // } 2587 // To: 2588 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2589 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2590 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2591 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2592 // 2593 // Or translate following interleaved store group (factor = 3): 2594 // for (i = 0; i < N; i+=3) { 2595 // ... do something to R, G, B 2596 // Pic[i] = R; // Member of index 0 2597 // Pic[i+1] = G; // Member of index 1 2598 // Pic[i+2] = B; // Member of index 2 2599 // } 2600 // To: 2601 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2602 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2603 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2604 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2605 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2606 void VPInterleaveRecipe::execute(VPTransformState &State) { 2607 assert(!State.Lane && "Interleave group being replicated."); 2608 const InterleaveGroup<Instruction> *Group = IG; 2609 Instruction *Instr = Group->getInsertPos(); 2610 2611 // Prepare for the vector type of the interleaved load/store. 2612 Type *ScalarTy = getLoadStoreType(Instr); 2613 unsigned InterleaveFactor = Group->getFactor(); 2614 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2615 2616 // Prepare for the new pointers. 2617 unsigned Index = Group->getIndex(Instr); 2618 2619 // TODO: extend the masked interleaved-group support to reversed access. 2620 VPValue *BlockInMask = getMask(); 2621 assert((!BlockInMask || !Group->isReverse()) && 2622 "Reversed masked interleave-group not supported."); 2623 2624 Value *Idx; 2625 // If the group is reverse, adjust the index to refer to the last vector lane 2626 // instead of the first. We adjust the index from the first vector lane, 2627 // rather than directly getting the pointer for lane VF - 1, because the 2628 // pointer operand of the interleaved access is supposed to be uniform. 2629 if (Group->isReverse()) { 2630 Value *RuntimeVF = 2631 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2632 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2633 Idx = State.Builder.CreateMul(Idx, 2634 State.Builder.getInt32(Group->getFactor())); 2635 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); 2636 Idx = State.Builder.CreateNeg(Idx); 2637 } else 2638 Idx = State.Builder.getInt32(-Index); 2639 2640 VPValue *Addr = getAddr(); 2641 Value *ResAddr = State.get(Addr, VPLane(0)); 2642 if (auto *I = dyn_cast<Instruction>(ResAddr)) 2643 State.setDebugLocFrom(I->getDebugLoc()); 2644 2645 // Notice current instruction could be any index. Need to adjust the address 2646 // to the member of index 0. 2647 // 2648 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2649 // b = A[i]; // Member of index 0 2650 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2651 // 2652 // E.g. A[i+1] = a; // Member of index 1 2653 // A[i] = b; // Member of index 0 2654 // A[i+2] = c; // Member of index 2 (Current instruction) 2655 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2656 2657 bool InBounds = false; 2658 if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) 2659 InBounds = gep->isInBounds(); 2660 ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds); 2661 2662 State.setDebugLocFrom(Instr->getDebugLoc()); 2663 Value *PoisonVec = PoisonValue::get(VecTy); 2664 2665 auto CreateGroupMask = [&BlockInMask, &State, 2666 &InterleaveFactor](Value *MaskForGaps) -> Value * { 2667 if (State.VF.isScalable()) { 2668 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2669 assert(InterleaveFactor == 2 && 2670 "Unsupported deinterleave factor for scalable vectors"); 2671 auto *ResBlockInMask = State.get(BlockInMask); 2672 SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; 2673 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2674 State.VF.getKnownMinValue() * 2, true); 2675 return State.Builder.CreateIntrinsic( 2676 MaskTy, Intrinsic::vector_interleave2, Ops, 2677 /*FMFSource=*/nullptr, "interleaved.mask"); 2678 } 2679 2680 if (!BlockInMask) 2681 return MaskForGaps; 2682 2683 Value *ResBlockInMask = State.get(BlockInMask); 2684 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2685 ResBlockInMask, 2686 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2687 "interleaved.mask"); 2688 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2689 ShuffledMask, MaskForGaps) 2690 : ShuffledMask; 2691 }; 2692 2693 const DataLayout &DL = Instr->getDataLayout(); 2694 // Vectorize the interleaved load group. 2695 if (isa<LoadInst>(Instr)) { 2696 Value *MaskForGaps = nullptr; 2697 if (NeedsMaskForGaps) { 2698 MaskForGaps = createBitMaskForGaps(State.Builder, 2699 State.VF.getKnownMinValue(), *Group); 2700 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2701 } 2702 2703 Instruction *NewLoad; 2704 if (BlockInMask || MaskForGaps) { 2705 Value *GroupMask = CreateGroupMask(MaskForGaps); 2706 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 2707 Group->getAlign(), GroupMask, 2708 PoisonVec, "wide.masked.vec"); 2709 } else 2710 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 2711 Group->getAlign(), "wide.vec"); 2712 Group->addMetadata(NewLoad); 2713 2714 ArrayRef<VPValue *> VPDefs = definedValues(); 2715 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2716 if (VecTy->isScalableTy()) { 2717 assert(InterleaveFactor == 2 && 2718 "Unsupported deinterleave factor for scalable vectors"); 2719 2720 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2721 // so must use intrinsics to deinterleave. 2722 Value *DI = State.Builder.CreateIntrinsic( 2723 Intrinsic::vector_deinterleave2, VecTy, NewLoad, 2724 /*FMFSource=*/nullptr, "strided.vec"); 2725 unsigned J = 0; 2726 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2727 Instruction *Member = Group->getMember(I); 2728 2729 if (!Member) 2730 continue; 2731 2732 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2733 // If this member has different type, cast the result type. 
2734 if (Member->getType() != ScalarTy) { 2735 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2736 StridedVec = 2737 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2738 } 2739 2740 if (Group->isReverse()) 2741 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2742 2743 State.set(VPDefs[J], StridedVec); 2744 ++J; 2745 } 2746 2747 return; 2748 } 2749 2750 // For each member in the group, shuffle out the appropriate data from the 2751 // wide loads. 2752 unsigned J = 0; 2753 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2754 Instruction *Member = Group->getMember(I); 2755 2756 // Skip the gaps in the group. 2757 if (!Member) 2758 continue; 2759 2760 auto StrideMask = 2761 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 2762 Value *StridedVec = 2763 State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); 2764 2765 // If this member has different type, cast the result type. 2766 if (Member->getType() != ScalarTy) { 2767 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 2768 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2769 StridedVec = 2770 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2771 } 2772 2773 if (Group->isReverse()) 2774 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2775 2776 State.set(VPDefs[J], StridedVec); 2777 ++J; 2778 } 2779 return; 2780 } 2781 2782 // The sub vector type for current instruction. 2783 auto *SubVT = VectorType::get(ScalarTy, State.VF); 2784 2785 // Vectorize the interleaved store group. 2786 Value *MaskForGaps = 2787 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 2788 assert((!MaskForGaps || !State.VF.isScalable()) && 2789 "masking gaps for scalable vectors is not yet supported."); 2790 ArrayRef<VPValue *> StoredValues = getStoredValues(); 2791 // Collect the stored vector from each member. 2792 SmallVector<Value *, 4> StoredVecs; 2793 unsigned StoredIdx = 0; 2794 for (unsigned i = 0; i < InterleaveFactor; i++) { 2795 assert((Group->getMember(i) || MaskForGaps) && 2796 "Fail to get a member from an interleaved store group"); 2797 Instruction *Member = Group->getMember(i); 2798 2799 // Skip the gaps in the group. 2800 if (!Member) { 2801 Value *Undef = PoisonValue::get(SubVT); 2802 StoredVecs.push_back(Undef); 2803 continue; 2804 } 2805 2806 Value *StoredVec = State.get(StoredValues[StoredIdx]); 2807 ++StoredIdx; 2808 2809 if (Group->isReverse()) 2810 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 2811 2812 // If this member has different type, cast it to a unified type. 2813 2814 if (StoredVec->getType() != SubVT) 2815 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 2816 2817 StoredVecs.push_back(StoredVec); 2818 } 2819 2820 // Interleave all the smaller vectors into one wider vector. 
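  // E.g. (illustrative) for factor 3 and fixed VF = 4, the member vectors
  // R, G and B become a single 12-element value laid out as
  // <r0,g0,b0, r1,g1,b1, r2,g2,b2, r3,g3,b3>, which is then written with one
  // wide (possibly masked) store.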
2821 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 2822 Instruction *NewStoreInstr; 2823 if (BlockInMask || MaskForGaps) { 2824 Value *GroupMask = CreateGroupMask(MaskForGaps); 2825 NewStoreInstr = State.Builder.CreateMaskedStore( 2826 IVec, ResAddr, Group->getAlign(), GroupMask); 2827 } else 2828 NewStoreInstr = 2829 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign()); 2830 2831 Group->addMetadata(NewStoreInstr); 2832 } 2833 2834 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2835 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 2836 VPSlotTracker &SlotTracker) const { 2837 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 2838 IG->getInsertPos()->printAsOperand(O, false); 2839 O << ", "; 2840 getAddr()->printAsOperand(O, SlotTracker); 2841 VPValue *Mask = getMask(); 2842 if (Mask) { 2843 O << ", "; 2844 Mask->printAsOperand(O, SlotTracker); 2845 } 2846 2847 unsigned OpIdx = 0; 2848 for (unsigned i = 0; i < IG->getFactor(); ++i) { 2849 if (!IG->getMember(i)) 2850 continue; 2851 if (getNumStoreOperands() > 0) { 2852 O << "\n" << Indent << " store "; 2853 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 2854 O << " to index " << i; 2855 } else { 2856 O << "\n" << Indent << " "; 2857 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 2858 O << " = load from index " << i; 2859 } 2860 ++OpIdx; 2861 } 2862 } 2863 #endif 2864 2865 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 2866 Value *Start = getStartValue()->getLiveInIRValue(); 2867 PHINode *Phi = PHINode::Create(Start->getType(), 2, "index"); 2868 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2869 2870 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2871 Phi->addIncoming(Start, VectorPH); 2872 Phi->setDebugLoc(getDebugLoc()); 2873 State.set(this, Phi, /*IsScalar*/ true); 2874 } 2875 2876 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2877 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2878 VPSlotTracker &SlotTracker) const { 2879 O << Indent << "EMIT "; 2880 printAsOperand(O, SlotTracker); 2881 O << " = CANONICAL-INDUCTION "; 2882 printOperands(O, SlotTracker); 2883 } 2884 #endif 2885 2886 bool VPCanonicalIVPHIRecipe::isCanonical( 2887 InductionDescriptor::InductionKind Kind, VPValue *Start, 2888 VPValue *Step) const { 2889 // Must be an integer induction. 2890 if (Kind != InductionDescriptor::IK_IntInduction) 2891 return false; 2892 // Start must match the start value of this canonical induction. 2893 if (Start != getStartValue()) 2894 return false; 2895 2896 // If the step is defined by a recipe, it is not a ConstantInt. 
2897 if (Step->getDefiningRecipe()) 2898 return false; 2899 2900 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 2901 return StepC && StepC->isOne(); 2902 } 2903 2904 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 2905 return IsScalarAfterVectorization && 2906 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 2907 } 2908 2909 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 2910 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 2911 "Not a pointer induction according to InductionDescriptor!"); 2912 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 2913 "Unexpected type."); 2914 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 2915 "Recipe should have been replaced"); 2916 2917 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 2918 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true)); 2919 unsigned CurrentPart = getUnrollPart(*this); 2920 2921 // Build a pointer phi 2922 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 2923 Type *ScStValueType = ScalarStartValue->getType(); 2924 2925 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2926 PHINode *NewPointerPhi = nullptr; 2927 if (CurrentPart == 0) { 2928 NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 2929 CanonicalIV->getIterator()); 2930 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 2931 } else { 2932 // The recipe has been unrolled. In that case, fetch the single pointer phi 2933 // shared among all unrolled parts of the recipe. 2934 auto *GEP = 2935 cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); 2936 NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); 2937 } 2938 2939 // A pointer induction, performed by using a gep 2940 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 2941 Value *ScalarStepValue = State.get(getOperand(1), VPLane(0)); 2942 Type *PhiType = IndDesc.getStep()->getType(); 2943 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 2944 // Add induction update using an incorrect block temporarily. The phi node 2945 // will be fixed after VPlan execution. Note that at this point the latch 2946 // block cannot be used, as it does not exist yet. 2947 // TODO: Model increment value in VPlan, by turning the recipe into a 2948 // multi-def and a subclass of VPHeaderPHIRecipe. 2949 if (CurrentPart == 0) { 2950 // The recipe represents the first part of the pointer induction. Create the 2951 // GEP to increment the phi across all unrolled parts. 2952 unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1; 2953 Value *NumUnrolledElems = 2954 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF)); 2955 2956 Value *InductionGEP = GetElementPtrInst::Create( 2957 State.Builder.getInt8Ty(), NewPointerPhi, 2958 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 2959 InductionLoc); 2960 2961 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 2962 } 2963 2964 // Create actual address geps that use the pointer phi as base and a 2965 // vectorized version of the step value (<step*0, ..., step*N>) as offset. 2966 Type *VecPhiType = VectorType::get(PhiType, State.VF); 2967 Value *StartOffsetScalar = State.Builder.CreateMul( 2968 RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); 2969 Value *StartOffset = 2970 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 2971 // Create a vector of consecutive numbers from zero to VF. 
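  // Illustrative sketch (unroll part P, scalar step S, both written
  // symbolically): since the GEPs below are i8-typed, the step is applied in
  // byte units and lane L of this part addresses
  //   %pointer.phi + (P * VF + L) * S
  // i.e. the splatted per-part start offset plus a step vector, scaled by the
  // scalar step.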
2972 StartOffset = State.Builder.CreateAdd( 2973 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 2974 2975 assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && 2976 "scalar step must be the same across all parts"); 2977 Value *GEP = State.Builder.CreateGEP( 2978 State.Builder.getInt8Ty(), NewPointerPhi, 2979 State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( 2980 State.VF, ScalarStepValue)), 2981 "vector.gep"); 2982 State.set(this, GEP); 2983 } 2984 2985 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2986 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2987 VPSlotTracker &SlotTracker) const { 2988 assert((getNumOperands() == 2 || getNumOperands() == 4) && 2989 "unexpected number of operands"); 2990 O << Indent << "EMIT "; 2991 printAsOperand(O, SlotTracker); 2992 O << " = WIDEN-POINTER-INDUCTION "; 2993 getStartValue()->printAsOperand(O, SlotTracker); 2994 O << ", " << *IndDesc.getStep(); 2995 if (getNumOperands() == 4) { 2996 O << ", "; 2997 getOperand(2)->printAsOperand(O, SlotTracker); 2998 O << ", "; 2999 getOperand(3)->printAsOperand(O, SlotTracker); 3000 } 3001 } 3002 #endif 3003 3004 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 3005 assert(!State.Lane && "cannot be used in per-lane"); 3006 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 3007 SCEVExpander Exp(SE, DL, "induction"); 3008 3009 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 3010 &*State.Builder.GetInsertPoint()); 3011 assert(!State.ExpandedSCEVs.contains(Expr) && 3012 "Same SCEV expanded multiple times"); 3013 State.ExpandedSCEVs[Expr] = Res; 3014 State.set(this, Res, VPLane(0)); 3015 } 3016 3017 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3018 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 3019 VPSlotTracker &SlotTracker) const { 3020 O << Indent << "EMIT "; 3021 getVPSingleValue()->printAsOperand(O, SlotTracker); 3022 O << " = EXPAND SCEV " << *Expr; 3023 } 3024 #endif 3025 3026 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 3027 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); 3028 Type *STy = CanonicalIV->getType(); 3029 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 3030 ElementCount VF = State.VF; 3031 Value *VStart = VF.isScalar() 3032 ? CanonicalIV 3033 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 3034 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); 3035 if (VF.isVector()) { 3036 VStep = Builder.CreateVectorSplat(VF, VStep); 3037 VStep = 3038 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 3039 } 3040 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 3041 State.set(this, CanonicalVectorIV); 3042 } 3043 3044 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3045 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 3046 VPSlotTracker &SlotTracker) const { 3047 O << Indent << "EMIT "; 3048 printAsOperand(O, SlotTracker); 3049 O << " = WIDEN-CANONICAL-INDUCTION "; 3050 printOperands(O, SlotTracker); 3051 } 3052 #endif 3053 3054 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 3055 auto &Builder = State.Builder; 3056 // Create a vector from the initial value. 3057 auto *VectorInit = getStartValue()->getLiveInIRValue(); 3058 3059 Type *VecTy = State.VF.isScalar() 3060 ? 
  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(VectorInit, VectorPH);
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with any
  // loop-invariant value.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, IsInLoop);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  unsigned CurrentPart = getUnrollPart(*this);

  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = State.get(StartVPV);
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      if (CurrentPart == 0) {
        // Create start and identity vector values for the reduction in the
        // preheader.
        // TODO: Introduce recipes in VPlan preheader to create initial values.
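        // Illustrative sketch (assumes VF = 4, an integer add reduction and
        // start value %s): part 0 of the phi is seeded with <%s, 0, 0, 0>,
        // while every other part starts from the identity splat <0, 0, 0, 0>,
        // so combining all parts after the loop yields %s plus the reduced
        // values.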
        Iden = Builder.CreateVectorSplat(State.VF, Iden);
        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
        Builder.SetInsertPoint(VectorPH->getTerminator());
        Constant *Zero = Builder.getInt32(0);
        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
      } else {
        Iden = Builder.CreateVectorSplat(State.VF, Iden);
      }
    }
  }

  Phi = cast<PHINode>(State.get(this, IsInLoop));
  Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
  Phi->addIncoming(StartVal, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
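// Illustrative IR sketch for the recipe below (assumes VF = 4; names are not
// necessarily the exact ones emitted):
//   %active.lane.mask = phi <4 x i1> [ %entry.mask, %vector.ph ],
//                                    [ %next.mask, %vector.body ]
// Only the preheader incoming value is added here; the backedge value is
// expected to be filled in later during VPlan execution, once the latch block
// has been created.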
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *Start = State.get(getOperand(0), VPLane(0));
  PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  Phi->addIncoming(Start, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif