1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "VPlan.h" 15 #include "VPlanAnalysis.h" 16 #include "VPlanPatternMatch.h" 17 #include "VPlanUtils.h" 18 #include "llvm/ADT/STLExtras.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/IVDescriptors.h" 22 #include "llvm/IR/BasicBlock.h" 23 #include "llvm/IR/IRBuilder.h" 24 #include "llvm/IR/Instruction.h" 25 #include "llvm/IR/Instructions.h" 26 #include "llvm/IR/Intrinsics.h" 27 #include "llvm/IR/Type.h" 28 #include "llvm/IR/Value.h" 29 #include "llvm/IR/VectorBuilder.h" 30 #include "llvm/Support/Casting.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 35 #include "llvm/Transforms/Utils/LoopUtils.h" 36 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 37 #include <cassert> 38 39 using namespace llvm; 40 41 using VectorParts = SmallVector<Value *, 2>; 42 43 namespace llvm { 44 extern cl::opt<bool> EnableVPlanNativePath; 45 } 46 extern cl::opt<unsigned> ForceTargetInstructionCost; 47 48 #define LV_NAME "loop-vectorize" 49 #define DEBUG_TYPE LV_NAME 50 51 bool VPRecipeBase::mayWriteToMemory() const { 52 switch (getVPDefID()) { 53 case VPInstructionSC: 54 if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode())) 55 return false; 56 switch (cast<VPInstruction>(this)->getOpcode()) { 57 case Instruction::Or: 58 case Instruction::ICmp: 59 case Instruction::Select: 60 case VPInstruction::AnyOf: 61 case VPInstruction::Not: 62 case VPInstruction::CalculateTripCountMinusVF: 63 case VPInstruction::CanonicalIVIncrementForPart: 64 case VPInstruction::ExtractFromEnd: 65 case VPInstruction::FirstOrderRecurrenceSplice: 66 case VPInstruction::LogicalAnd: 67 case VPInstruction::PtrAdd: 68 return false; 69 default: 70 return true; 71 } 72 case VPInterleaveSC: 73 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 74 case VPWidenStoreEVLSC: 75 case VPWidenStoreSC: 76 return true; 77 case VPReplicateSC: 78 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 79 ->mayWriteToMemory(); 80 case VPWidenCallSC: 81 return !cast<VPWidenCallRecipe>(this) 82 ->getCalledScalarFunction() 83 ->onlyReadsMemory(); 84 case VPWidenIntrinsicSC: 85 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory(); 86 case VPBranchOnMaskSC: 87 case VPScalarIVStepsSC: 88 case VPPredInstPHISC: 89 return false; 90 case VPBlendSC: 91 case VPReductionEVLSC: 92 case VPReductionSC: 93 case VPVectorPointerSC: 94 case VPWidenCanonicalIVSC: 95 case VPWidenCastSC: 96 case VPWidenGEPSC: 97 case VPWidenIntOrFpInductionSC: 98 case VPWidenLoadEVLSC: 99 case VPWidenLoadSC: 100 case VPWidenPHISC: 101 case VPWidenSC: 102 case VPWidenEVLSC: 103 case VPWidenSelectSC: { 104 const Instruction *I = 105 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 106 (void)I; 107 assert((!I || !I->mayWriteToMemory()) && 108 "underlying instruction may write to 
memory"); 109 return false; 110 } 111 default: 112 return true; 113 } 114 } 115 116 bool VPRecipeBase::mayReadFromMemory() const { 117 switch (getVPDefID()) { 118 case VPWidenLoadEVLSC: 119 case VPWidenLoadSC: 120 return true; 121 case VPReplicateSC: 122 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 123 ->mayReadFromMemory(); 124 case VPWidenCallSC: 125 return !cast<VPWidenCallRecipe>(this) 126 ->getCalledScalarFunction() 127 ->onlyWritesMemory(); 128 case VPWidenIntrinsicSC: 129 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory(); 130 case VPBranchOnMaskSC: 131 case VPPredInstPHISC: 132 case VPScalarIVStepsSC: 133 case VPWidenStoreEVLSC: 134 case VPWidenStoreSC: 135 return false; 136 case VPBlendSC: 137 case VPReductionEVLSC: 138 case VPReductionSC: 139 case VPVectorPointerSC: 140 case VPWidenCanonicalIVSC: 141 case VPWidenCastSC: 142 case VPWidenGEPSC: 143 case VPWidenIntOrFpInductionSC: 144 case VPWidenPHISC: 145 case VPWidenSC: 146 case VPWidenEVLSC: 147 case VPWidenSelectSC: { 148 const Instruction *I = 149 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 150 (void)I; 151 assert((!I || !I->mayReadFromMemory()) && 152 "underlying instruction may read from memory"); 153 return false; 154 } 155 default: 156 return true; 157 } 158 } 159 160 bool VPRecipeBase::mayHaveSideEffects() const { 161 switch (getVPDefID()) { 162 case VPDerivedIVSC: 163 case VPPredInstPHISC: 164 case VPScalarCastSC: 165 case VPReverseVectorPointerSC: 166 return false; 167 case VPInstructionSC: 168 return mayWriteToMemory(); 169 case VPWidenCallSC: { 170 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); 171 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); 172 } 173 case VPWidenIntrinsicSC: 174 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); 175 case VPBlendSC: 176 case VPReductionEVLSC: 177 case VPReductionSC: 178 case VPScalarIVStepsSC: 179 case VPVectorPointerSC: 180 case VPWidenCanonicalIVSC: 181 case VPWidenCastSC: 182 case VPWidenGEPSC: 183 case VPWidenIntOrFpInductionSC: 184 case VPWidenPHISC: 185 case VPWidenPointerInductionSC: 186 case VPWidenSC: 187 case VPWidenEVLSC: 188 case VPWidenSelectSC: { 189 const Instruction *I = 190 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 191 (void)I; 192 assert((!I || !I->mayHaveSideEffects()) && 193 "underlying instruction has side-effects"); 194 return false; 195 } 196 case VPInterleaveSC: 197 return mayWriteToMemory(); 198 case VPWidenLoadEVLSC: 199 case VPWidenLoadSC: 200 case VPWidenStoreEVLSC: 201 case VPWidenStoreSC: 202 assert( 203 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == 204 mayWriteToMemory() && 205 "mayHaveSideffects result for ingredient differs from this " 206 "implementation"); 207 return mayWriteToMemory(); 208 case VPReplicateSC: { 209 auto *R = cast<VPReplicateRecipe>(this); 210 return R->getUnderlyingInstr()->mayHaveSideEffects(); 211 } 212 default: 213 return true; 214 } 215 } 216 217 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 218 assert(!Parent && "Recipe already in some VPBasicBlock"); 219 assert(InsertPos->getParent() && 220 "Insertion position not in any VPBasicBlock"); 221 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 222 } 223 224 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 225 iplist<VPRecipeBase>::iterator I) { 226 assert(!Parent && "Recipe already in some VPBasicBlock"); 227 assert(I == BB.end() || I->getParent() == &BB); 228 
BB.insert(this, I); 229 } 230 231 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 232 assert(!Parent && "Recipe already in some VPBasicBlock"); 233 assert(InsertPos->getParent() && 234 "Insertion position not in any VPBasicBlock"); 235 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 236 } 237 238 void VPRecipeBase::removeFromParent() { 239 assert(getParent() && "Recipe not in any VPBasicBlock"); 240 getParent()->getRecipeList().remove(getIterator()); 241 Parent = nullptr; 242 } 243 244 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 245 assert(getParent() && "Recipe not in any VPBasicBlock"); 246 return getParent()->getRecipeList().erase(getIterator()); 247 } 248 249 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 250 removeFromParent(); 251 insertAfter(InsertPos); 252 } 253 254 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 255 iplist<VPRecipeBase>::iterator I) { 256 removeFromParent(); 257 insertBefore(BB, I); 258 } 259 260 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 261 // Get the underlying instruction for the recipe, if there is one. It is used 262 // to 263 // * decide if cost computation should be skipped for this recipe, 264 // * apply forced target instruction cost. 265 Instruction *UI = nullptr; 266 if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) 267 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 268 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) 269 UI = IG->getInsertPos(); 270 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) 271 UI = &WidenMem->getIngredient(); 272 273 InstructionCost RecipeCost; 274 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { 275 RecipeCost = 0; 276 } else { 277 RecipeCost = computeCost(VF, Ctx); 278 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && 279 RecipeCost.isValid()) 280 RecipeCost = InstructionCost(ForceTargetInstructionCost); 281 } 282 283 LLVM_DEBUG({ 284 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 285 dump(); 286 }); 287 return RecipeCost; 288 } 289 290 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 291 VPCostContext &Ctx) const { 292 llvm_unreachable("subclasses should implement computeCost"); 293 } 294 295 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 296 assert(OpType == OperationType::FPMathOp && 297 "recipe doesn't have fast math flags"); 298 FastMathFlags Res; 299 Res.setAllowReassoc(FMFs.AllowReassoc); 300 Res.setNoNaNs(FMFs.NoNaNs); 301 Res.setNoInfs(FMFs.NoInfs); 302 Res.setNoSignedZeros(FMFs.NoSignedZeros); 303 Res.setAllowReciprocal(FMFs.AllowReciprocal); 304 Res.setAllowContract(FMFs.AllowContract); 305 Res.setApproxFunc(FMFs.ApproxFunc); 306 return Res; 307 } 308 309 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 310 void VPSingleDefRecipe::dump() const { VPDef::dump(); } 311 #endif 312 313 template <unsigned PartOpIdx> 314 VPValue * 315 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { 316 if (U.getNumOperands() == PartOpIdx + 1) 317 return U.getOperand(PartOpIdx); 318 return nullptr; 319 } 320 321 template <unsigned PartOpIdx> 322 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { 323 if (auto *UnrollPartOp = getUnrollPartOperand(U)) 324 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); 325 return 0; 326 } 327 328 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, 329 VPValue *A, VPValue *B, DebugLoc DL, 330 const Twine &Name) 331 : 
VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), 332 Pred, DL), 333 Opcode(Opcode), Name(Name.str()) { 334 assert(Opcode == Instruction::ICmp && 335 "only ICmp predicates supported at the moment"); 336 } 337 338 VPInstruction::VPInstruction(unsigned Opcode, 339 std::initializer_list<VPValue *> Operands, 340 FastMathFlags FMFs, DebugLoc DL, const Twine &Name) 341 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), 342 Opcode(Opcode), Name(Name.str()) { 343 // Make sure the VPInstruction is a floating-point operation. 344 assert(isFPMathOp() && "this op can't take fast-math flags"); 345 } 346 347 bool VPInstruction::doesGeneratePerAllLanes() const { 348 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 349 } 350 351 bool VPInstruction::canGenerateScalarForFirstLane() const { 352 if (Instruction::isBinaryOp(getOpcode())) 353 return true; 354 if (isSingleScalar() || isVectorToScalar()) 355 return true; 356 switch (Opcode) { 357 case Instruction::ICmp: 358 case Instruction::Select: 359 case VPInstruction::BranchOnCond: 360 case VPInstruction::BranchOnCount: 361 case VPInstruction::CalculateTripCountMinusVF: 362 case VPInstruction::CanonicalIVIncrementForPart: 363 case VPInstruction::PtrAdd: 364 case VPInstruction::ExplicitVectorLength: 365 case VPInstruction::AnyOf: 366 return true; 367 default: 368 return false; 369 } 370 } 371 372 Value *VPInstruction::generatePerLane(VPTransformState &State, 373 const VPLane &Lane) { 374 IRBuilderBase &Builder = State.Builder; 375 376 assert(getOpcode() == VPInstruction::PtrAdd && 377 "only PtrAdd opcodes are supported for now"); 378 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 379 State.get(getOperand(1), Lane), Name); 380 } 381 382 Value *VPInstruction::generate(VPTransformState &State) { 383 IRBuilderBase &Builder = State.Builder; 384 385 if (Instruction::isBinaryOp(getOpcode())) { 386 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 387 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 388 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 389 auto *Res = 390 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 391 if (auto *I = dyn_cast<Instruction>(Res)) 392 setFlags(I); 393 return Res; 394 } 395 396 switch (getOpcode()) { 397 case VPInstruction::Not: { 398 Value *A = State.get(getOperand(0)); 399 return Builder.CreateNot(A, Name); 400 } 401 case Instruction::ICmp: { 402 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 403 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 404 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 405 return Builder.CreateCmp(getPredicate(), A, B, Name); 406 } 407 case Instruction::Select: { 408 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 409 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); 410 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); 411 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); 412 return Builder.CreateSelect(Cond, Op1, Op2, Name); 413 } 414 case VPInstruction::ActiveLaneMask: { 415 // Get first lane of vector induction variable. 416 Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); 417 // Get the original loop tripcount. 418 Value *ScalarTC = State.get(getOperand(1), VPLane(0)); 419 420 // If this part of the active lane mask is scalar, generate the CMP directly 421 // to avoid unnecessary extracts. 
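    // Note (added for clarity): llvm.get.active.lane.mask(%base, %n) sets
    // lane i to the predicate (%base + i) < %n using an unsigned compare, so
    // the scalar-VF path below is simply the single-lane form of that check.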
422 if (State.VF.isScalar()) 423 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 424 Name); 425 426 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 427 auto *PredTy = VectorType::get(Int1Ty, State.VF); 428 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 429 {PredTy, ScalarTC->getType()}, 430 {VIVElem0, ScalarTC}, nullptr, Name); 431 } 432 case VPInstruction::FirstOrderRecurrenceSplice: { 433 // Generate code to combine the previous and current values in vector v3. 434 // 435 // vector.ph: 436 // v_init = vector(..., ..., ..., a[-1]) 437 // br vector.body 438 // 439 // vector.body 440 // i = phi [0, vector.ph], [i+4, vector.body] 441 // v1 = phi [v_init, vector.ph], [v2, vector.body] 442 // v2 = a[i, i+1, i+2, i+3]; 443 // v3 = vector(v1(3), v2(0, 1, 2)) 444 445 auto *V1 = State.get(getOperand(0)); 446 if (!V1->getType()->isVectorTy()) 447 return V1; 448 Value *V2 = State.get(getOperand(1)); 449 return Builder.CreateVectorSplice(V1, V2, -1, Name); 450 } 451 case VPInstruction::CalculateTripCountMinusVF: { 452 unsigned UF = getParent()->getPlan()->getUF(); 453 Value *ScalarTC = State.get(getOperand(0), VPLane(0)); 454 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); 455 Value *Sub = Builder.CreateSub(ScalarTC, Step); 456 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 457 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 458 return Builder.CreateSelect(Cmp, Sub, Zero); 459 } 460 case VPInstruction::ExplicitVectorLength: { 461 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 462 // be outside of the main loop. 463 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); 464 // Compute EVL 465 assert(AVL->getType()->isIntegerTy() && 466 "Requested vector length should be an integer."); 467 468 assert(State.VF.isScalable() && "Expected scalable vector factor."); 469 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 470 471 Value *EVL = State.Builder.CreateIntrinsic( 472 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 473 {AVL, VFArg, State.Builder.getTrue()}); 474 return EVL; 475 } 476 case VPInstruction::CanonicalIVIncrementForPart: { 477 unsigned Part = getUnrollPart(*this); 478 auto *IV = State.get(getOperand(0), VPLane(0)); 479 assert(Part != 0 && "Must have a positive part"); 480 // The canonical IV is incremented by the vectorization factor (num of 481 // SIMD elements) times the unroll part. 482 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 483 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 484 hasNoSignedWrap()); 485 } 486 case VPInstruction::BranchOnCond: { 487 Value *Cond = State.get(getOperand(0), VPLane(0)); 488 // Replace the temporary unreachable terminator with a new conditional 489 // branch, hooking it up to backward destination for exiting blocks now and 490 // to forward destination(s) later when they are created. 491 BranchInst *CondBr = 492 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); 493 CondBr->setSuccessor(0, nullptr); 494 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 495 496 if (!getParent()->isExiting()) 497 return CondBr; 498 499 VPRegionBlock *ParentRegion = getParent()->getParent(); 500 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); 501 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); 502 return CondBr; 503 } 504 case VPInstruction::BranchOnCount: { 505 // First create the compare. 
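    // The branch exits the vector loop once the canonical IV equals the trip
    // count operand; otherwise control branches back to the header block.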
506 Value *IV = State.get(getOperand(0), /*IsScalar*/ true); 507 Value *TC = State.get(getOperand(1), /*IsScalar*/ true); 508 Value *Cond = Builder.CreateICmpEQ(IV, TC); 509 510 // Now create the branch. 511 auto *Plan = getParent()->getPlan(); 512 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); 513 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); 514 515 // Replace the temporary unreachable terminator with a new conditional 516 // branch, hooking it up to backward destination (the header) now and to the 517 // forward destination (the exit/middle block) later when it is created. 518 // Note that CreateCondBr expects a valid BB as first argument, so we need 519 // to set it to nullptr later. 520 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), 521 State.CFG.VPBB2IRBB[Header]); 522 CondBr->setSuccessor(0, nullptr); 523 Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); 524 return CondBr; 525 } 526 case VPInstruction::ComputeReductionResult: { 527 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 528 // and will be removed by breaking up the recipe further. 529 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 530 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 531 // Get its reduction variable descriptor. 532 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 533 534 RecurKind RK = RdxDesc.getRecurrenceKind(); 535 536 Type *PhiTy = OrigPhi->getType(); 537 // The recipe's operands are the reduction phi, followed by one operand for 538 // each part of the reduction. 539 unsigned UF = getNumOperands() - 1; 540 VectorParts RdxParts(UF); 541 for (unsigned Part = 0; Part < UF; ++Part) 542 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); 543 544 // If the vector reduction can be performed in a smaller type, we truncate 545 // then extend the loop exit value to enable InstCombine to evaluate the 546 // entire expression in the smaller type. 547 // TODO: Handle this in truncateToMinBW. 548 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 549 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); 550 for (unsigned Part = 0; Part < UF; ++Part) 551 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 552 } 553 // Reduce all of the unrolled parts into a single vector. 554 Value *ReducedPartRdx = RdxParts[0]; 555 unsigned Op = RdxDesc.getOpcode(); 556 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 557 Op = Instruction::Or; 558 559 if (PhiR->isOrdered()) { 560 ReducedPartRdx = RdxParts[UF - 1]; 561 } else { 562 // Floating-point operations should have some FMF to enable the reduction. 563 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 564 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 565 for (unsigned Part = 1; Part < UF; ++Part) { 566 Value *RdxPart = RdxParts[Part]; 567 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 568 ReducedPartRdx = Builder.CreateBinOp( 569 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 570 else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) 571 ReducedPartRdx = 572 createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart); 573 else 574 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 575 } 576 } 577 578 // Create the reduction after the loop. Note that inloop reductions create 579 // the target reduction in the loop using a Reduction recipe. 
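    // At this point ReducedPartRdx combines all unrolled parts into a single
    // value; for out-of-loop reductions the code below additionally folds it
    // across vector lanes and widens a narrowed result back to the phi type.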
580 if ((State.VF.isVector() || 581 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || 582 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && 583 !PhiR->isInLoop()) { 584 ReducedPartRdx = 585 createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); 586 // If the reduction can be performed in a smaller type, we need to extend 587 // the reduction to the wider type before we branch to the original loop. 588 if (PhiTy != RdxDesc.getRecurrenceType()) 589 ReducedPartRdx = RdxDesc.isSigned() 590 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 591 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 592 } 593 594 return ReducedPartRdx; 595 } 596 case VPInstruction::ExtractFromEnd: { 597 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); 598 unsigned Offset = CI->getZExtValue(); 599 assert(Offset > 0 && "Offset from end must be positive"); 600 Value *Res; 601 if (State.VF.isVector()) { 602 assert(Offset <= State.VF.getKnownMinValue() && 603 "invalid offset to extract from"); 604 // Extract lane VF - Offset from the operand. 605 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); 606 } else { 607 assert(Offset <= 1 && "invalid offset to extract from"); 608 Res = State.get(getOperand(0)); 609 } 610 if (isa<ExtractElementInst>(Res)) 611 Res->setName(Name); 612 return Res; 613 } 614 case VPInstruction::LogicalAnd: { 615 Value *A = State.get(getOperand(0)); 616 Value *B = State.get(getOperand(1)); 617 return Builder.CreateLogicalAnd(A, B, Name); 618 } 619 case VPInstruction::PtrAdd: { 620 assert(vputils::onlyFirstLaneUsed(this) && 621 "can only generate first lane for PtrAdd"); 622 Value *Ptr = State.get(getOperand(0), VPLane(0)); 623 Value *Addend = State.get(getOperand(1), VPLane(0)); 624 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); 625 } 626 case VPInstruction::ResumePhi: { 627 Value *IncomingFromVPlanPred = 628 State.get(getOperand(0), /* IsScalar */ true); 629 Value *IncomingFromOtherPreds = 630 State.get(getOperand(1), /* IsScalar */ true); 631 auto *NewPhi = 632 Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name); 633 BasicBlock *VPlanPred = 634 State.CFG 635 .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])]; 636 NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred); 637 for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) { 638 if (OtherPred == VPlanPred) 639 continue; 640 NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred); 641 } 642 return NewPhi; 643 } 644 case VPInstruction::AnyOf: { 645 Value *A = State.get(getOperand(0)); 646 return Builder.CreateOrReduce(A); 647 } 648 649 default: 650 llvm_unreachable("Unsupported opcode for instruction"); 651 } 652 } 653 654 bool VPInstruction::isVectorToScalar() const { 655 return getOpcode() == VPInstruction::ExtractFromEnd || 656 getOpcode() == VPInstruction::ComputeReductionResult || 657 getOpcode() == VPInstruction::AnyOf; 658 } 659 660 bool VPInstruction::isSingleScalar() const { 661 return getOpcode() == VPInstruction::ResumePhi; 662 } 663 664 #if !defined(NDEBUG) 665 bool VPInstruction::isFPMathOp() const { 666 // Inspired by FPMathOperator::classof. Notable differences are that we don't 667 // support Call, PHI and Select opcodes here yet. 
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
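    // For the opcodes above, Op is only needed for lane 0 when every user of
    // this VPInstruction in turn only uses the first lane of its result.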
723 return vputils::onlyFirstLaneUsed(this); 724 case VPInstruction::ActiveLaneMask: 725 case VPInstruction::ExplicitVectorLength: 726 case VPInstruction::CalculateTripCountMinusVF: 727 case VPInstruction::CanonicalIVIncrementForPart: 728 case VPInstruction::BranchOnCount: 729 case VPInstruction::BranchOnCond: 730 case VPInstruction::ResumePhi: 731 return true; 732 }; 733 llvm_unreachable("switch should return"); 734 } 735 736 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 737 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 738 if (Instruction::isBinaryOp(getOpcode())) 739 return vputils::onlyFirstPartUsed(this); 740 741 switch (getOpcode()) { 742 default: 743 return false; 744 case Instruction::ICmp: 745 case Instruction::Select: 746 return vputils::onlyFirstPartUsed(this); 747 case VPInstruction::BranchOnCount: 748 case VPInstruction::BranchOnCond: 749 case VPInstruction::CanonicalIVIncrementForPart: 750 return true; 751 }; 752 llvm_unreachable("switch should return"); 753 } 754 755 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 756 void VPInstruction::dump() const { 757 VPSlotTracker SlotTracker(getParent()->getPlan()); 758 print(dbgs(), "", SlotTracker); 759 } 760 761 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 762 VPSlotTracker &SlotTracker) const { 763 O << Indent << "EMIT "; 764 765 if (hasResult()) { 766 printAsOperand(O, SlotTracker); 767 O << " = "; 768 } 769 770 switch (getOpcode()) { 771 case VPInstruction::Not: 772 O << "not"; 773 break; 774 case VPInstruction::SLPLoad: 775 O << "combined load"; 776 break; 777 case VPInstruction::SLPStore: 778 O << "combined store"; 779 break; 780 case VPInstruction::ActiveLaneMask: 781 O << "active lane mask"; 782 break; 783 case VPInstruction::ResumePhi: 784 O << "resume-phi"; 785 break; 786 case VPInstruction::ExplicitVectorLength: 787 O << "EXPLICIT-VECTOR-LENGTH"; 788 break; 789 case VPInstruction::FirstOrderRecurrenceSplice: 790 O << "first-order splice"; 791 break; 792 case VPInstruction::BranchOnCond: 793 O << "branch-on-cond"; 794 break; 795 case VPInstruction::CalculateTripCountMinusVF: 796 O << "TC > VF ? TC - VF : 0"; 797 break; 798 case VPInstruction::CanonicalIVIncrementForPart: 799 O << "VF * Part +"; 800 break; 801 case VPInstruction::BranchOnCount: 802 O << "branch-on-count"; 803 break; 804 case VPInstruction::ExtractFromEnd: 805 O << "extract-from-end"; 806 break; 807 case VPInstruction::ComputeReductionResult: 808 O << "compute-reduction-result"; 809 break; 810 case VPInstruction::LogicalAnd: 811 O << "logical-and"; 812 break; 813 case VPInstruction::PtrAdd: 814 O << "ptradd"; 815 break; 816 case VPInstruction::AnyOf: 817 O << "any-of"; 818 break; 819 default: 820 O << Instruction::getOpcodeName(getOpcode()); 821 } 822 823 printFlags(O); 824 printOperands(O, SlotTracker); 825 826 if (auto DL = getDebugLoc()) { 827 O << ", !dbg "; 828 DL.print(O); 829 } 830 } 831 #endif 832 833 void VPIRInstruction::execute(VPTransformState &State) { 834 assert((isa<PHINode>(&I) || getNumOperands() == 0) && 835 "Only PHINodes can have extra operands"); 836 for (const auto &[Idx, Op] : enumerate(operands())) { 837 VPValue *ExitValue = Op; 838 auto Lane = vputils::isUniformAfterVectorization(ExitValue) 839 ? 
VPLane::getFirstLane() 840 : VPLane::getLastLaneForVF(State.VF); 841 VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; 842 auto *PredVPBB = Pred->getExitingBasicBlock(); 843 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; 844 // Set insertion point in PredBB in case an extract needs to be generated. 845 // TODO: Model extracts explicitly. 846 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); 847 Value *V = State.get(ExitValue, VPLane(Lane)); 848 auto *Phi = cast<PHINode>(&I); 849 // If there is no existing block for PredBB in the phi, add a new incoming 850 // value. Otherwise update the existing incoming value for PredBB. 851 if (Phi->getBasicBlockIndex(PredBB) == -1) 852 Phi->addIncoming(V, PredBB); 853 else 854 Phi->setIncomingValueForBlock(PredBB, V); 855 } 856 857 // Advance the insert point after the wrapped IR instruction. This allows 858 // interleaving VPIRInstructions and other recipes. 859 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); 860 } 861 862 InstructionCost VPIRInstruction::computeCost(ElementCount VF, 863 VPCostContext &Ctx) const { 864 // The recipe wraps an existing IR instruction on the border of VPlan's scope, 865 // hence it does not contribute to the cost-modeling for the VPlan. 866 return 0; 867 } 868 869 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 870 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, 871 VPSlotTracker &SlotTracker) const { 872 O << Indent << "IR " << I; 873 874 if (getNumOperands() != 0) { 875 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; 876 interleaveComma( 877 enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { 878 Op.value()->printAsOperand(O, SlotTracker); 879 O << " from "; 880 getParent()->getPredecessors()[Op.index()]->printAsOperand(O); 881 }); 882 O << ")"; 883 } 884 } 885 #endif 886 887 void VPWidenCallRecipe::execute(VPTransformState &State) { 888 assert(State.VF.isVector() && "not widening"); 889 State.setDebugLocFrom(getDebugLoc()); 890 891 FunctionType *VFTy = Variant->getFunctionType(); 892 // Add return type if intrinsic is overloaded on it. 893 SmallVector<Value *, 4> Args; 894 for (const auto &I : enumerate(arg_operands())) { 895 Value *Arg; 896 // Some vectorized function variants may also take a scalar argument, 897 // e.g. linear parameters for pointers. This needs to be the scalar value 898 // from the start of the respective part when interleaving. 
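    // For instance, a vector variant declared via 'declare simd' with a
    // linear pointer parameter keeps that parameter scalar in its vectorized
    // signature; the parameter-type check below detects exactly this case.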
899 if (!VFTy->getParamType(I.index())->isVectorTy()) 900 Arg = State.get(I.value(), VPLane(0)); 901 else 902 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 903 Args.push_back(Arg); 904 } 905 906 assert(Variant != nullptr && "Can't create vector function."); 907 908 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 909 SmallVector<OperandBundleDef, 1> OpBundles; 910 if (CI) 911 CI->getOperandBundlesAsDefs(OpBundles); 912 913 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); 914 setFlags(V); 915 916 if (!V->getType()->isVoidTy()) 917 State.set(this, V); 918 State.addMetadata(V, CI); 919 } 920 921 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, 922 VPCostContext &Ctx) const { 923 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 924 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), 925 Variant->getFunctionType()->params(), 926 CostKind); 927 } 928 929 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 930 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 931 VPSlotTracker &SlotTracker) const { 932 O << Indent << "WIDEN-CALL "; 933 934 Function *CalledFn = getCalledScalarFunction(); 935 if (CalledFn->getReturnType()->isVoidTy()) 936 O << "void "; 937 else { 938 printAsOperand(O, SlotTracker); 939 O << " = "; 940 } 941 942 O << "call"; 943 printFlags(O); 944 O << " @" << CalledFn->getName() << "("; 945 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { 946 Op->printAsOperand(O, SlotTracker); 947 }); 948 O << ")"; 949 950 O << " (using library function"; 951 if (Variant->hasName()) 952 O << ": " << Variant->getName(); 953 O << ")"; 954 } 955 #endif 956 957 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { 958 assert(State.VF.isVector() && "not widening"); 959 State.setDebugLocFrom(getDebugLoc()); 960 961 SmallVector<Type *, 2> TysForDecl; 962 // Add return type if intrinsic is overloaded on it. 963 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) 964 TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); 965 SmallVector<Value *, 4> Args; 966 for (const auto &I : enumerate(operands())) { 967 // Some intrinsics have a scalar argument - don't replace it with a 968 // vector. 969 Value *Arg; 970 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), 971 State.TTI)) 972 Arg = State.get(I.value(), VPLane(0)); 973 else 974 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 975 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), 976 State.TTI)) 977 TysForDecl.push_back(Arg->getType()); 978 Args.push_back(Arg); 979 } 980 981 // Use vector version of the intrinsic. 
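  // TysForDecl holds the overload types collected above; they determine the
  // mangled name of the vector intrinsic declaration requested below.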
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

  setFlags(V);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      // Push all the VP intrinsic's operands into Arguments, even if the value
      // is nullptr. Some VP intrinsics' cost computations assert on the number
      // of parameters. This mainly happens in two scenarios:
      // 1. The EVL operand is nullptr.
      // 2. The argument of the VP intrinsic is itself a VP intrinsic.
      if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
        Arguments.push_back(V);
        continue;
      }
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(VectorIntrinsicID);
}

bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  // Vector predication intrinsics only demand the first lane of the last
  // operand (the EVL operand).
1056 return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) && 1057 Op == getOperand(getNumOperands() - 1); 1058 } 1059 1060 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1061 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, 1062 VPSlotTracker &SlotTracker) const { 1063 O << Indent << "WIDEN-INTRINSIC "; 1064 if (ResultTy->isVoidTy()) { 1065 O << "void "; 1066 } else { 1067 printAsOperand(O, SlotTracker); 1068 O << " = "; 1069 } 1070 1071 O << "call"; 1072 printFlags(O); 1073 O << getIntrinsicName() << "("; 1074 1075 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { 1076 Op->printAsOperand(O, SlotTracker); 1077 }); 1078 O << ")"; 1079 } 1080 #endif 1081 1082 void VPHistogramRecipe::execute(VPTransformState &State) { 1083 State.setDebugLocFrom(getDebugLoc()); 1084 IRBuilderBase &Builder = State.Builder; 1085 1086 Value *Address = State.get(getOperand(0)); 1087 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); 1088 VectorType *VTy = cast<VectorType>(Address->getType()); 1089 1090 // The histogram intrinsic requires a mask even if the recipe doesn't; 1091 // if the mask operand was omitted then all lanes should be executed and 1092 // we just need to synthesize an all-true mask. 1093 Value *Mask = nullptr; 1094 if (VPValue *VPMask = getMask()) 1095 Mask = State.get(VPMask); 1096 else 1097 Mask = 1098 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); 1099 1100 // If this is a subtract, we want to invert the increment amount. We may 1101 // add a separate intrinsic in future, but for now we'll try this. 1102 if (Opcode == Instruction::Sub) 1103 IncAmt = Builder.CreateNeg(IncAmt); 1104 else 1105 assert(Opcode == Instruction::Add && "only add or sub supported for now"); 1106 1107 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, 1108 {VTy, IncAmt->getType()}, 1109 {Address, IncAmt, Mask}); 1110 } 1111 1112 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, 1113 VPCostContext &Ctx) const { 1114 // FIXME: Take the gather and scatter into account as well. For now we're 1115 // generating the same cost as the fallback path, but we'll likely 1116 // need to create a new TTI method for determining the cost, including 1117 // whether we can use base + vec-of-smaller-indices or just 1118 // vec-of-pointers. 1119 assert(VF.isVector() && "Invalid VF for histogram cost"); 1120 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); 1121 VPValue *IncAmt = getOperand(1); 1122 Type *IncTy = Ctx.Types.inferScalarType(IncAmt); 1123 VectorType *VTy = VectorType::get(IncTy, VF); 1124 1125 // Assume that a non-constant update value (or a constant != 1) requires 1126 // a multiply, and add that into the cost. 1127 InstructionCost MulCost = 1128 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy); 1129 if (IncAmt->isLiveIn()) { 1130 ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue()); 1131 1132 if (CI && CI->getZExtValue() == 1) 1133 MulCost = TTI::TCC_Free; 1134 } 1135 1136 // Find the cost of the histogram operation itself. 1137 Type *PtrTy = VectorType::get(AddressTy, VF); 1138 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); 1139 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 1140 Type::getVoidTy(Ctx.LLVMCtx), 1141 {PtrTy, IncTy, MaskTy}); 1142 1143 // Add the costs together with the add/sub operation. 
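  // That is: histogram-intrinsic cost + (possibly free) multiply cost + cost
  // of the widened add/sub of the increment.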
1144 return Ctx.TTI.getIntrinsicInstrCost( 1145 ICA, TargetTransformInfo::TCK_RecipThroughput) + 1146 MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy); 1147 } 1148 1149 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1150 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, 1151 VPSlotTracker &SlotTracker) const { 1152 O << Indent << "WIDEN-HISTOGRAM buckets: "; 1153 getOperand(0)->printAsOperand(O, SlotTracker); 1154 1155 if (Opcode == Instruction::Sub) 1156 O << ", dec: "; 1157 else { 1158 assert(Opcode == Instruction::Add); 1159 O << ", inc: "; 1160 } 1161 getOperand(1)->printAsOperand(O, SlotTracker); 1162 1163 if (VPValue *Mask = getMask()) { 1164 O << ", mask: "; 1165 Mask->printAsOperand(O, SlotTracker); 1166 } 1167 } 1168 1169 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 1170 VPSlotTracker &SlotTracker) const { 1171 O << Indent << "WIDEN-SELECT "; 1172 printAsOperand(O, SlotTracker); 1173 O << " = select "; 1174 getOperand(0)->printAsOperand(O, SlotTracker); 1175 O << ", "; 1176 getOperand(1)->printAsOperand(O, SlotTracker); 1177 O << ", "; 1178 getOperand(2)->printAsOperand(O, SlotTracker); 1179 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 1180 } 1181 #endif 1182 1183 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1184 State.setDebugLocFrom(getDebugLoc()); 1185 1186 // The condition can be loop invariant but still defined inside the 1187 // loop. This means that we can't just use the original 'cond' value. 1188 // We have to take the 'vectorized' value and pick the first lane. 1189 // Instcombine will make this a no-op. 1190 auto *InvarCond = 1191 isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; 1192 1193 Value *Cond = InvarCond ? InvarCond : State.get(getCond()); 1194 Value *Op0 = State.get(getOperand(1)); 1195 Value *Op1 = State.get(getOperand(2)); 1196 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1197 State.set(this, Sel); 1198 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1199 } 1200 1201 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, 1202 VPCostContext &Ctx) const { 1203 SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); 1204 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); 1205 Type *ScalarTy = Ctx.Types.inferScalarType(this); 1206 Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); 1207 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1208 1209 VPValue *Op0, *Op1; 1210 using namespace llvm::VPlanPatternMatch; 1211 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && 1212 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || 1213 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { 1214 // select x, y, false --> x & y 1215 // select x, true, y --> x | y 1216 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); 1217 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); 1218 1219 SmallVector<const Value *, 2> Operands; 1220 if (all_of(operands(), 1221 [](VPValue *Op) { return Op->getUnderlyingValue(); })) 1222 Operands.append(SI->op_begin(), SI->op_end()); 1223 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); 1224 return Ctx.TTI.getArithmeticInstrCost( 1225 IsLogicalOr ? 
Instruction::Or : Instruction::And, VectorTy, CostKind, 1226 {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); 1227 } 1228 1229 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); 1230 if (!ScalarCond) 1231 CondTy = VectorType::get(CondTy, VF); 1232 1233 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 1234 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 1235 Pred = Cmp->getPredicate(); 1236 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred, 1237 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 1238 {TTI::OK_AnyValue, TTI::OP_None}, SI); 1239 } 1240 1241 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( 1242 const FastMathFlags &FMF) { 1243 AllowReassoc = FMF.allowReassoc(); 1244 NoNaNs = FMF.noNaNs(); 1245 NoInfs = FMF.noInfs(); 1246 NoSignedZeros = FMF.noSignedZeros(); 1247 AllowReciprocal = FMF.allowReciprocal(); 1248 AllowContract = FMF.allowContract(); 1249 ApproxFunc = FMF.approxFunc(); 1250 } 1251 1252 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1253 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { 1254 switch (OpType) { 1255 case OperationType::Cmp: 1256 O << " " << CmpInst::getPredicateName(getPredicate()); 1257 break; 1258 case OperationType::DisjointOp: 1259 if (DisjointFlags.IsDisjoint) 1260 O << " disjoint"; 1261 break; 1262 case OperationType::PossiblyExactOp: 1263 if (ExactFlags.IsExact) 1264 O << " exact"; 1265 break; 1266 case OperationType::OverflowingBinOp: 1267 if (WrapFlags.HasNUW) 1268 O << " nuw"; 1269 if (WrapFlags.HasNSW) 1270 O << " nsw"; 1271 break; 1272 case OperationType::FPMathOp: 1273 getFastMathFlags().print(O); 1274 break; 1275 case OperationType::GEPOp: 1276 if (GEPFlags.isInBounds()) 1277 O << " inbounds"; 1278 else if (GEPFlags.hasNoUnsignedSignedWrap()) 1279 O << " nusw"; 1280 if (GEPFlags.hasNoUnsignedWrap()) 1281 O << " nuw"; 1282 break; 1283 case OperationType::NonNegOp: 1284 if (NonNegFlags.NonNeg) 1285 O << " nneg"; 1286 break; 1287 case OperationType::Other: 1288 break; 1289 } 1290 if (getNumOperands() > 0) 1291 O << " "; 1292 } 1293 #endif 1294 1295 void VPWidenRecipe::execute(VPTransformState &State) { 1296 State.setDebugLocFrom(getDebugLoc()); 1297 auto &Builder = State.Builder; 1298 switch (Opcode) { 1299 case Instruction::Call: 1300 case Instruction::Br: 1301 case Instruction::PHI: 1302 case Instruction::GetElementPtr: 1303 case Instruction::Select: 1304 llvm_unreachable("This instruction is handled by a different recipe."); 1305 case Instruction::UDiv: 1306 case Instruction::SDiv: 1307 case Instruction::SRem: 1308 case Instruction::URem: 1309 case Instruction::Add: 1310 case Instruction::FAdd: 1311 case Instruction::Sub: 1312 case Instruction::FSub: 1313 case Instruction::FNeg: 1314 case Instruction::Mul: 1315 case Instruction::FMul: 1316 case Instruction::FDiv: 1317 case Instruction::FRem: 1318 case Instruction::Shl: 1319 case Instruction::LShr: 1320 case Instruction::AShr: 1321 case Instruction::And: 1322 case Instruction::Or: 1323 case Instruction::Xor: { 1324 // Just widen unops and binops. 1325 SmallVector<Value *, 2> Ops; 1326 for (VPValue *VPOp : operands()) 1327 Ops.push_back(State.get(VPOp)); 1328 1329 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1330 1331 if (auto *VecOp = dyn_cast<Instruction>(V)) 1332 setFlags(VecOp); 1333 1334 // Use this vector value for all users of the original instruction. 
    State.set(this, V);
    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));

    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
      if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
        Builder.setFastMathFlags(I->getFastMathFlags());
      C = Builder.CreateFCmp(getPredicate(), A, B);
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    State.set(this, C);
    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
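    // E.g. on several x86 subtargets, 'shl <4 x i32> %v, <i32 3, i32 3, i32 3,
    // i32 3>' is cheaper than a shift by a vector of variable per-lane amounts.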
1417 TargetTransformInfo::OperandValueInfo RHSInfo = { 1418 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; 1419 if (RHS->isLiveIn()) 1420 RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue()); 1421 1422 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && 1423 getOperand(1)->isDefinedOutsideLoopRegions()) 1424 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; 1425 Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); 1426 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1427 1428 SmallVector<const Value *, 4> Operands; 1429 if (CtxI) 1430 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); 1431 return Ctx.TTI.getArithmeticInstrCost( 1432 Opcode, VectorTy, CostKind, 1433 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1434 RHSInfo, Operands, CtxI, &Ctx.TLI); 1435 } 1436 case Instruction::Freeze: { 1437 // This opcode is unknown. Assume that it is the same as 'mul'. 1438 Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); 1439 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 1440 } 1441 case Instruction::ICmp: 1442 case Instruction::FCmp: { 1443 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1444 Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 1445 return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), 1446 CostKind, 1447 {TTI::OK_AnyValue, TTI::OP_None}, 1448 {TTI::OK_AnyValue, TTI::OP_None}, CtxI); 1449 } 1450 default: 1451 llvm_unreachable("Unsupported opcode for instruction"); 1452 } 1453 } 1454 1455 void VPWidenEVLRecipe::execute(VPTransformState &State) { 1456 unsigned Opcode = getOpcode(); 1457 // TODO: Support other opcodes 1458 if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) 1459 llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); 1460 1461 State.setDebugLocFrom(getDebugLoc()); 1462 1463 assert(State.get(getOperand(0))->getType()->isVectorTy() && 1464 "VPWidenEVLRecipe should not be used for scalars"); 1465 1466 VPValue *EVL = getEVL(); 1467 Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true); 1468 IRBuilderBase &BuilderIR = State.Builder; 1469 VectorBuilder Builder(BuilderIR); 1470 Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); 1471 1472 SmallVector<Value *, 4> Ops; 1473 for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { 1474 VPValue *VPOp = getOperand(I); 1475 Ops.push_back(State.get(VPOp)); 1476 } 1477 1478 Builder.setMask(Mask).setEVL(EVLArg); 1479 Value *VPInst = 1480 Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); 1481 // Currently vp-intrinsics only accept FMF flags. 1482 // TODO: Enable other flags when support is added. 
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp." << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  if (auto *CastOp = dyn_cast<Instruction>(Cast))
    setFlags(CastOp);
}

InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
  // the legacy cost model, including truncates/extends when evaluating a
  // reduction in a smaller type.
  if (!getUnderlyingValue())
    return 0;
  // Computes the CastContextHint from a recipe that may access memory.
  auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (isa<VPInterleaveRecipe>(R))
      return TTI::CastContextHint::Interleave;
    if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
      return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                             : TTI::CastContextHint::Normal;
    const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
    if (WidenMemoryRecipe == nullptr)
      return TTI::CastContextHint::None;
    if (!WidenMemoryRecipe->isConsecutive())
      return TTI::CastContextHint::GatherScatter;
    if (WidenMemoryRecipe->isReverse())
      return TTI::CastContextHint::Reversed;
    if (WidenMemoryRecipe->isMasked())
      return TTI::CastContextHint::Masked;
    return TTI::CastContextHint::Normal;
  };

  VPValue *Operand = getOperand(0);
  TTI::CastContextHint CCH = TTI::CastContextHint::None;
  // For Trunc/FPTrunc, get the context from the only user.
  if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
      !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
    if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
      CCH = ComputeCCH(StoreRecipe);
  }
  // For Z/Sext, get the context from the operand.
1563 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1564 Opcode == Instruction::FPExt) {
1565 if (Operand->isLiveIn())
1566 CCH = TTI::CastContextHint::Normal;
1567 else if (Operand->getDefiningRecipe())
1568 CCH = ComputeCCH(Operand->getDefiningRecipe());
1569 }
1570
1571 auto *SrcTy =
1572 cast<VectorType>(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF));
1573 auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));
1574 // Arm TTI will use the underlying instruction to determine the cost.
1575 return Ctx.TTI.getCastInstrCost(
1576 Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
1577 dyn_cast_if_present<Instruction>(getUnderlyingValue()));
1578 }
1579
1580 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1581 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1582 VPSlotTracker &SlotTracker) const {
1583 O << Indent << "WIDEN-CAST ";
1584 printAsOperand(O, SlotTracker);
1585 O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1586 printFlags(O);
1587 printOperands(O, SlotTracker);
1588 O << " to " << *getResultType();
1589 }
1590 #endif
1591
1592 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
1593 VPCostContext &Ctx) const {
1594 return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
1595 }
1596
1597 /// This function adds
1598 /// (0 * Step, 1 * Step, 2 * Step, ...)
1599 /// to each vector element of Val.
1600 /// \p BinOp is relevant only for FP induction variables.
1601 static Value *getStepVector(Value *Val, Value *Step,
1602 Instruction::BinaryOps BinOp, ElementCount VF,
1603 IRBuilderBase &Builder) {
1604 assert(VF.isVector() && "only vector VFs are supported");
1605
1606 // Create and check the types.
1607 auto *ValVTy = cast<VectorType>(Val->getType());
1608 ElementCount VLen = ValVTy->getElementCount();
1609
1610 Type *STy = Val->getType()->getScalarType();
1611 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1612 "Induction Step must be an integer or FP");
1613 assert(Step->getType() == STy && "Step has wrong type");
1614
1615 SmallVector<Constant *, 8> Indices;
1616
1617 // Create a vector of consecutive numbers 0, 1, ..., VF-1.
1618 VectorType *InitVecValVTy = ValVTy;
1619 if (STy->isFloatingPointTy()) {
1620 Type *InitVecValSTy =
1621 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1622 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1623 }
1624 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1625
1626 if (STy->isIntegerTy()) {
1627 Step = Builder.CreateVectorSplat(VLen, Step);
1628 assert(Step->getType() == Val->getType() && "Invalid step vec");
1629 // FIXME: The newly created binary instructions should contain nsw/nuw
1630 // flags, which can be found from the original scalar operations.
1631 Step = Builder.CreateMul(InitVec, Step);
1632 return Builder.CreateAdd(Val, Step, "induction");
1633 }
1634
1635 // Floating point induction.
1636 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1637 "Binary Opcode should be specified for FP induction");
1638 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1639
1640 Step = Builder.CreateVectorSplat(VLen, Step);
1641 Value *MulOp = Builder.CreateFMul(InitVec, Step);
1642 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1643 }
1644
1645 /// A helper function that returns an integer or floating-point constant with
1646 /// value C.
1647 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1648 return Ty->isIntegerTy() ?
ConstantInt::getSigned(Ty, C) 1649 : ConstantFP::get(Ty, C); 1650 } 1651 1652 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 1653 assert(!State.Lane && "Int or FP induction being replicated."); 1654 1655 Value *Start = getStartValue()->getLiveInIRValue(); 1656 const InductionDescriptor &ID = getInductionDescriptor(); 1657 TruncInst *Trunc = getTruncInst(); 1658 IRBuilderBase &Builder = State.Builder; 1659 assert(getPHINode()->getType() == ID.getStartValue()->getType() && 1660 "Types must match"); 1661 assert(State.VF.isVector() && "must have vector VF"); 1662 1663 // The value from the original loop to which we are mapping the new induction 1664 // variable. 1665 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode(); 1666 1667 // Fast-math-flags propagate from the original induction instruction. 1668 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 1669 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 1670 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 1671 1672 // Now do the actual transformations, and start with fetching the step value. 1673 Value *Step = State.get(getStepValue(), VPLane(0)); 1674 1675 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1676 "Expected either an induction phi-node or a truncate of it!"); 1677 1678 // Construct the initial value of the vector IV in the vector loop preheader 1679 auto CurrIP = Builder.saveIP(); 1680 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 1681 Builder.SetInsertPoint(VectorPH->getTerminator()); 1682 if (isa<TruncInst>(EntryVal)) { 1683 assert(Start->getType()->isIntegerTy() && 1684 "Truncation requires an integer type"); 1685 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1686 Step = Builder.CreateTrunc(Step, TruncType); 1687 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1688 } 1689 1690 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 1691 Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(), 1692 State.VF, State.Builder); 1693 1694 // We create vector phi nodes for both integer and floating-point induction 1695 // variables. Here, we determine the kind of arithmetic we will perform. 1696 Instruction::BinaryOps AddOp; 1697 Instruction::BinaryOps MulOp; 1698 if (Step->getType()->isIntegerTy()) { 1699 AddOp = Instruction::Add; 1700 MulOp = Instruction::Mul; 1701 } else { 1702 AddOp = ID.getInductionOpcode(); 1703 MulOp = Instruction::FMul; 1704 } 1705 1706 Value *SplatVF; 1707 if (VPValue *SplatVFOperand = getSplatVFValue()) { 1708 // The recipe has been unrolled. In that case, fetch the splat value for the 1709 // induction increment. 1710 SplatVF = State.get(SplatVFOperand); 1711 } else { 1712 // Multiply the vectorization factor by the step using integer or 1713 // floating-point arithmetic as appropriate. 1714 Type *StepType = Step->getType(); 1715 Value *RuntimeVF = State.get(getVFValue(), VPLane(0)); 1716 if (Step->getType()->isFloatingPointTy()) 1717 RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); 1718 else 1719 RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); 1720 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 1721 1722 // Create a vector splat to use in the induction update. 1723 SplatVF = Builder.CreateVectorSplat(State.VF, Mul); 1724 } 1725 1726 Builder.restoreIP(CurrIP); 1727 1728 // We may need to add the step a number of times, depending on the unroll 1729 // factor. The last of those goes into the PHI. 
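// Illustrative sketch (assuming VF = 4, Start = 0, Step = 1 and no
// unrolling): the phi created below enters the loop as <0, 1, 2, 3> from the
// vector preheader and is advanced by the splat <4, 4, 4, 4> (SplatVF) on
// every vector iteration.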
1730 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); 1731 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 1732 VecInd->setDebugLoc(getDebugLoc()); 1733 State.set(this, VecInd); 1734 1735 Instruction *LastInduction = cast<Instruction>( 1736 Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); 1737 if (isa<TruncInst>(EntryVal)) 1738 State.addMetadata(LastInduction, EntryVal); 1739 LastInduction->setDebugLoc(getDebugLoc()); 1740 1741 VecInd->addIncoming(SteppedStart, VectorPH); 1742 // Add induction update using an incorrect block temporarily. The phi node 1743 // will be fixed after VPlan execution. Note that at this point the latch 1744 // block cannot be used, as it does not exist yet. 1745 // TODO: Model increment value in VPlan, by turning the recipe into a 1746 // multi-def and a subclass of VPHeaderPHIRecipe. 1747 VecInd->addIncoming(LastInduction, VectorPH); 1748 } 1749 1750 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1751 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 1752 VPSlotTracker &SlotTracker) const { 1753 O << Indent; 1754 printAsOperand(O, SlotTracker); 1755 O << " = WIDEN-INDUCTION "; 1756 printOperands(O, SlotTracker); 1757 1758 if (auto *TI = getTruncInst()) 1759 O << " (truncated to " << *TI->getType() << ")"; 1760 } 1761 #endif 1762 1763 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 1764 // The step may be defined by a recipe in the preheader (e.g. if it requires 1765 // SCEV expansion), but for the canonical induction the step is required to be 1766 // 1, which is represented as live-in. 1767 if (getStepValue()->getDefiningRecipe()) 1768 return false; 1769 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 1770 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 1771 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 1772 return StartC && StartC->isZero() && StepC && StepC->isOne() && 1773 getScalarType() == CanIV->getScalarType(); 1774 } 1775 1776 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1777 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 1778 VPSlotTracker &SlotTracker) const { 1779 O << Indent; 1780 printAsOperand(O, SlotTracker); 1781 O << " = DERIVED-IV "; 1782 getStartValue()->printAsOperand(O, SlotTracker); 1783 O << " + "; 1784 getOperand(1)->printAsOperand(O, SlotTracker); 1785 O << " * "; 1786 getStepValue()->printAsOperand(O, SlotTracker); 1787 } 1788 #endif 1789 1790 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 1791 // Fast-math-flags propagate from the original induction instruction. 1792 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 1793 if (hasFastMathFlags()) 1794 State.Builder.setFastMathFlags(getFastMathFlags()); 1795 1796 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 1797 /// variable on which to base the steps, \p Step is the size of the step. 1798 1799 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 1800 Value *Step = State.get(getStepValue(), VPLane(0)); 1801 IRBuilderBase &Builder = State.Builder; 1802 1803 // Ensure step has the same type as that of scalar IV. 1804 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 1805 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 1806 1807 // We build scalar steps for both integer and floating-point induction 1808 // variables. Here, we determine the kind of arithmetic we will perform. 
1809 Instruction::BinaryOps AddOp; 1810 Instruction::BinaryOps MulOp; 1811 if (BaseIVTy->isIntegerTy()) { 1812 AddOp = Instruction::Add; 1813 MulOp = Instruction::Mul; 1814 } else { 1815 AddOp = InductionOpcode; 1816 MulOp = Instruction::FMul; 1817 } 1818 1819 // Determine the number of scalars we need to generate for each unroll 1820 // iteration. 1821 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 1822 // Compute the scalar steps and save the results in State. 1823 Type *IntStepTy = 1824 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 1825 Type *VecIVTy = nullptr; 1826 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 1827 if (!FirstLaneOnly && State.VF.isScalable()) { 1828 VecIVTy = VectorType::get(BaseIVTy, State.VF); 1829 UnitStepVec = 1830 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 1831 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 1832 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 1833 } 1834 1835 unsigned StartLane = 0; 1836 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 1837 if (State.Lane) { 1838 StartLane = State.Lane->getKnownLane(); 1839 EndLane = StartLane + 1; 1840 } 1841 Value *StartIdx0 = 1842 createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); 1843 1844 if (!FirstLaneOnly && State.VF.isScalable()) { 1845 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 1846 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 1847 if (BaseIVTy->isFloatingPointTy()) 1848 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 1849 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 1850 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 1851 State.set(this, Add); 1852 // It's useful to record the lane values too for the known minimum number 1853 // of elements so we do those below. This improves the code quality when 1854 // trying to extract the first element, for example. 1855 } 1856 1857 if (BaseIVTy->isFloatingPointTy()) 1858 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 1859 1860 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 1861 Value *StartIdx = Builder.CreateBinOp( 1862 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 1863 // The step returned by `createStepForVF` is a runtime-evaluated value 1864 // when VF is scalable. Otherwise, it should be folded into a Constant. 1865 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 1866 "Expected StartIdx to be folded to a constant when VF is not " 1867 "scalable"); 1868 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 1869 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 1870 State.set(this, Add, VPLane(Lane)); 1871 } 1872 } 1873 1874 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1875 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 1876 VPSlotTracker &SlotTracker) const { 1877 O << Indent; 1878 printAsOperand(O, SlotTracker); 1879 O << " = SCALAR-STEPS "; 1880 printOperands(O, SlotTracker); 1881 } 1882 #endif 1883 1884 void VPWidenGEPRecipe::execute(VPTransformState &State) { 1885 assert(State.VF.isVector() && "not widening"); 1886 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 1887 // Construct a vector GEP by widening the operands of the scalar GEP as 1888 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 1889 // results in a vector of pointers when at least one operand of the GEP 1890 // is vector-typed. 
Thus, to keep the representation compact, we only use 1891 // vector-typed operands for loop-varying values. 1892 1893 if (areAllOperandsInvariant()) { 1894 // If we are vectorizing, but the GEP has only loop-invariant operands, 1895 // the GEP we build (by only using vector-typed operands for 1896 // loop-varying values) would be a scalar pointer. Thus, to ensure we 1897 // produce a vector of pointers, we need to either arbitrarily pick an 1898 // operand to broadcast, or broadcast a clone of the original GEP. 1899 // Here, we broadcast a clone of the original. 1900 // 1901 // TODO: If at some point we decide to scalarize instructions having 1902 // loop-invariant operands, this special case will no longer be 1903 // required. We would add the scalarization decision to 1904 // collectLoopScalars() and teach getVectorValue() to broadcast 1905 // the lane-zero scalar value. 1906 SmallVector<Value *> Ops; 1907 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 1908 Ops.push_back(State.get(getOperand(I), VPLane(0))); 1909 1910 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 1911 ArrayRef(Ops).drop_front(), "", 1912 getGEPNoWrapFlags()); 1913 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 1914 State.set(this, Splat); 1915 State.addMetadata(Splat, GEP); 1916 } else { 1917 // If the GEP has at least one loop-varying operand, we are sure to 1918 // produce a vector of pointers unless VF is scalar. 1919 // The pointer operand of the new GEP. If it's loop-invariant, we 1920 // won't broadcast it. 1921 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 1922 : State.get(getOperand(0)); 1923 1924 // Collect all the indices for the new GEP. If any index is 1925 // loop-invariant, we won't broadcast it. 1926 SmallVector<Value *, 4> Indices; 1927 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 1928 VPValue *Operand = getOperand(I); 1929 if (isIndexLoopInvariant(I - 1)) 1930 Indices.push_back(State.get(Operand, VPLane(0))); 1931 else 1932 Indices.push_back(State.get(Operand)); 1933 } 1934 1935 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 1936 // but it should be a vector, otherwise. 1937 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 1938 Indices, "", getGEPNoWrapFlags()); 1939 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 1940 "NewGEP is not a pointer vector"); 1941 State.set(this, NewGEP); 1942 State.addMetadata(NewGEP, GEP); 1943 } 1944 } 1945 1946 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1947 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 1948 VPSlotTracker &SlotTracker) const { 1949 O << Indent << "WIDEN-GEP "; 1950 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 1951 for (size_t I = 0; I < getNumOperands() - 1; ++I) 1952 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 1953 1954 O << " "; 1955 printAsOperand(O, SlotTracker); 1956 O << " = getelementptr"; 1957 printFlags(O); 1958 printOperands(O, SlotTracker); 1959 } 1960 #endif 1961 1962 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, 1963 unsigned CurrentPart, IRBuilderBase &Builder) { 1964 // Use i32 for the gep index type when the value is constant, 1965 // or query DataLayout for a more suitable index type otherwise. 1966 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 1967 return IsScalable && (IsReverse || CurrentPart > 0) 1968 ? 
DL.getIndexType(Builder.getPtrTy(0)) 1969 : Builder.getInt32Ty(); 1970 } 1971 1972 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) { 1973 auto &Builder = State.Builder; 1974 State.setDebugLocFrom(getDebugLoc()); 1975 unsigned CurrentPart = getUnrollPart(*this); 1976 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, 1977 CurrentPart, Builder); 1978 1979 // The wide store needs to start at the last vector element. 1980 Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); 1981 if (IndexTy != RunTimeVF->getType()) 1982 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); 1983 // NumElt = -CurrentPart * RunTimeVF 1984 Value *NumElt = Builder.CreateMul( 1985 ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); 1986 // LastLane = 1 - RunTimeVF 1987 Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1988 Value *Ptr = State.get(getOperand(0), VPLane(0)); 1989 Value *ResultPtr = 1990 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); 1991 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", 1992 getGEPNoWrapFlags()); 1993 1994 State.set(this, ResultPtr, /*IsScalar*/ true); 1995 } 1996 1997 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1998 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1999 VPSlotTracker &SlotTracker) const { 2000 O << Indent; 2001 printAsOperand(O, SlotTracker); 2002 O << " = reverse-vector-pointer"; 2003 printFlags(O); 2004 O << " "; 2005 printOperands(O, SlotTracker); 2006 } 2007 #endif 2008 2009 void VPVectorPointerRecipe::execute(VPTransformState &State) { 2010 auto &Builder = State.Builder; 2011 State.setDebugLocFrom(getDebugLoc()); 2012 unsigned CurrentPart = getUnrollPart(*this); 2013 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, 2014 CurrentPart, Builder); 2015 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2016 2017 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 2018 Value *ResultPtr = 2019 Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); 2020 2021 State.set(this, ResultPtr, /*IsScalar*/ true); 2022 } 2023 2024 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2025 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2026 VPSlotTracker &SlotTracker) const { 2027 O << Indent; 2028 printAsOperand(O, SlotTracker); 2029 O << " = vector-pointer "; 2030 2031 printOperands(O, SlotTracker); 2032 } 2033 #endif 2034 2035 void VPBlendRecipe::execute(VPTransformState &State) { 2036 assert(isNormalized() && "Expected blend to be normalized!"); 2037 State.setDebugLocFrom(getDebugLoc()); 2038 // We know that all PHIs in non-header blocks are converted into 2039 // selects, so we don't have to worry about the insertion order and we 2040 // can just use the builder. 2041 // At this point we generate the predication tree. There may be 2042 // duplications since this is a simple recursive scan, but future 2043 // optimizations will clean it up. 2044 2045 unsigned NumIncoming = getNumIncomingValues(); 2046 2047 // Generate a sequence of selects of the form: 2048 // SELECT(Mask3, In3, 2049 // SELECT(Mask2, In2, 2050 // SELECT(Mask1, In1, 2051 // In0))) 2052 // Note that Mask0 is never used: lanes for which no path reaches this phi and 2053 // are essentially undef are taken from In0. 
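// Illustrative result for three incoming values, assuming VF = 4 (value
// names assumed):
//   %sel1    = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
//   %predphi = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %sel1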
2054 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 2055 Value *Result = nullptr; 2056 for (unsigned In = 0; In < NumIncoming; ++In) { 2057 // We might have single edge PHIs (blocks) - use an identity 2058 // 'select' for the first PHI operand. 2059 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 2060 if (In == 0) 2061 Result = In0; // Initialize with the first incoming value. 2062 else { 2063 // Select between the current value and the previous incoming edge 2064 // based on the incoming mask. 2065 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 2066 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 2067 } 2068 } 2069 State.set(this, Result, OnlyFirstLaneUsed); 2070 } 2071 2072 InstructionCost VPBlendRecipe::computeCost(ElementCount VF, 2073 VPCostContext &Ctx) const { 2074 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2075 2076 // Handle cases where only the first lane is used the same way as the legacy 2077 // cost model. 2078 if (vputils::onlyFirstLaneUsed(this)) 2079 return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); 2080 2081 Type *ResultTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); 2082 Type *CmpTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); 2083 return (getNumIncomingValues() - 1) * 2084 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, 2085 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2086 } 2087 2088 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2089 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 2090 VPSlotTracker &SlotTracker) const { 2091 O << Indent << "BLEND "; 2092 printAsOperand(O, SlotTracker); 2093 O << " ="; 2094 if (getNumIncomingValues() == 1) { 2095 // Not a User of any mask: not really blending, this is a 2096 // single-predecessor phi. 2097 O << " "; 2098 getIncomingValue(0)->printAsOperand(O, SlotTracker); 2099 } else { 2100 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 2101 O << " "; 2102 getIncomingValue(I)->printAsOperand(O, SlotTracker); 2103 if (I == 0) 2104 continue; 2105 O << "/"; 2106 getMask(I)->printAsOperand(O, SlotTracker); 2107 } 2108 } 2109 } 2110 #endif 2111 2112 void VPReductionRecipe::execute(VPTransformState &State) { 2113 assert(!State.Lane && "Reduction being replicated."); 2114 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2115 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2116 // Propagate the fast-math flags carried by the underlying instruction. 2117 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 2118 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2119 Value *NewVecOp = State.get(getVecOp()); 2120 if (VPValue *Cond = getCondOp()) { 2121 Value *NewCond = State.get(Cond, State.VF.isScalar()); 2122 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 2123 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 2124 2125 Value *Start; 2126 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) 2127 Start = RdxDesc.getRecurrenceStartValue(); 2128 else 2129 Start = llvm::getRecurrenceIdentity(Kind, ElementTy, 2130 RdxDesc.getFastMathFlags()); 2131 if (State.VF.isVector()) 2132 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 2133 2134 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 2135 NewVecOp = Select; 2136 } 2137 Value *NewRed; 2138 Value *NextInChain; 2139 if (IsOrdered) { 2140 if (State.VF.isVector()) 2141 NewRed = 2142 createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); 2143 else 2144 NewRed = State.Builder.CreateBinOp( 2145 (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp); 2146 PrevInChain = NewRed; 2147 NextInChain = NewRed; 2148 } else { 2149 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2150 NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); 2151 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2152 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 2153 NewRed, PrevInChain); 2154 else 2155 NextInChain = State.Builder.CreateBinOp( 2156 (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain); 2157 } 2158 State.set(this, NextInChain, /*IsScalar*/ true); 2159 } 2160 2161 void VPReductionEVLRecipe::execute(VPTransformState &State) { 2162 assert(!State.Lane && "Reduction being replicated."); 2163 2164 auto &Builder = State.Builder; 2165 // Propagate the fast-math flags carried by the underlying instruction. 2166 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 2167 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2168 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 2169 2170 RecurKind Kind = RdxDesc.getRecurrenceKind(); 2171 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 2172 Value *VecOp = State.get(getVecOp()); 2173 Value *EVL = State.get(getEVL(), VPLane(0)); 2174 2175 VectorBuilder VBuilder(Builder); 2176 VBuilder.setEVL(EVL); 2177 Value *Mask; 2178 // TODO: move the all-true mask generation into VectorBuilder. 2179 if (VPValue *CondOp = getCondOp()) 2180 Mask = State.get(CondOp); 2181 else 2182 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2183 VBuilder.setMask(Mask); 2184 2185 Value *NewRed; 2186 if (isOrdered()) { 2187 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 2188 } else { 2189 NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); 2190 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2191 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2192 else 2193 NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(), 2194 NewRed, Prev); 2195 } 2196 State.set(this, NewRed, /*IsScalar*/ true); 2197 } 2198 2199 InstructionCost VPReductionRecipe::computeCost(ElementCount VF, 2200 VPCostContext &Ctx) const { 2201 RecurKind RdxKind = RdxDesc.getRecurrenceKind(); 2202 Type *ElementTy = Ctx.Types.inferScalarType(this); 2203 auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF)); 2204 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2205 unsigned Opcode = RdxDesc.getOpcode(); 2206 2207 // TODO: Support any-of and in-loop reductions. 
2208 assert( 2209 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || 2210 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2211 "Any-of reduction not implemented in VPlan-based cost model currently."); 2212 assert( 2213 (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() || 2214 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2215 "In-loop reduction not implemented in VPlan-based cost model currently."); 2216 2217 assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() && 2218 "Inferred type and recurrence type mismatch."); 2219 2220 // Cost = Reduction cost + BinOp cost 2221 InstructionCost Cost = 2222 Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind); 2223 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { 2224 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 2225 return Cost + Ctx.TTI.getMinMaxReductionCost( 2226 Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind); 2227 } 2228 2229 return Cost + Ctx.TTI.getArithmeticReductionCost( 2230 Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind); 2231 } 2232 2233 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2234 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2235 VPSlotTracker &SlotTracker) const { 2236 O << Indent << "REDUCE "; 2237 printAsOperand(O, SlotTracker); 2238 O << " = "; 2239 getChainOp()->printAsOperand(O, SlotTracker); 2240 O << " +"; 2241 if (isa<FPMathOperator>(getUnderlyingInstr())) 2242 O << getUnderlyingInstr()->getFastMathFlags(); 2243 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2244 getVecOp()->printAsOperand(O, SlotTracker); 2245 if (isConditional()) { 2246 O << ", "; 2247 getCondOp()->printAsOperand(O, SlotTracker); 2248 } 2249 O << ")"; 2250 if (RdxDesc.IntermediateStore) 2251 O << " (with final reduction value stored in invariant address sank " 2252 "outside of loop)"; 2253 } 2254 2255 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2256 VPSlotTracker &SlotTracker) const { 2257 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 2258 O << Indent << "REDUCE "; 2259 printAsOperand(O, SlotTracker); 2260 O << " = "; 2261 getChainOp()->printAsOperand(O, SlotTracker); 2262 O << " +"; 2263 if (isa<FPMathOperator>(getUnderlyingInstr())) 2264 O << getUnderlyingInstr()->getFastMathFlags(); 2265 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 2266 getVecOp()->printAsOperand(O, SlotTracker); 2267 O << ", "; 2268 getEVL()->printAsOperand(O, SlotTracker); 2269 if (isConditional()) { 2270 O << ", "; 2271 getCondOp()->printAsOperand(O, SlotTracker); 2272 } 2273 O << ")"; 2274 if (RdxDesc.IntermediateStore) 2275 O << " (with final reduction value stored in invariant address sank " 2276 "outside of loop)"; 2277 } 2278 #endif 2279 2280 bool VPReplicateRecipe::shouldPack() const { 2281 // Find if the recipe is used by a widened recipe via an intervening 2282 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 
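// (Illustrative: a predicated, replicated scalar udiv whose result reaches a
// widened user through a VPPredInstPHIRecipe must be re-packed into a vector
// via insertelement; print() below marks that case as "(S->V)".)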
2283 return any_of(users(), [](const VPUser *U) { 2284 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2285 return any_of(PredR->users(), [PredR](const VPUser *U) { 2286 return !U->usesScalars(PredR); 2287 }); 2288 return false; 2289 }); 2290 } 2291 2292 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, 2293 VPCostContext &Ctx) const { 2294 Instruction *UI = cast<Instruction>(getUnderlyingValue()); 2295 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan 2296 // transform, avoid computing their cost multiple times for now. 2297 Ctx.SkipCostComputation.insert(UI); 2298 return Ctx.getLegacyCost(UI, VF); 2299 } 2300 2301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2302 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2303 VPSlotTracker &SlotTracker) const { 2304 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 2305 2306 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2307 printAsOperand(O, SlotTracker); 2308 O << " = "; 2309 } 2310 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2311 O << "call"; 2312 printFlags(O); 2313 O << "@" << CB->getCalledFunction()->getName() << "("; 2314 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2315 O, [&O, &SlotTracker](VPValue *Op) { 2316 Op->printAsOperand(O, SlotTracker); 2317 }); 2318 O << ")"; 2319 } else { 2320 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2321 printFlags(O); 2322 printOperands(O, SlotTracker); 2323 } 2324 2325 if (shouldPack()) 2326 O << " (S->V)"; 2327 } 2328 #endif 2329 2330 Value *VPScalarCastRecipe ::generate(VPTransformState &State) { 2331 assert(vputils::onlyFirstLaneUsed(this) && 2332 "Codegen only implemented for first lane."); 2333 switch (Opcode) { 2334 case Instruction::SExt: 2335 case Instruction::ZExt: 2336 case Instruction::Trunc: { 2337 // Note: SExt/ZExt not used yet. 2338 Value *Op = State.get(getOperand(0), VPLane(0)); 2339 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 2340 } 2341 default: 2342 llvm_unreachable("opcode not implemented yet"); 2343 } 2344 } 2345 2346 void VPScalarCastRecipe ::execute(VPTransformState &State) { 2347 State.set(this, generate(State), VPLane(0)); 2348 } 2349 2350 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2351 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 2352 VPSlotTracker &SlotTracker) const { 2353 O << Indent << "SCALAR-CAST "; 2354 printAsOperand(O, SlotTracker); 2355 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 2356 printOperands(O, SlotTracker); 2357 O << " to " << *ResultTy; 2358 } 2359 #endif 2360 2361 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2362 assert(State.Lane && "Branch on Mask works only on single instance."); 2363 2364 unsigned Lane = State.Lane->getKnownLane(); 2365 2366 Value *ConditionBit = nullptr; 2367 VPValue *BlockInMask = getMask(); 2368 if (BlockInMask) { 2369 ConditionBit = State.get(BlockInMask); 2370 if (ConditionBit->getType()->isVectorTy()) 2371 ConditionBit = State.Builder.CreateExtractElement( 2372 ConditionBit, State.Builder.getInt32(Lane)); 2373 } else // Block in mask is all-one. 2374 ConditionBit = State.Builder.getTrue(); 2375 2376 // Replace the temporary unreachable terminator with a new conditional branch, 2377 // whose two destinations will be set later when they are created. 
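// (Illustrative, assuming a <4 x i1> block-in mask %m and lane 2: the
// ConditionBit computed above is 'extractelement <4 x i1> %m, i32 2', and the
// conditional branch built below uses it to guard the replicated instructions
// for that single lane.)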
2378 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2379 assert(isa<UnreachableInst>(CurrentTerminator) && 2380 "Expected to replace unreachable terminator with conditional branch."); 2381 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 2382 CondBr->setSuccessor(0, nullptr); 2383 ReplaceInstWithInst(CurrentTerminator, CondBr); 2384 } 2385 2386 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, 2387 VPCostContext &Ctx) const { 2388 // The legacy cost model doesn't assign costs to branches for individual 2389 // replicate regions. Match the current behavior in the VPlan cost model for 2390 // now. 2391 return 0; 2392 } 2393 2394 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 2395 State.setDebugLocFrom(getDebugLoc()); 2396 assert(State.Lane && "Predicated instruction PHI works per instance."); 2397 Instruction *ScalarPredInst = 2398 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 2399 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 2400 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 2401 assert(PredicatingBB && "Predicated block has no single predecessor."); 2402 assert(isa<VPReplicateRecipe>(getOperand(0)) && 2403 "operand must be VPReplicateRecipe"); 2404 2405 // By current pack/unpack logic we need to generate only a single phi node: if 2406 // a vector value for the predicated instruction exists at this point it means 2407 // the instruction has vector users only, and a phi for the vector value is 2408 // needed. In this case the recipe of the predicated instruction is marked to 2409 // also do that packing, thereby "hoisting" the insert-element sequence. 2410 // Otherwise, a phi node for the scalar value is needed. 2411 if (State.hasVectorValue(getOperand(0))) { 2412 Value *VectorValue = State.get(getOperand(0)); 2413 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 2414 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 2415 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 2416 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 2417 if (State.hasVectorValue(this)) 2418 State.reset(this, VPhi); 2419 else 2420 State.set(this, VPhi); 2421 // NOTE: Currently we need to update the value of the operand, so the next 2422 // predicated iteration inserts its generated value in the correct vector. 2423 State.reset(getOperand(0), VPhi); 2424 } else { 2425 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) 2426 return; 2427 2428 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 2429 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 2430 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 2431 PredicatingBB); 2432 Phi->addIncoming(ScalarPredInst, PredicatedBB); 2433 if (State.hasScalarValue(this, *State.Lane)) 2434 State.reset(this, Phi, *State.Lane); 2435 else 2436 State.set(this, Phi, *State.Lane); 2437 // NOTE: Currently we need to update the value of the operand, so the next 2438 // predicated iteration inserts its generated value in the correct vector. 
2439 State.reset(getOperand(0), Phi, *State.Lane); 2440 } 2441 } 2442 2443 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2444 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2445 VPSlotTracker &SlotTracker) const { 2446 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2447 printAsOperand(O, SlotTracker); 2448 O << " = "; 2449 printOperands(O, SlotTracker); 2450 } 2451 #endif 2452 2453 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 2454 VPCostContext &Ctx) const { 2455 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2456 const Align Alignment = 2457 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2458 unsigned AS = 2459 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2460 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2461 2462 if (!Consecutive) { 2463 // TODO: Using the original IR may not be accurate. 2464 // Currently, ARM will use the underlying IR to calculate gather/scatter 2465 // instruction cost. 2466 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 2467 assert(!Reverse && 2468 "Inconsecutive memory access should not have the order."); 2469 return Ctx.TTI.getAddressComputationCost(Ty) + 2470 Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, 2471 IsMasked, Alignment, CostKind, 2472 &Ingredient); 2473 } 2474 2475 InstructionCost Cost = 0; 2476 if (IsMasked) { 2477 Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, 2478 AS, CostKind); 2479 } else { 2480 TTI::OperandValueInfo OpInfo = 2481 Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); 2482 Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, 2483 CostKind, OpInfo, &Ingredient); 2484 } 2485 if (!Reverse) 2486 return Cost; 2487 2488 return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2489 cast<VectorType>(Ty), {}, CostKind, 0); 2490 } 2491 2492 void VPWidenLoadRecipe::execute(VPTransformState &State) { 2493 auto *LI = cast<LoadInst>(&Ingredient); 2494 2495 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 2496 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 2497 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2498 bool CreateGather = !isConsecutive(); 2499 2500 auto &Builder = State.Builder; 2501 State.setDebugLocFrom(getDebugLoc()); 2502 Value *Mask = nullptr; 2503 if (auto *VPMask = getMask()) { 2504 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2505 // of a null all-one mask is a null mask. 2506 Mask = State.get(VPMask); 2507 if (isReverse()) 2508 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2509 } 2510 2511 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); 2512 Value *NewLI; 2513 if (CreateGather) { 2514 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 2515 "wide.masked.gather"); 2516 } else if (Mask) { 2517 NewLI = 2518 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 2519 PoisonValue::get(DataTy), "wide.masked.load"); 2520 } else { 2521 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 2522 } 2523 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
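// Illustrative IR for a reverse consecutive load with VF = 4 (value names
// assumed):
//   %wide.load = load <4 x i32>, ptr %addr
//   %reverse   = shufflevector <4 x i32> %wide.load, <4 x i32> poison,
//                              <4 x i32> <i32 3, i32 2, i32 1, i32 0>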
2524 State.addMetadata(NewLI, LI);
2525 if (Reverse)
2526 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2527 State.set(this, NewLI);
2528 }
2529
2530 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2531 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2532 VPSlotTracker &SlotTracker) const {
2533 O << Indent << "WIDEN ";
2534 printAsOperand(O, SlotTracker);
2535 O << " = load ";
2536 printOperands(O, SlotTracker);
2537 }
2538 #endif
2539
2540 /// Use an all-true mask for the reverse rather than the actual mask, as it
2541 /// avoids a dependence without affecting the result.
2542 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2543 Value *EVL, const Twine &Name) {
2544 VectorType *ValTy = cast<VectorType>(Operand->getType());
2545 Value *AllTrueMask =
2546 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2547 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2548 {Operand, AllTrueMask, EVL}, nullptr, Name);
2549 }
2550
2551 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2552 auto *LI = cast<LoadInst>(&Ingredient);
2553
2554 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2555 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2556 const Align Alignment = getLoadStoreAlignment(&Ingredient);
2557 bool CreateGather = !isConsecutive();
2558
2559 auto &Builder = State.Builder;
2560 State.setDebugLocFrom(getDebugLoc());
2561 CallInst *NewLI;
2562 Value *EVL = State.get(getEVL(), VPLane(0));
2563 Value *Addr = State.get(getAddr(), !CreateGather);
2564 Value *Mask = nullptr;
2565 if (VPValue *VPMask = getMask()) {
2566 Mask = State.get(VPMask);
2567 if (isReverse())
2568 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2569 } else {
2570 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2571 }
2572
2573 if (CreateGather) {
2574 NewLI =
2575 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2576 nullptr, "wide.masked.gather");
2577 } else {
2578 VectorBuilder VBuilder(Builder);
2579 VBuilder.setEVL(EVL).setMask(Mask);
2580 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2581 Instruction::Load, DataTy, Addr, "vp.op.load"));
2582 }
2583 NewLI->addParamAttr(
2584 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2585 State.addMetadata(NewLI, LI);
2586 Instruction *Res = NewLI;
2587 if (isReverse())
2588 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2589 State.set(this, Res);
2590 }
2591
2592 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2593 VPCostContext &Ctx) const {
2594 if (!Consecutive || IsMasked)
2595 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2596
2597 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2598 // because the EVL recipes use EVL to replace the tail mask, whereas the
2599 // legacy model always accounts for the cost of the mask.
2600 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2601 // longer need to compare against the legacy cost model.
2602 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2603 const Align Alignment = 2604 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2605 unsigned AS = 2606 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2607 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2608 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2609 Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); 2610 if (!Reverse) 2611 return Cost; 2612 2613 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2614 cast<VectorType>(Ty), {}, CostKind, 0); 2615 } 2616 2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2618 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2619 VPSlotTracker &SlotTracker) const { 2620 O << Indent << "WIDEN "; 2621 printAsOperand(O, SlotTracker); 2622 O << " = vp.load "; 2623 printOperands(O, SlotTracker); 2624 } 2625 #endif 2626 2627 void VPWidenStoreRecipe::execute(VPTransformState &State) { 2628 auto *SI = cast<StoreInst>(&Ingredient); 2629 2630 VPValue *StoredVPValue = getStoredValue(); 2631 bool CreateScatter = !isConsecutive(); 2632 const Align Alignment = getLoadStoreAlignment(&Ingredient); 2633 2634 auto &Builder = State.Builder; 2635 State.setDebugLocFrom(getDebugLoc()); 2636 2637 Value *Mask = nullptr; 2638 if (auto *VPMask = getMask()) { 2639 // Mask reversal is only needed for non-all-one (null) masks, as reverse 2640 // of a null all-one mask is a null mask. 2641 Mask = State.get(VPMask); 2642 if (isReverse()) 2643 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 2644 } 2645 2646 Value *StoredVal = State.get(StoredVPValue); 2647 if (isReverse()) { 2648 // If we store to reverse consecutive memory locations, then we need 2649 // to reverse the order of elements in the stored value. 2650 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 2651 // We don't want to update the value in the map as it might be used in 2652 // another expression. So don't call resetVectorValue(StoredVal). 
2653 }
2654 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2655 Instruction *NewSI = nullptr;
2656 if (CreateScatter)
2657 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2658 else if (Mask)
2659 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2660 else
2661 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2662 State.addMetadata(NewSI, SI);
2663 }
2664
2665 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2666 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2667 VPSlotTracker &SlotTracker) const {
2668 O << Indent << "WIDEN store ";
2669 printOperands(O, SlotTracker);
2670 }
2671 #endif
2672
2673 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2674 auto *SI = cast<StoreInst>(&Ingredient);
2675
2676 VPValue *StoredValue = getStoredValue();
2677 bool CreateScatter = !isConsecutive();
2678 const Align Alignment = getLoadStoreAlignment(&Ingredient);
2679
2680 auto &Builder = State.Builder;
2681 State.setDebugLocFrom(getDebugLoc());
2682
2683 CallInst *NewSI = nullptr;
2684 Value *StoredVal = State.get(StoredValue);
2685 Value *EVL = State.get(getEVL(), VPLane(0));
2686 if (isReverse())
2687 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2688 Value *Mask = nullptr;
2689 if (VPValue *VPMask = getMask()) {
2690 Mask = State.get(VPMask);
2691 if (isReverse())
2692 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2693 } else {
2694 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2695 }
2696 Value *Addr = State.get(getAddr(), !CreateScatter);
2697 if (CreateScatter) {
2698 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2699 Intrinsic::vp_scatter,
2700 {StoredVal, Addr, Mask, EVL});
2701 } else {
2702 VectorBuilder VBuilder(Builder);
2703 VBuilder.setEVL(EVL).setMask(Mask);
2704 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2705 Instruction::Store, Type::getVoidTy(EVL->getContext()),
2706 {StoredVal, Addr}));
2707 }
2708 NewSI->addParamAttr(
2709 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2710 State.addMetadata(NewSI, SI);
2711 }
2712
2713 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2714 VPCostContext &Ctx) const {
2715 if (!Consecutive || IsMasked)
2716 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2717
2718 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2719 // because the EVL recipes use EVL to replace the tail mask, whereas the
2720 // legacy model always accounts for the cost of the mask.
2721 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2722 // longer need to compare against the legacy cost model.
2723 Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); 2724 const Align Alignment = 2725 getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); 2726 unsigned AS = 2727 getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient)); 2728 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2729 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 2730 Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); 2731 if (!Reverse) 2732 return Cost; 2733 2734 return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 2735 cast<VectorType>(Ty), {}, CostKind, 0); 2736 } 2737 2738 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2739 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2740 VPSlotTracker &SlotTracker) const { 2741 O << Indent << "WIDEN vp.store "; 2742 printOperands(O, SlotTracker); 2743 } 2744 #endif 2745 2746 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2747 VectorType *DstVTy, const DataLayout &DL) { 2748 // Verify that V is a vector type with same number of elements as DstVTy. 2749 auto VF = DstVTy->getElementCount(); 2750 auto *SrcVecTy = cast<VectorType>(V->getType()); 2751 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2752 Type *SrcElemTy = SrcVecTy->getElementType(); 2753 Type *DstElemTy = DstVTy->getElementType(); 2754 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2755 "Vector elements must have same size"); 2756 2757 // Do a direct cast if element types are castable. 2758 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2759 return Builder.CreateBitOrPointerCast(V, DstVTy); 2760 } 2761 // V cannot be directly casted to desired vector type. 2762 // May happen when V is a floating point vector but DstVTy is a vector of 2763 // pointers or vice-versa. Handle this using a two-step bitcast using an 2764 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2765 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2766 "Only one type should be a pointer type"); 2767 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2768 "Only one type should be a floating point type"); 2769 Type *IntTy = 2770 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2771 auto *VecIntTy = VectorType::get(IntTy, VF); 2772 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2773 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2774 } 2775 2776 /// Return a vector containing interleaved elements from multiple 2777 /// smaller input vectors. 2778 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2779 const Twine &Name) { 2780 unsigned Factor = Vals.size(); 2781 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2782 2783 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2784 #ifndef NDEBUG 2785 for (Value *Val : Vals) 2786 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2787 #endif 2788 2789 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2790 // must use intrinsics to interleave. 2791 if (VecTy->isScalableTy()) { 2792 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2793 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2794 Vals, 2795 /*FMFSource=*/nullptr, Name); 2796 } 2797 2798 // Fixed length. Start by concatenating all vectors into a wide vector. 
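// (Illustrative with Factor = 2 and two <4 x i32> inputs A and B: the
// concatenation gives <A0,A1,A2,A3,B0,B1,B2,B3> and the interleave mask
// <0, 4, 1, 5, 2, 6, 3, 7> used below produces <A0,B0,A1,B1,A2,B2,A3,B3>.)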
2799 Value *WideVec = concatenateVectors(Builder, Vals); 2800 2801 // Interleave the elements into the wide vector. 2802 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2803 return Builder.CreateShuffleVector( 2804 WideVec, createInterleaveMask(NumElts, Factor), Name); 2805 } 2806 2807 // Try to vectorize the interleave group that \p Instr belongs to. 2808 // 2809 // E.g. Translate following interleaved load group (factor = 3): 2810 // for (i = 0; i < N; i+=3) { 2811 // R = Pic[i]; // Member of index 0 2812 // G = Pic[i+1]; // Member of index 1 2813 // B = Pic[i+2]; // Member of index 2 2814 // ... // do something to R, G, B 2815 // } 2816 // To: 2817 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2818 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2819 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2820 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2821 // 2822 // Or translate following interleaved store group (factor = 3): 2823 // for (i = 0; i < N; i+=3) { 2824 // ... do something to R, G, B 2825 // Pic[i] = R; // Member of index 0 2826 // Pic[i+1] = G; // Member of index 1 2827 // Pic[i+2] = B; // Member of index 2 2828 // } 2829 // To: 2830 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2831 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2832 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2833 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2834 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2835 void VPInterleaveRecipe::execute(VPTransformState &State) { 2836 assert(!State.Lane && "Interleave group being replicated."); 2837 const InterleaveGroup<Instruction> *Group = IG; 2838 Instruction *Instr = Group->getInsertPos(); 2839 2840 // Prepare for the vector type of the interleaved load/store. 2841 Type *ScalarTy = getLoadStoreType(Instr); 2842 unsigned InterleaveFactor = Group->getFactor(); 2843 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2844 2845 // TODO: extend the masked interleaved-group support to reversed access. 2846 VPValue *BlockInMask = getMask(); 2847 assert((!BlockInMask || !Group->isReverse()) && 2848 "Reversed masked interleave-group not supported."); 2849 2850 VPValue *Addr = getAddr(); 2851 Value *ResAddr = State.get(Addr, VPLane(0)); 2852 if (auto *I = dyn_cast<Instruction>(ResAddr)) 2853 State.setDebugLocFrom(I->getDebugLoc()); 2854 2855 // If the group is reverse, adjust the index to refer to the last vector lane 2856 // instead of the first. We adjust the index from the first vector lane, 2857 // rather than directly getting the pointer for lane VF - 1, because the 2858 // pointer operand of the interleaved access is supposed to be uniform. 
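// (Illustrative, assuming VF = 4 and an interleave factor of 2: Index below
// evaluates to -(4 - 1) * 2 = -6, stepping the uniform pointer back to the
// tuple accessed by the last vector lane.)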
2859 if (Group->isReverse()) { 2860 Value *RuntimeVF = 2861 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2862 Value *Index = 2863 State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2864 Index = State.Builder.CreateMul(Index, 2865 State.Builder.getInt32(Group->getFactor())); 2866 Index = State.Builder.CreateNeg(Index); 2867 2868 bool InBounds = false; 2869 if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) 2870 InBounds = Gep->isInBounds(); 2871 ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); 2872 } 2873 2874 State.setDebugLocFrom(Instr->getDebugLoc()); 2875 Value *PoisonVec = PoisonValue::get(VecTy); 2876 2877 auto CreateGroupMask = [&BlockInMask, &State, 2878 &InterleaveFactor](Value *MaskForGaps) -> Value * { 2879 if (State.VF.isScalable()) { 2880 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2881 assert(InterleaveFactor == 2 && 2882 "Unsupported deinterleave factor for scalable vectors"); 2883 auto *ResBlockInMask = State.get(BlockInMask); 2884 SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; 2885 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2886 State.VF.getKnownMinValue() * 2, true); 2887 return State.Builder.CreateIntrinsic( 2888 MaskTy, Intrinsic::vector_interleave2, Ops, 2889 /*FMFSource=*/nullptr, "interleaved.mask"); 2890 } 2891 2892 if (!BlockInMask) 2893 return MaskForGaps; 2894 2895 Value *ResBlockInMask = State.get(BlockInMask); 2896 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2897 ResBlockInMask, 2898 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2899 "interleaved.mask"); 2900 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2901 ShuffledMask, MaskForGaps) 2902 : ShuffledMask; 2903 }; 2904 2905 const DataLayout &DL = Instr->getDataLayout(); 2906 // Vectorize the interleaved load group. 2907 if (isa<LoadInst>(Instr)) { 2908 Value *MaskForGaps = nullptr; 2909 if (NeedsMaskForGaps) { 2910 MaskForGaps = createBitMaskForGaps(State.Builder, 2911 State.VF.getKnownMinValue(), *Group); 2912 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2913 } 2914 2915 Instruction *NewLoad; 2916 if (BlockInMask || MaskForGaps) { 2917 Value *GroupMask = CreateGroupMask(MaskForGaps); 2918 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 2919 Group->getAlign(), GroupMask, 2920 PoisonVec, "wide.masked.vec"); 2921 } else 2922 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 2923 Group->getAlign(), "wide.vec"); 2924 Group->addMetadata(NewLoad); 2925 2926 ArrayRef<VPValue *> VPDefs = definedValues(); 2927 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2928 if (VecTy->isScalableTy()) { 2929 assert(InterleaveFactor == 2 && 2930 "Unsupported deinterleave factor for scalable vectors"); 2931 2932 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2933 // so must use intrinsics to deinterleave. 2934 Value *DI = State.Builder.CreateIntrinsic( 2935 Intrinsic::vector_deinterleave2, VecTy, NewLoad, 2936 /*FMFSource=*/nullptr, "strided.vec"); 2937 unsigned J = 0; 2938 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2939 Instruction *Member = Group->getMember(I); 2940 2941 if (!Member) 2942 continue; 2943 2944 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2945 // If this member has different type, cast the result type. 
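// (Illustrative: if the group mixes members of equal size but different
// types, say i32 and float, the float member is recovered from the shared
// wide load by bitcasting the extracted sub-vector in createBitOrPointerCast.)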
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
          StridedVec =
              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec =
              State.Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec);
        ++J;
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
      Value *StridedVec =
          State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");

      // If this member has a different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
        StridedVec =
            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

      State.set(VPDefs[J], StridedVec);
      ++J;
    }
    return;
  }

  // The sub-vector type for the current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Fail to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(SubVT);
      StoredVecs.push_back(Undef);
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

    // If this member has a different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
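  // E.g. (illustrative, fixed VF of 4 and factor 3): the three member vectors
  // <4 x i32> %R, %G, %B collected above become a single <12 x i32> value laid
  // out as R0,G0,B0,R1,G1,B1,..., via the shuffle sequence shown in the
  // comment before VPInterleaveRecipe::execute.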
  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        IVec, ResAddr, Group->getAlign(), GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << "  store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << "  ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                : getStoredValues()[InsertPosIdx]);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(InsertPos);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  unsigned InterleaveFactor = IG->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (IG->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
      IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps);

  if (!IG->isReverse())
    return Cost;

  return Cost + IG->getNumMembers() *
                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                           VectorTy, std::nullopt, CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
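  // (A canonical induction is an integer IV whose start matches this phi's
  // start value and whose step is the constant 1; the checks below reject
  // anything else.)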
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
         "Unexpected type.");
  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
         "Recipe should have been replaced");

  unsigned CurrentPart = getUnrollPart(*this);

  // Build a pointer phi
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  PHINode *NewPointerPhi = nullptr;
  if (CurrentPart == 0) {
    auto *IVR = cast<VPHeaderPHIRecipe>(&getParent()
                                             ->getPlan()
                                             ->getVectorLoopRegion()
                                             ->getEntryBasicBlock()
                                             ->front());
    PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
    NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
                                    CanonicalIV->getIterator());
    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
    NewPointerPhi->setDebugLoc(getDebugLoc());
  } else {
    // The recipe has been unrolled. In that case, fetch the single pointer phi
    // shared among all unrolled parts of the recipe.
    auto *GEP =
        cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
  }

  // A pointer induction, performed by using a gep
  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
  Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
  Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  if (CurrentPart == 0) {
    // The recipe represents the first part of the pointer induction. Create
    // the GEP to increment the phi across all unrolled parts.
    unsigned UF = getParent()->getPlan()->getUF();
    Value *NumUnrolledElems =
        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));

    Value *InductionGEP = GetElementPtrInst::Create(
        State.Builder.getInt8Ty(), NewPointerPhi,
        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);

    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
  }

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
  Type *VecPhiType = VectorType::get(PhiType, State.VF);
  Value *StartOffsetScalar = State.Builder.CreateMul(
      RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
  Value *StartOffset =
      State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
  // Create a vector of consecutive numbers from zero to VF.
  StartOffset = State.Builder.CreateAdd(
      StartOffset, State.Builder.CreateStepVector(VecPhiType));

  assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
         "scalar step must be the same across all parts");
  Value *GEP = State.Builder.CreateGEP(
      State.Builder.getInt8Ty(), NewPointerPhi,
      State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
                                               State.VF, ScalarStepValue)),
      "vector.gep");
  State.set(this, GEP);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 2 || getNumOperands() == 4) &&
         "unexpected number of operands");
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 4) {
    O << ", ";
    getOperand(2)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
  }
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "cannot be used in per-lane");
  if (State.ExpandedSCEVs.contains(Expr)) {
    // SCEV Expr has already been expanded, result must already be set. At the
    // moment we have to execute the entry block twice (once before skeleton
    // creation to get expanded SCEVs used by the skeleton and once during
    // regular VPlan execution).
    State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
    assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
           "Results must match");
    return;
  }

  const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  State.ExpandedSCEVs[Expr] = Res;
  State.set(this, Res, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(VF, VStep);
    VStep =
        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
  State.set(this, CanonicalVectorIV);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
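  // Note: only the incoming value from the preheader is wired up here; the
  // backedge value is expected to be filled in later, once the vector loop's
  // latch block exists.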
  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(VectorInit, VectorPH);
  State.set(this, Phi);
}

InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);

  if (VF.isScalable() && VF.getKnownMinValue() == 1)
    return InstructionCost::getInvalid();

  SmallVector<int> Mask(VF.getKnownMinValue());
  std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
  Type *VectorTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);

  return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                cast<VectorType>(VectorTy), Mask, CostKind,
                                VF.getKnownMinValue() - 1);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, IsInLoop);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  unsigned CurrentPart = getUnrollPart(*this);

  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = State.get(StartVPV);
    }
  } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
    // [I|F]FindLastIV will use a sentinel value to initialize the reduction
    // phi or the resume value from the main vector loop when vectorizing the
    // epilogue loop. In the exit block, ComputeReductionResult will generate
    // checks to verify if the reduction result is the sentinel value. If the
    // result is the sentinel value, it will be corrected back to the start
    // value.
    // TODO: The sentinel value is not always necessary. When the start value
    // is a constant, and smaller than the start value of the induction
    // variable, the start value can be directly used to initialize the
    // reduction phi.
    Iden = StartV;
    if (!ScalarPHI) {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      if (CurrentPart == 0) {
        // Create start and identity vector values for the reduction in the
        // preheader.
        // TODO: Introduce recipes in VPlan preheader to create initial values.
        Iden = Builder.CreateVectorSplat(State.VF, Iden);
        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
        Builder.SetInsertPoint(VectorPH->getTerminator());
        Constant *Zero = Builder.getInt32(0);
        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
      } else {
        Iden = Builder.CreateVectorSplat(State.VF, Iden);
      }
    }
  }

  Phi = cast<PHINode>(State.get(this, IsInLoop));
  Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
  Phi->addIncoming(StartVal, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPScalarPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *Start = State.get(getStartValue(), VPLane(0));
  PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
  Phi->addIncoming(Start, VectorPH);
  Phi->setDebugLoc(getDebugLoc());
  State.set(this, Phi, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif