1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This file provides a helper that implements much of the TTI interface in 11 /// terms of the target-independent code generator and TargetLowering 12 /// interfaces. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H 17 #define LLVM_CODEGEN_BASICTTIIMPL_H 18 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/BitVector.h" 21 #include "llvm/ADT/SmallPtrSet.h" 22 #include "llvm/ADT/SmallVector.h" 23 #include "llvm/Analysis/LoopInfo.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/Analysis/TargetTransformInfoImpl.h" 27 #include "llvm/Analysis/ValueTracking.h" 28 #include "llvm/CodeGen/ISDOpcodes.h" 29 #include "llvm/CodeGen/TargetLowering.h" 30 #include "llvm/CodeGen/TargetSubtargetInfo.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/CodeGenTypes/MachineValueType.h" 33 #include "llvm/IR/BasicBlock.h" 34 #include "llvm/IR/Constant.h" 35 #include "llvm/IR/Constants.h" 36 #include "llvm/IR/DataLayout.h" 37 #include "llvm/IR/DerivedTypes.h" 38 #include "llvm/IR/InstrTypes.h" 39 #include "llvm/IR/Instruction.h" 40 #include "llvm/IR/Instructions.h" 41 #include "llvm/IR/Intrinsics.h" 42 #include "llvm/IR/Operator.h" 43 #include "llvm/IR/Type.h" 44 #include "llvm/IR/Value.h" 45 #include "llvm/Support/Casting.h" 46 #include "llvm/Support/CommandLine.h" 47 #include "llvm/Support/ErrorHandling.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Target/TargetMachine.h" 50 #include "llvm/Target/TargetOptions.h" 51 #include "llvm/Transforms/Utils/LoopUtils.h" 52 #include <algorithm> 53 #include <cassert> 54 #include <cstdint> 55 #include <limits> 56 #include <optional> 57 #include <utility> 58 59 namespace llvm { 60 61 class Function; 62 class GlobalValue; 63 class LLVMContext; 64 class ScalarEvolution; 65 class SCEV; 66 class TargetMachine; 67 68 extern cl::opt<unsigned> PartialUnrollingThreshold; 69 70 /// Base class which can be used to help build a TTI implementation. 71 /// 72 /// This class provides as much implementation of the TTI interface as is 73 /// possible using the target independent parts of the code generator. 74 /// 75 /// In order to subclass it, your class must implement a getST() method to 76 /// return the subtarget, and a getTLI() method to return the target lowering. 77 /// We need these methods implemented in the derived class so that this class 78 /// doesn't have to duplicate storage for them. 79 template <typename T> 80 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { 81 private: 82 using BaseT = TargetTransformInfoImplCRTPBase<T>; 83 using TTI = TargetTransformInfo; 84 85 /// Helper function to access this as a T. 86 T *thisT() { return static_cast<T *>(this); } 87 88 /// Estimate a cost of Broadcast as an extract and sequence of insert 89 /// operations. 
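/// For example (illustrative only, not from the source): broadcasting lane 0
/// of a <4 x float> is modeled as one extractelement of element 0 plus four
/// insertelements, each priced via the target's getVectorInstrCost() hook.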
90 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy, 91 TTI::TargetCostKind CostKind) { 92 InstructionCost Cost = 0; 93 // Broadcast cost is equal to the cost of extracting the zero'th element 94 // plus the cost of inserting it into every element of the result vector. 95 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 96 CostKind, 0, nullptr, nullptr); 97 98 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 99 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 100 CostKind, i, nullptr, nullptr); 101 } 102 return Cost; 103 } 104 105 /// Estimate a cost of shuffle as a sequence of extract and insert 106 /// operations. 107 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy, 108 TTI::TargetCostKind CostKind) { 109 InstructionCost Cost = 0; 110 // Shuffle cost is equal to the cost of extracting element from its argument 111 // plus the cost of inserting them onto the result vector. 112 113 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from 114 // index 0 of first vector, index 1 of second vector,index 2 of first 115 // vector and finally index 3 of second vector and insert them at index 116 // <0,1,2,3> of result vector. 117 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 118 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 119 CostKind, i, nullptr, nullptr); 120 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 121 CostKind, i, nullptr, nullptr); 122 } 123 return Cost; 124 } 125 126 /// Estimate a cost of subvector extraction as a sequence of extract and 127 /// insert operations. 128 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, 129 TTI::TargetCostKind CostKind, 130 int Index, 131 FixedVectorType *SubVTy) { 132 assert(VTy && SubVTy && 133 "Can only extract subvectors from vectors"); 134 int NumSubElts = SubVTy->getNumElements(); 135 assert((!isa<FixedVectorType>(VTy) || 136 (Index + NumSubElts) <= 137 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 138 "SK_ExtractSubvector index out of range"); 139 140 InstructionCost Cost = 0; 141 // Subvector extraction cost is equal to the cost of extracting element from 142 // the source type plus the cost of inserting them into the result vector 143 // type. 144 for (int i = 0; i != NumSubElts; ++i) { 145 Cost += 146 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 147 CostKind, i + Index, nullptr, nullptr); 148 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, 149 CostKind, i, nullptr, nullptr); 150 } 151 return Cost; 152 } 153 154 /// Estimate a cost of subvector insertion as a sequence of extract and 155 /// insert operations. 156 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, 157 TTI::TargetCostKind CostKind, 158 int Index, 159 FixedVectorType *SubVTy) { 160 assert(VTy && SubVTy && 161 "Can only insert subvectors into vectors"); 162 int NumSubElts = SubVTy->getNumElements(); 163 assert((!isa<FixedVectorType>(VTy) || 164 (Index + NumSubElts) <= 165 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 166 "SK_InsertSubvector index out of range"); 167 168 InstructionCost Cost = 0; 169 // Subvector insertion cost is equal to the cost of extracting element from 170 // the source type plus the cost of inserting them into the result vector 171 // type. 
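// For example (rough sketch): inserting a <2 x i32> subvector at Index 2 of
// a <4 x i32> is modeled as two extractelements from the subvector plus two
// insertelements into lanes 2 and 3 of the wider vector.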
172 for (int i = 0; i != NumSubElts; ++i) {
173 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
174 CostKind, i, nullptr, nullptr);
175 Cost +=
176 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
177 i + Index, nullptr, nullptr);
178 }
179 return Cost;
180 }
181
182 /// Local query method delegates up to T which *must* implement this!
183 const TargetSubtargetInfo *getST() const {
184 return static_cast<const T *>(this)->getST();
185 }
186
187 /// Local query method delegates up to T which *must* implement this!
188 const TargetLoweringBase *getTLI() const {
189 return static_cast<const T *>(this)->getTLI();
190 }
191
192 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
193 switch (M) {
194 case TTI::MIM_Unindexed:
195 return ISD::UNINDEXED;
196 case TTI::MIM_PreInc:
197 return ISD::PRE_INC;
198 case TTI::MIM_PreDec:
199 return ISD::PRE_DEC;
200 case TTI::MIM_PostInc:
201 return ISD::POST_INC;
202 case TTI::MIM_PostDec:
203 return ISD::POST_DEC;
204 }
205 llvm_unreachable("Unexpected MemIndexedMode");
206 }
207
208 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
209 Align Alignment,
210 bool VariableMask,
211 bool IsGatherScatter,
212 TTI::TargetCostKind CostKind,
213 unsigned AddressSpace = 0) {
214 // We cannot scalarize scalable vectors, so return Invalid.
215 if (isa<ScalableVectorType>(DataTy))
216 return InstructionCost::getInvalid();
217
218 auto *VT = cast<FixedVectorType>(DataTy);
219 unsigned VF = VT->getNumElements();
220
221 // Assume the target does not have support for gather/scatter operations
222 // and provide a rough estimate.
223 //
224 // First, compute the cost of the individual memory operations.
225 InstructionCost AddrExtractCost =
226 IsGatherScatter ? getScalarizationOverhead(
227 FixedVectorType::get(
228 PointerType::get(VT->getContext(), 0), VF),
229 /*Insert=*/false, /*Extract=*/true, CostKind)
230 : 0;
231
232 // The cost of the scalar loads/stores.
233 InstructionCost MemoryOpCost =
234 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
235 AddressSpace, CostKind);
236
237 // Next, compute the cost of packing the result in a vector.
238 InstructionCost PackingCost =
239 getScalarizationOverhead(VT, Opcode != Instruction::Store,
240 Opcode == Instruction::Store, CostKind);
241
242 InstructionCost ConditionalCost = 0;
243 if (VariableMask) {
244 // Compute the cost of conditionally executing the memory operations with
245 // variable masks. This includes extracting the individual conditions, the
246 // branches, and the PHIs to combine the results.
247 // NOTE: Estimating the cost of conditionally executing the memory
248 // operations accurately is quite difficult and the current solution
249 // provides a very rough estimate only.
250 ConditionalCost =
251 getScalarizationOverhead(
252 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
253 /*Insert=*/false, /*Extract=*/true, CostKind) +
254 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
255 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
256 }
257
258 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
259 }
260
261 /// Checks if the provided mask \p Mask is a splat mask, i.e. it contains only
262 /// -1 or the same non -1 index value, and this index value appears at least twice.
263 /// So, mask <0, -1,-1, -1> is not considered splat (it is just identity), 264 /// same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is a splat 265 /// with \p Index=2. 266 static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) { 267 // Check that the broadcast index meets at least twice. 268 bool IsCompared = false; 269 if (int SplatIdx = PoisonMaskElem; 270 all_of(enumerate(Mask), [&](const auto &P) { 271 if (P.value() == PoisonMaskElem) 272 return P.index() != Mask.size() - 1 || IsCompared; 273 if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2) 274 return false; 275 if (SplatIdx == PoisonMaskElem) { 276 SplatIdx = P.value(); 277 return P.index() != Mask.size() - 1; 278 } 279 IsCompared = true; 280 return SplatIdx == P.value(); 281 })) { 282 Index = SplatIdx; 283 return true; 284 } 285 return false; 286 } 287 288 protected: 289 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) 290 : BaseT(DL) {} 291 virtual ~BasicTTIImplBase() = default; 292 293 using TargetTransformInfoImplBase::DL; 294 295 public: 296 /// \name Scalar TTI Implementations 297 /// @{ 298 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, 299 unsigned AddressSpace, Align Alignment, 300 unsigned *Fast) const { 301 EVT E = EVT::getIntegerVT(Context, BitWidth); 302 return getTLI()->allowsMisalignedMemoryAccesses( 303 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); 304 } 305 306 bool areInlineCompatible(const Function *Caller, 307 const Function *Callee) const { 308 const TargetMachine &TM = getTLI()->getTargetMachine(); 309 310 const FeatureBitset &CallerBits = 311 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 312 const FeatureBitset &CalleeBits = 313 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 314 315 // Inline a callee if its target-features are a subset of the callers 316 // target-features. 317 return (CallerBits & CalleeBits) == CalleeBits; 318 } 319 320 bool hasBranchDivergence(const Function *F = nullptr) { return false; } 321 322 bool isSourceOfDivergence(const Value *V) { return false; } 323 324 bool isAlwaysUniform(const Value *V) { return false; } 325 326 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 327 return false; 328 } 329 330 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { 331 return true; 332 } 333 334 unsigned getFlatAddressSpace() { 335 // Return an invalid address space. 
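// NOTE (assumption, not from this file): targets that really expose a flat
// address space (e.g. GPU-like targets) are expected to override this hook;
// the base implementation reports "none" via an invalid value.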
336 return -1; 337 } 338 339 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 340 Intrinsic::ID IID) const { 341 return false; 342 } 343 344 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 345 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 346 } 347 348 unsigned getAssumedAddrSpace(const Value *V) const { 349 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 350 } 351 352 bool isSingleThreaded() const { 353 return getTLI()->getTargetMachine().Options.ThreadModel == 354 ThreadModel::Single; 355 } 356 357 std::pair<const Value *, unsigned> 358 getPredicatedAddrSpace(const Value *V) const { 359 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); 360 } 361 362 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 363 Value *NewV) const { 364 return nullptr; 365 } 366 367 bool isLegalAddImmediate(int64_t imm) { 368 return getTLI()->isLegalAddImmediate(imm); 369 } 370 371 bool isLegalAddScalableImmediate(int64_t Imm) { 372 return getTLI()->isLegalAddScalableImmediate(Imm); 373 } 374 375 bool isLegalICmpImmediate(int64_t imm) { 376 return getTLI()->isLegalICmpImmediate(imm); 377 } 378 379 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 380 bool HasBaseReg, int64_t Scale, unsigned AddrSpace, 381 Instruction *I = nullptr, 382 int64_t ScalableOffset = 0) { 383 TargetLoweringBase::AddrMode AM; 384 AM.BaseGV = BaseGV; 385 AM.BaseOffs = BaseOffset; 386 AM.HasBaseReg = HasBaseReg; 387 AM.Scale = Scale; 388 AM.ScalableOffset = ScalableOffset; 389 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 390 } 391 392 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) { 393 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset); 394 } 395 396 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, 397 Type *ScalarValTy) const { 398 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { 399 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); 400 EVT VT = getTLI()->getValueType(DL, SrcTy); 401 if (getTLI()->isOperationLegal(ISD::STORE, VT) || 402 getTLI()->isOperationCustom(ISD::STORE, VT)) 403 return true; 404 405 EVT ValVT = 406 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); 407 EVT LegalizedVT = 408 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); 409 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); 410 }; 411 while (VF > 2 && IsSupportedByTarget(VF)) 412 VF /= 2; 413 return VF; 414 } 415 416 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 417 const DataLayout &DL) const { 418 EVT VT = getTLI()->getValueType(DL, Ty); 419 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 420 } 421 422 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 423 const DataLayout &DL) const { 424 EVT VT = getTLI()->getValueType(DL, Ty); 425 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); 426 } 427 428 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { 429 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); 430 } 431 432 bool isNumRegsMajorCostOfLSR() { 433 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); 434 } 435 436 bool shouldDropLSRSolutionIfLessProfitable() const { 437 return TargetTransformInfoImplBase::shouldDropLSRSolutionIfLessProfitable(); 438 } 439 440 bool isProfitableLSRChainElement(Instruction *I) { 441 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); 442 } 443 444 InstructionCost 
getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
445 StackOffset BaseOffset, bool HasBaseReg,
446 int64_t Scale, unsigned AddrSpace) {
447 TargetLoweringBase::AddrMode AM;
448 AM.BaseGV = BaseGV;
449 AM.BaseOffs = BaseOffset.getFixed();
450 AM.HasBaseReg = HasBaseReg;
451 AM.Scale = Scale;
452 AM.ScalableOffset = BaseOffset.getScalable();
453 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
454 return 0;
455 return -1;
456 }
457
458 bool isTruncateFree(Type *Ty1, Type *Ty2) {
459 return getTLI()->isTruncateFree(Ty1, Ty2);
460 }
461
462 bool isProfitableToHoist(Instruction *I) {
463 return getTLI()->isProfitableToHoist(I);
464 }
465
466 bool useAA() const { return getST()->useAA(); }
467
468 bool isTypeLegal(Type *Ty) {
469 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
470 return getTLI()->isTypeLegal(VT);
471 }
472
473 unsigned getRegUsageForType(Type *Ty) {
474 EVT ETy = getTLI()->getValueType(DL, Ty);
475 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
476 }
477
478 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
479 ArrayRef<const Value *> Operands, Type *AccessType,
480 TTI::TargetCostKind CostKind) {
481 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
482 }
483
484 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
485 unsigned &JumpTableSize,
486 ProfileSummaryInfo *PSI,
487 BlockFrequencyInfo *BFI) {
488 /// Try to find the estimated number of clusters. Note that the number of
489 /// clusters identified in this function could be different from the actual
490 /// numbers found in lowering. This function ignores switches that are
491 /// lowered with a mix of jump table / bit test / BTree. This function was
492 /// initially intended to be used when estimating the cost of a switch in the
493 /// inline cost heuristic, but it is a generic cost model intended to be used
494 /// in other places as well (e.g., in loop unrolling).
495 unsigned N = SI.getNumCases();
496 const TargetLoweringBase *TLI = getTLI();
497 const DataLayout &DL = this->getDataLayout();
498
499 JumpTableSize = 0;
500 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
501
502 // Early exit if both a jump table and bit test are not allowed.
503 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
504 return N;
505
506 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
507 APInt MinCaseVal = MaxCaseVal;
508 for (auto CI : SI.cases()) {
509 const APInt &CaseVal = CI.getCaseValue()->getValue();
510 if (CaseVal.sgt(MaxCaseVal))
511 MaxCaseVal = CaseVal;
512 if (CaseVal.slt(MinCaseVal))
513 MinCaseVal = CaseVal;
514 }
515
516 // Check if suitable for a bit test.
517 if (N <= DL.getIndexSizeInBits(0u)) {
518 SmallPtrSet<const BasicBlock *, 4> Dests;
519 for (auto I : SI.cases())
520 Dests.insert(I.getCaseSuccessor());
521
522 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
523 DL))
524 return 1;
525 }
526
527 // Check if suitable for a jump table.
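// Illustrative example (not from the source): a switch with case values
// {0, 2, 3, 5, 9} has N = 5 and Range = 9 - 0 + 1 = 10; whether that is
// dense enough is delegated to TLI->isSuitableForJumpTable() below.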
528 if (IsJTAllowed) {
529 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
530 return N;
531 uint64_t Range =
532 (MaxCaseVal - MinCaseVal)
533 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
534 // Check whether a range of clusters is dense enough for a jump table.
535 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
536 JumpTableSize = Range;
537 return 1;
538 }
539 }
540 return N;
541 }
542
543 bool shouldBuildLookupTables() {
544 const TargetLoweringBase *TLI = getTLI();
545 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
546 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
547 }
548
549 bool shouldBuildRelLookupTables() const {
550 const TargetMachine &TM = getTLI()->getTargetMachine();
551 // In non-PIC mode, do not generate a relative lookup table.
552 if (!TM.isPositionIndependent())
553 return false;
554
555 /// Relative lookup table entries consist of 32-bit offsets.
556 /// Do not generate relative lookup tables for large code models
557 /// in 64-bit architectures where 32-bit offsets might not be enough.
558 if (TM.getCodeModel() == CodeModel::Medium ||
559 TM.getCodeModel() == CodeModel::Large)
560 return false;
561
562 const Triple &TargetTriple = TM.getTargetTriple();
563 if (!TargetTriple.isArch64Bit())
564 return false;
565
566 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
567 // there.
568 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
569 return false;
570
571 return true;
572 }
573
574 bool haveFastSqrt(Type *Ty) {
575 const TargetLoweringBase *TLI = getTLI();
576 EVT VT = TLI->getValueType(DL, Ty);
577 return TLI->isTypeLegal(VT) &&
578 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
579 }
580
581 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
582 return true;
583 }
584
585 InstructionCost getFPOpCost(Type *Ty) {
586 // Check whether FADD is available, as a proxy for floating-point in
587 // general.
588 const TargetLoweringBase *TLI = getTLI();
589 EVT VT = TLI->getValueType(DL, Ty);
590 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
591 return TargetTransformInfo::TCC_Basic;
592 return TargetTransformInfo::TCC_Expensive;
593 }
594
595 bool preferToKeepConstantsAttached(const Instruction &Inst,
596 const Function &Fn) const {
597 switch (Inst.getOpcode()) {
598 default:
599 break;
600 case Instruction::SDiv:
601 case Instruction::SRem:
602 case Instruction::UDiv:
603 case Instruction::URem: {
604 if (!isa<ConstantInt>(Inst.getOperand(1)))
605 return false;
606 EVT VT = getTLI()->getValueType(DL, Inst.getType());
607 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
608 }
609 };
610
611 return false;
612 }
613
614 unsigned getInliningThresholdMultiplier() const { return 1; }
615 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
616 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
617 return 0;
618 }
619
620 int getInlinerVectorBonusPercent() const { return 150; }
621
622 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
623 TTI::UnrollingPreferences &UP,
624 OptimizationRemarkEmitter *ORE) {
625 // This unrolling functionality is target independent, but to provide some
626 // motivation for its intended use, consider x86:
627
628 // According to the Intel 64 and IA-32 Architectures Optimization Reference
629 // Manual, Intel Core models and later have a loop stream detector (and
630 // associated uop queue) that can benefit from partial unrolling.
631 // The relevant requirements are: 632 // - The loop must have no more than 4 (8 for Nehalem and later) branches 633 // taken, and none of them may be calls. 634 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 635 636 // According to the Software Optimization Guide for AMD Family 15h 637 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 638 // and loop buffer which can benefit from partial unrolling. 639 // The relevant requirements are: 640 // - The loop must have fewer than 16 branches 641 // - The loop must have less than 40 uops in all executed loop branches 642 643 // The number of taken branches in a loop is hard to estimate here, and 644 // benchmarking has revealed that it is better not to be conservative when 645 // estimating the branch count. As a result, we'll ignore the branch limits 646 // until someone finds a case where it matters in practice. 647 648 unsigned MaxOps; 649 const TargetSubtargetInfo *ST = getST(); 650 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 651 MaxOps = PartialUnrollingThreshold; 652 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 653 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 654 else 655 return; 656 657 // Scan the loop: don't unroll loops with calls. 658 for (BasicBlock *BB : L->blocks()) { 659 for (Instruction &I : *BB) { 660 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 661 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 662 if (!thisT()->isLoweredToCall(F)) 663 continue; 664 } 665 666 if (ORE) { 667 ORE->emit([&]() { 668 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), 669 L->getHeader()) 670 << "advising against unrolling the loop because it " 671 "contains a " 672 << ore::NV("Call", &I); 673 }); 674 } 675 return; 676 } 677 } 678 } 679 680 // Enable runtime and partial unrolling up to the specified size. 681 // Enable using trip count upper bound to unroll loops. 682 UP.Partial = UP.Runtime = UP.UpperBound = true; 683 UP.PartialThreshold = MaxOps; 684 685 // Avoid unrolling when optimizing for size. 686 UP.OptSizeThreshold = 0; 687 UP.PartialOptSizeThreshold = 0; 688 689 // Set number of instructions optimized when "back edge" 690 // becomes "fall through" to default value of 2. 
691 UP.BEInsns = 2; 692 } 693 694 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 695 TTI::PeelingPreferences &PP) { 696 PP.PeelCount = 0; 697 PP.AllowPeeling = true; 698 PP.AllowLoopNestsPeeling = false; 699 PP.PeelProfiledIterations = true; 700 } 701 702 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 703 AssumptionCache &AC, 704 TargetLibraryInfo *LibInfo, 705 HardwareLoopInfo &HWLoopInfo) { 706 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 707 } 708 709 unsigned getEpilogueVectorizationMinVF() { 710 return BaseT::getEpilogueVectorizationMinVF(); 711 } 712 713 bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { 714 return BaseT::preferPredicateOverEpilogue(TFI); 715 } 716 717 TailFoldingStyle 718 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) { 719 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow); 720 } 721 722 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 723 IntrinsicInst &II) { 724 return BaseT::instCombineIntrinsic(IC, II); 725 } 726 727 std::optional<Value *> 728 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, 729 APInt DemandedMask, KnownBits &Known, 730 bool &KnownBitsComputed) { 731 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 732 KnownBitsComputed); 733 } 734 735 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( 736 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 737 APInt &UndefElts2, APInt &UndefElts3, 738 std::function<void(Instruction *, unsigned, APInt, APInt &)> 739 SimplifyAndSetOp) { 740 return BaseT::simplifyDemandedVectorEltsIntrinsic( 741 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 742 SimplifyAndSetOp); 743 } 744 745 virtual std::optional<unsigned> 746 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 747 return std::optional<unsigned>( 748 getST()->getCacheSize(static_cast<unsigned>(Level))); 749 } 750 751 virtual std::optional<unsigned> 752 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 753 std::optional<unsigned> TargetResult = 754 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 755 756 if (TargetResult) 757 return TargetResult; 758 759 return BaseT::getCacheAssociativity(Level); 760 } 761 762 virtual unsigned getCacheLineSize() const { 763 return getST()->getCacheLineSize(); 764 } 765 766 virtual unsigned getPrefetchDistance() const { 767 return getST()->getPrefetchDistance(); 768 } 769 770 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 771 unsigned NumStridedMemAccesses, 772 unsigned NumPrefetches, 773 bool HasCall) const { 774 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 775 NumPrefetches, HasCall); 776 } 777 778 virtual unsigned getMaxPrefetchIterationsAhead() const { 779 return getST()->getMaxPrefetchIterationsAhead(); 780 } 781 782 virtual bool enableWritePrefetching() const { 783 return getST()->enableWritePrefetching(); 784 } 785 786 virtual bool shouldPrefetchAddressSpace(unsigned AS) const { 787 return getST()->shouldPrefetchAddressSpace(AS); 788 } 789 790 /// @} 791 792 /// \name Vector TTI Implementations 793 /// @{ 794 795 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 796 return TypeSize::getFixed(32); 797 } 798 799 std::optional<unsigned> getMaxVScale() const { return std::nullopt; } 800 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; } 801 bool isVScaleKnownToBeAPowerOfTwo() const { return false; } 
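// A small worked example for the scalarization helpers that follow
// (illustrative only, not taken from the implementation): for a <4 x i32>
// with DemandedElts = 0b0101, Insert = true and Extract = false, the
// overhead is two insertelement costs (lanes 0 and 2); lanes that are not
// demanded contribute nothing.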
802
803 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
804 /// are set if the demanded result elements need to be inserted and/or
805 /// extracted from vectors.
806 InstructionCost getScalarizationOverhead(VectorType *InTy,
807 const APInt &DemandedElts,
808 bool Insert, bool Extract,
809 TTI::TargetCostKind CostKind,
810 ArrayRef<Value *> VL = {}) {
811 /// FIXME: a bitfield is not a reasonable abstraction for talking about
812 /// which elements are needed from a scalable vector.
813 if (isa<ScalableVectorType>(InTy))
814 return InstructionCost::getInvalid();
815 auto *Ty = cast<FixedVectorType>(InTy);
816
817 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
818 (VL.empty() || VL.size() == Ty->getNumElements()) &&
819 "Vector size mismatch");
820
821 InstructionCost Cost = 0;
822
823 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
824 if (!DemandedElts[i])
825 continue;
826 if (Insert) {
827 Value *InsertedVal = VL.empty() ? nullptr : VL[i];
828 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
829 CostKind, i, nullptr, InsertedVal);
830 }
831 if (Extract)
832 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
833 CostKind, i, nullptr, nullptr);
834 }
835
836 return Cost;
837 }
838
839 bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const {
840 return false;
841 }
842
843 bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
844 unsigned ScalarOpdIdx) const {
845 return false;
846 }
847
848 bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
849 int OpdIdx) const {
850 return OpdIdx == -1;
851 }
852
853 bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID,
854 int RetIdx) const {
855 return RetIdx == 0;
856 }
857
858 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
859 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
860 bool Extract,
861 TTI::TargetCostKind CostKind) {
862 if (isa<ScalableVectorType>(InTy))
863 return InstructionCost::getInvalid();
864 auto *Ty = cast<FixedVectorType>(InTy);
865
866 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
867 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
868 CostKind);
869 }
870
871 /// Estimate the overhead of scalarizing an instruction's unique
872 /// non-constant operands. The (potentially vector) types to use for each
873 /// argument are passed via Tys.
874 InstructionCost
875 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
876 ArrayRef<Type *> Tys,
877 TTI::TargetCostKind CostKind) {
878 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
879
880 InstructionCost Cost = 0;
881 SmallPtrSet<const Value*, 4> UniqueOperands;
882 for (int I = 0, E = Args.size(); I != E; I++) {
883 // Disregard things like metadata arguments.
884 const Value *A = Args[I];
885 Type *Ty = Tys[I];
886 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
887 !Ty->isPtrOrPtrVectorTy())
888 continue;
889
890 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
891 if (auto *VecTy = dyn_cast<VectorType>(Ty))
892 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
893 /*Extract*/ true, CostKind);
894 }
895 }
896
897 return Cost;
898 }
899
900 /// Estimate the overhead of scalarizing the inputs and outputs of an
901 /// instruction, with return type RetTy and arguments Args of type Tys.
If 902 /// Args are unknown (empty), then the cost associated with one argument is 903 /// added as a heuristic. 904 InstructionCost getScalarizationOverhead(VectorType *RetTy, 905 ArrayRef<const Value *> Args, 906 ArrayRef<Type *> Tys, 907 TTI::TargetCostKind CostKind) { 908 InstructionCost Cost = getScalarizationOverhead( 909 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind); 910 if (!Args.empty()) 911 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind); 912 else 913 // When no information on arguments is provided, we add the cost 914 // associated with one argument as a heuristic. 915 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false, 916 /*Extract*/ true, CostKind); 917 918 return Cost; 919 } 920 921 /// Estimate the cost of type-legalization and the legalized type. 922 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const { 923 LLVMContext &C = Ty->getContext(); 924 EVT MTy = getTLI()->getValueType(DL, Ty); 925 926 InstructionCost Cost = 1; 927 // We keep legalizing the type until we find a legal kind. We assume that 928 // the only operation that costs anything is the split. After splitting 929 // we need to handle two types. 930 while (true) { 931 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); 932 933 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { 934 // Ensure we return a sensible simple VT here, since many callers of 935 // this function require it. 936 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64; 937 return std::make_pair(InstructionCost::getInvalid(), VT); 938 } 939 940 if (LK.first == TargetLoweringBase::TypeLegal) 941 return std::make_pair(Cost, MTy.getSimpleVT()); 942 943 if (LK.first == TargetLoweringBase::TypeSplitVector || 944 LK.first == TargetLoweringBase::TypeExpandInteger) 945 Cost *= 2; 946 947 // Do not loop with f128 type. 948 if (MTy == LK.second) 949 return std::make_pair(Cost, MTy.getSimpleVT()); 950 951 // Keep legalizing the type. 952 MTy = LK.second; 953 } 954 } 955 956 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } 957 958 InstructionCost getArithmeticInstrCost( 959 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 960 TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None}, 961 TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None}, 962 ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) { 963 // Check if any of the operands are vector operands. 964 const TargetLoweringBase *TLI = getTLI(); 965 int ISD = TLI->InstructionOpcodeToISD(Opcode); 966 assert(ISD && "Invalid opcode"); 967 968 // TODO: Handle more cost kinds. 969 if (CostKind != TTI::TCK_RecipThroughput) 970 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 971 Opd1Info, Opd2Info, 972 Args, CxtI); 973 974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 975 976 bool IsFloat = Ty->isFPOrFPVectorTy(); 977 // Assume that floating point arithmetic operations cost twice as much as 978 // integer operations. 979 InstructionCost OpCost = (IsFloat ? 2 : 1); 980 981 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 982 // The operation is legal. Assume it costs 1. 983 // TODO: Once we have extract/insert subvector cost we need to use them. 984 return LT.first * OpCost; 985 } 986 987 if (!TLI->isOperationExpand(ISD, LT.second)) { 988 // If the operation is custom lowered, then assume that the code is twice 989 // as expensive. 
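// E.g. (rough sketch) a floating-point opcode (OpCost == 2) on a type that
// legalization split once (LT.first == 2) is costed as 2 * 2 * 2 = 8 here.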
990 return LT.first * 2 * OpCost; 991 } 992 993 // An 'Expand' of URem and SRem is special because it may default 994 // to expanding the operation into a sequence of sub-operations 995 // i.e. X % Y -> X-(X/Y)*Y. 996 if (ISD == ISD::UREM || ISD == ISD::SREM) { 997 bool IsSigned = ISD == ISD::SREM; 998 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, 999 LT.second) || 1000 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, 1001 LT.second)) { 1002 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; 1003 InstructionCost DivCost = thisT()->getArithmeticInstrCost( 1004 DivOpc, Ty, CostKind, Opd1Info, Opd2Info); 1005 InstructionCost MulCost = 1006 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); 1007 InstructionCost SubCost = 1008 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 1009 return DivCost + MulCost + SubCost; 1010 } 1011 } 1012 1013 // We cannot scalarize scalable vectors, so return Invalid. 1014 if (isa<ScalableVectorType>(Ty)) 1015 return InstructionCost::getInvalid(); 1016 1017 // Else, assume that we need to scalarize this op. 1018 // TODO: If one of the types get legalized by splitting, handle this 1019 // similarly to what getCastInstrCost() does. 1020 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 1021 InstructionCost Cost = thisT()->getArithmeticInstrCost( 1022 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 1023 Args, CxtI); 1024 // Return the cost of multiple scalar invocation plus the cost of 1025 // inserting and extracting the values. 1026 SmallVector<Type *> Tys(Args.size(), Ty); 1027 return getScalarizationOverhead(VTy, Args, Tys, CostKind) + 1028 VTy->getNumElements() * Cost; 1029 } 1030 1031 // We don't know anything about this scalar instruction. 
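// Fall back to the base per-operation estimate computed above (OpCost is 1
// for integer and 2 for floating-point operations).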
1032 return OpCost; 1033 } 1034 1035 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 1036 ArrayRef<int> Mask, 1037 VectorType *Ty, int &Index, 1038 VectorType *&SubTy) const { 1039 if (Mask.empty()) 1040 return Kind; 1041 int NumSrcElts = Ty->getElementCount().getKnownMinValue(); 1042 switch (Kind) { 1043 case TTI::SK_PermuteSingleSrc: { 1044 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) 1045 return TTI::SK_Reverse; 1046 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) 1047 return TTI::SK_Broadcast; 1048 if (isSplatMask(Mask, NumSrcElts, Index)) 1049 return TTI::SK_Broadcast; 1050 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && 1051 (Index + Mask.size()) <= (size_t)NumSrcElts) { 1052 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size()); 1053 return TTI::SK_ExtractSubvector; 1054 } 1055 break; 1056 } 1057 case TTI::SK_PermuteTwoSrc: { 1058 int NumSubElts; 1059 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( 1060 Mask, NumSrcElts, NumSubElts, Index)) { 1061 if (Index + NumSubElts > NumSrcElts) 1062 return Kind; 1063 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts); 1064 return TTI::SK_InsertSubvector; 1065 } 1066 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts)) 1067 return TTI::SK_Select; 1068 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts)) 1069 return TTI::SK_Transpose; 1070 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index)) 1071 return TTI::SK_Splice; 1072 break; 1073 } 1074 case TTI::SK_Select: 1075 case TTI::SK_Reverse: 1076 case TTI::SK_Broadcast: 1077 case TTI::SK_Transpose: 1078 case TTI::SK_InsertSubvector: 1079 case TTI::SK_ExtractSubvector: 1080 case TTI::SK_Splice: 1081 break; 1082 } 1083 return Kind; 1084 } 1085 1086 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 1087 ArrayRef<int> Mask, 1088 TTI::TargetCostKind CostKind, int Index, 1089 VectorType *SubTp, 1090 ArrayRef<const Value *> Args = {}, 1091 const Instruction *CxtI = nullptr) { 1092 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) { 1093 case TTI::SK_Broadcast: 1094 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1095 return getBroadcastShuffleOverhead(FVT, CostKind); 1096 return InstructionCost::getInvalid(); 1097 case TTI::SK_Select: 1098 case TTI::SK_Splice: 1099 case TTI::SK_Reverse: 1100 case TTI::SK_Transpose: 1101 case TTI::SK_PermuteSingleSrc: 1102 case TTI::SK_PermuteTwoSrc: 1103 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1104 return getPermuteShuffleOverhead(FVT, CostKind); 1105 return InstructionCost::getInvalid(); 1106 case TTI::SK_ExtractSubvector: 1107 return getExtractSubvectorOverhead(Tp, CostKind, Index, 1108 cast<FixedVectorType>(SubTp)); 1109 case TTI::SK_InsertSubvector: 1110 return getInsertSubvectorOverhead(Tp, CostKind, Index, 1111 cast<FixedVectorType>(SubTp)); 1112 } 1113 llvm_unreachable("Unknown TTI::ShuffleKind"); 1114 } 1115 1116 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 1117 TTI::CastContextHint CCH, 1118 TTI::TargetCostKind CostKind, 1119 const Instruction *I = nullptr) { 1120 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 1121 return 0; 1122 1123 const TargetLoweringBase *TLI = getTLI(); 1124 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1125 assert(ISD && "Invalid opcode"); 1126 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 1127 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst); 1128 1129 TypeSize SrcSize = 
SrcLT.second.getSizeInBits(); 1130 TypeSize DstSize = DstLT.second.getSizeInBits(); 1131 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 1132 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 1133 1134 switch (Opcode) { 1135 default: 1136 break; 1137 case Instruction::Trunc: 1138 // Check for NOOP conversions. 1139 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 1140 return 0; 1141 [[fallthrough]]; 1142 case Instruction::BitCast: 1143 // Bitcast between types that are legalized to the same type are free and 1144 // assume int to/from ptr of the same size is also free. 1145 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 1146 SrcSize == DstSize) 1147 return 0; 1148 break; 1149 case Instruction::FPExt: 1150 if (I && getTLI()->isExtFree(I)) 1151 return 0; 1152 break; 1153 case Instruction::ZExt: 1154 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 1155 return 0; 1156 [[fallthrough]]; 1157 case Instruction::SExt: 1158 if (I && getTLI()->isExtFree(I)) 1159 return 0; 1160 1161 // If this is a zext/sext of a load, return 0 if the corresponding 1162 // extending load exists on target and the result type is legal. 1163 if (CCH == TTI::CastContextHint::Normal) { 1164 EVT ExtVT = EVT::getEVT(Dst); 1165 EVT LoadVT = EVT::getEVT(Src); 1166 unsigned LType = 1167 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); 1168 if (DstLT.first == SrcLT.first && 1169 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 1170 return 0; 1171 } 1172 break; 1173 case Instruction::AddrSpaceCast: 1174 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 1175 Dst->getPointerAddressSpace())) 1176 return 0; 1177 break; 1178 } 1179 1180 auto *SrcVTy = dyn_cast<VectorType>(Src); 1181 auto *DstVTy = dyn_cast<VectorType>(Dst); 1182 1183 // If the cast is marked as legal (or promote) then assume low cost. 1184 if (SrcLT.first == DstLT.first && 1185 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 1186 return SrcLT.first; 1187 1188 // Handle scalar conversions. 1189 if (!SrcVTy && !DstVTy) { 1190 // Just check the op cost. If the operation is legal then assume it costs 1191 // 1. 1192 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1193 return 1; 1194 1195 // Assume that illegal scalar instruction are expensive. 1196 return 4; 1197 } 1198 1199 // Check vector-to-vector casts. 1200 if (DstVTy && SrcVTy) { 1201 // If the cast is between same-sized registers, then the check is simple. 1202 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 1203 1204 // Assume that Zext is done using AND. 1205 if (Opcode == Instruction::ZExt) 1206 return SrcLT.first; 1207 1208 // Assume that sext is done using SHL and SRA. 1209 if (Opcode == Instruction::SExt) 1210 return SrcLT.first * 2; 1211 1212 // Just check the op cost. If the operation is legal then assume it 1213 // costs 1214 // 1 and multiply by the type-legalization overhead. 1215 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1216 return SrcLT.first * 1; 1217 } 1218 1219 // If we are legalizing by splitting, query the concrete TTI for the cost 1220 // of casting the original vector twice. We also need to factor in the 1221 // cost of the split itself. Count that as 1, to be consistent with 1222 // getTypeLegalizationCost(). 
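// For instance (illustrative, target dependent): casting <8 x i16> to
// <8 x i32> on a target that splits the destination type is costed as
// 1 (the split) + 2 * getCastInstrCost(<4 x i16> -> <4 x i32>).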
1223 bool SplitSrc = 1224 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 1225 TargetLowering::TypeSplitVector; 1226 bool SplitDst = 1227 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 1228 TargetLowering::TypeSplitVector; 1229 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 1230 DstVTy->getElementCount().isVector()) { 1231 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 1232 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 1233 T *TTI = static_cast<T *>(this); 1234 // If both types need to be split then the split is free. 1235 InstructionCost SplitCost = 1236 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 1237 return SplitCost + 1238 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 1239 CostKind, I)); 1240 } 1241 1242 // Scalarization cost is Invalid, can't assume any num elements. 1243 if (isa<ScalableVectorType>(DstVTy)) 1244 return InstructionCost::getInvalid(); 1245 1246 // In other cases where the source or destination are illegal, assume 1247 // the operation will get scalarized. 1248 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 1249 InstructionCost Cost = thisT()->getCastInstrCost( 1250 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 1251 1252 // Return the cost of multiple scalar invocation plus the cost of 1253 // inserting and extracting the values. 1254 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true, 1255 CostKind) + 1256 Num * Cost; 1257 } 1258 1259 // We already handled vector-to-vector and scalar-to-scalar conversions. 1260 // This 1261 // is where we handle bitcast between vectors and scalars. We need to assume 1262 // that the conversion is scalarized in one way or another. 1263 if (Opcode == Instruction::BitCast) { 1264 // Illegal bitcasts are done by storing and loading from a stack slot. 1265 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false, 1266 /*Extract*/ true, CostKind) 1267 : 0) + 1268 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true, 1269 /*Extract*/ false, CostKind) 1270 : 0); 1271 } 1272 1273 llvm_unreachable("Unhandled cast"); 1274 } 1275 1276 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1277 VectorType *VecTy, unsigned Index) { 1278 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1279 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1280 CostKind, Index, nullptr, nullptr) + 1281 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1282 TTI::CastContextHint::None, CostKind); 1283 } 1284 1285 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1286 const Instruction *I = nullptr) { 1287 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1288 } 1289 1290 InstructionCost getCmpSelInstrCost( 1291 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, 1292 TTI::TargetCostKind CostKind, 1293 TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None}, 1294 TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None}, 1295 const Instruction *I = nullptr) { 1296 const TargetLoweringBase *TLI = getTLI(); 1297 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1298 assert(ISD && "Invalid opcode"); 1299 1300 // TODO: Handle other cost kinds. 
1301 if (CostKind != TTI::TCK_RecipThroughput) 1302 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1303 Op1Info, Op2Info, I); 1304 1305 // Selects on vectors are actually vector selects. 1306 if (ISD == ISD::SELECT) { 1307 assert(CondTy && "CondTy must exist"); 1308 if (CondTy->isVectorTy()) 1309 ISD = ISD::VSELECT; 1310 } 1311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1312 1313 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1314 !TLI->isOperationExpand(ISD, LT.second)) { 1315 // The operation is legal. Assume it costs 1. Multiply 1316 // by the type-legalization overhead. 1317 return LT.first * 1; 1318 } 1319 1320 // Otherwise, assume that the cast is scalarized. 1321 // TODO: If one of the types get legalized by splitting, handle this 1322 // similarly to what getCastInstrCost() does. 1323 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1324 if (isa<ScalableVectorType>(ValTy)) 1325 return InstructionCost::getInvalid(); 1326 1327 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1328 if (CondTy) 1329 CondTy = CondTy->getScalarType(); 1330 InstructionCost Cost = 1331 thisT()->getCmpSelInstrCost(Opcode, ValVTy->getScalarType(), CondTy, 1332 VecPred, CostKind, Op1Info, Op2Info, I); 1333 1334 // Return the cost of multiple scalar invocation plus the cost of 1335 // inserting and extracting the values. 1336 return getScalarizationOverhead(ValVTy, /*Insert*/ true, 1337 /*Extract*/ false, CostKind) + 1338 Num * Cost; 1339 } 1340 1341 // Unknown scalar opcode. 1342 return 1; 1343 } 1344 1345 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1346 TTI::TargetCostKind CostKind, 1347 unsigned Index, Value *Op0, Value *Op1) { 1348 return getRegUsageForType(Val->getScalarType()); 1349 } 1350 1351 /// \param ScalarUserAndIdx encodes the information about extracts from a 1352 /// vector with 'Scalar' being the value being extracted,'User' being the user 1353 /// of the extract(nullptr if user is not known before vectorization) and 1354 /// 'Idx' being the extract lane. 1355 InstructionCost getVectorInstrCost( 1356 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, 1357 Value *Scalar, 1358 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) { 1359 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr, 1360 nullptr); 1361 } 1362 1363 InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, 1364 TTI::TargetCostKind CostKind, 1365 unsigned Index) { 1366 Value *Op0 = nullptr; 1367 Value *Op1 = nullptr; 1368 if (auto *IE = dyn_cast<InsertElementInst>(&I)) { 1369 Op0 = IE->getOperand(0); 1370 Op1 = IE->getOperand(1); 1371 } 1372 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, 1373 Op1); 1374 } 1375 1376 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 1377 int VF, 1378 const APInt &DemandedDstElts, 1379 TTI::TargetCostKind CostKind) { 1380 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && 1381 "Unexpected size of DemandedDstElts."); 1382 1383 InstructionCost Cost; 1384 1385 auto *SrcVT = FixedVectorType::get(EltTy, VF); 1386 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); 1387 1388 // The Mask shuffling cost is extract all the elements of the Mask 1389 // and insert each of them Factor times into the wide vector: 1390 // 1391 // E.g. 
an interleaved group with factor 3: 1392 // %mask = icmp ult <8 x i32> %vec1, %vec2 1393 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1394 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1395 // The cost is estimated as extract all mask elements from the <8xi1> mask 1396 // vector and insert them factor times into the <24xi1> shuffled mask 1397 // vector. 1398 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); 1399 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, 1400 /*Insert*/ false, 1401 /*Extract*/ true, CostKind); 1402 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, 1403 /*Insert*/ true, 1404 /*Extract*/ false, CostKind); 1405 1406 return Cost; 1407 } 1408 1409 InstructionCost 1410 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, 1411 unsigned AddressSpace, TTI::TargetCostKind CostKind, 1412 TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, 1413 const Instruction *I = nullptr) { 1414 assert(!Src->isVoidTy() && "Invalid type"); 1415 // Assume types, such as structs, are expensive. 1416 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1417 return 4; 1418 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 1419 1420 // Assuming that all loads of legal types cost 1. 1421 InstructionCost Cost = LT.first; 1422 if (CostKind != TTI::TCK_RecipThroughput) 1423 return Cost; 1424 1425 const DataLayout &DL = this->getDataLayout(); 1426 if (Src->isVectorTy() && 1427 // In practice it's not currently possible to have a change in lane 1428 // length for extending loads or truncating stores so both types should 1429 // have the same scalable property. 1430 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), 1431 LT.second.getSizeInBits())) { 1432 // This is a vector load that legalizes to a larger type than the vector 1433 // itself. Unless the corresponding extending load or truncating store is 1434 // legal, then this will scalarize. 1435 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1436 EVT MemVT = getTLI()->getValueType(DL, Src); 1437 if (Opcode == Instruction::Store) 1438 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1439 else 1440 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1441 1442 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1443 // This is a vector load/store for some illegal type that is scalarized. 1444 // We must account for the cost of building or decomposing the vector. 1445 Cost += getScalarizationOverhead( 1446 cast<VectorType>(Src), Opcode != Instruction::Store, 1447 Opcode == Instruction::Store, CostKind); 1448 } 1449 } 1450 1451 return Cost; 1452 } 1453 1454 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1455 Align Alignment, unsigned AddressSpace, 1456 TTI::TargetCostKind CostKind) { 1457 // TODO: Pass on AddressSpace when we have test coverage. 
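// With no native masked load/store assumed, the common helper scalarizes
// the operation: per-lane scalar memory ops plus mask-element extracts and
// a branch/PHI pair per lane (see getCommonMaskedMemoryOpCost above).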
1458 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1459 CostKind); 1460 } 1461 1462 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1463 const Value *Ptr, bool VariableMask, 1464 Align Alignment, 1465 TTI::TargetCostKind CostKind, 1466 const Instruction *I = nullptr) { 1467 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1468 true, CostKind); 1469 } 1470 1471 InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, 1472 const Value *Ptr, bool VariableMask, 1473 Align Alignment, 1474 TTI::TargetCostKind CostKind, 1475 const Instruction *I) { 1476 // For a target without strided memory operations (or for an illegal 1477 // operation type on one which does), assume we lower to a gather/scatter 1478 // operation. (Which may in turn be scalarized.) 1479 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1480 Alignment, CostKind, I); 1481 } 1482 1483 InstructionCost getInterleavedMemoryOpCost( 1484 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1485 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1486 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1487 1488 // We cannot scalarize scalable vectors, so return Invalid. 1489 if (isa<ScalableVectorType>(VecTy)) 1490 return InstructionCost::getInvalid(); 1491 1492 auto *VT = cast<FixedVectorType>(VecTy); 1493 1494 unsigned NumElts = VT->getNumElements(); 1495 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1496 1497 unsigned NumSubElts = NumElts / Factor; 1498 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1499 1500 // Firstly, the cost of load/store operation. 1501 InstructionCost Cost; 1502 if (UseMaskForCond || UseMaskForGaps) 1503 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1504 AddressSpace, CostKind); 1505 else 1506 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1507 CostKind); 1508 1509 // Legalize the vector type, and get the legalized and unlegalized type 1510 // sizes. 1511 MVT VecTyLT = getTypeLegalizationCost(VecTy).second; 1512 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); 1513 unsigned VecTyLTSize = VecTyLT.getStoreSize(); 1514 1515 // Scale the cost of the memory operation by the fraction of legalized 1516 // instructions that will actually be used. We shouldn't account for the 1517 // cost of dead instructions since they will be removed. 1518 // 1519 // E.g., An interleaved load of factor 8: 1520 // %vec = load <16 x i64>, <16 x i64>* %ptr 1521 // %v0 = shufflevector %vec, undef, <0, 8> 1522 // 1523 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be 1524 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized 1525 // type). The other loads are unused. 1526 // 1527 // TODO: Note that legalization can turn masked loads/stores into unmasked 1528 // (legalized) loads/stores. This can be reflected in the cost. 1529 if (Cost.isValid() && VecTySize > VecTyLTSize) { 1530 // The number of loads of a legal type it will take to represent a load 1531 // of the unlegalized vector type. 1532 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); 1533 1534 // The number of elements of the unlegalized type that correspond to a 1535 // single legal instruction. 1536 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); 1537 1538 // Determine which legal instructions will be used. 
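// Continuing the factor-8 example above: NumLegalInsts = 8 and
// NumEltsPerLegalInst = 2, so index 0 touches elements 0 and 8, i.e. legal
// loads 0 and 4; UsedInsts.count() == 2 and the memory cost is scaled by
// roughly 2/8.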
1539 BitVector UsedInsts(NumLegalInsts, false); 1540 for (unsigned Index : Indices) 1541 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) 1542 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); 1543 1544 // Scale the cost of the load by the fraction of legal instructions that 1545 // will be used. 1546 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); 1547 } 1548 1549 // Then plus the cost of interleave operation. 1550 assert(Indices.size() <= Factor && 1551 "Interleaved memory op has too many members"); 1552 1553 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); 1554 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); 1555 1556 APInt DemandedLoadStoreElts = APInt::getZero(NumElts); 1557 for (unsigned Index : Indices) { 1558 assert(Index < Factor && "Invalid index for interleaved memory op"); 1559 for (unsigned Elm = 0; Elm < NumSubElts; Elm++) 1560 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 1561 } 1562 1563 if (Opcode == Instruction::Load) { 1564 // The interleave cost is similar to extract sub vectors' elements 1565 // from the wide vector, and insert them into sub vectors. 1566 // 1567 // E.g. An interleaved load of factor 2 (with one member of index 0): 1568 // %vec = load <8 x i32>, <8 x i32>* %ptr 1569 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 1570 // The cost is estimated as extract elements at 0, 2, 4, 6 from the 1571 // <8 x i32> vector and insert them into a <4 x i32> vector. 1572 InstructionCost InsSubCost = thisT()->getScalarizationOverhead( 1573 SubVT, DemandedAllSubElts, 1574 /*Insert*/ true, /*Extract*/ false, CostKind); 1575 Cost += Indices.size() * InsSubCost; 1576 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1577 /*Insert*/ false, 1578 /*Extract*/ true, CostKind); 1579 } else { 1580 // The interleave cost is extract elements from sub vectors, and 1581 // insert them into the wide vector. 1582 // 1583 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: 1584 // (using VF=4): 1585 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> 1586 // %gaps.mask = <true, true, false, true, true, false, 1587 // true, true, false, true, true, false> 1588 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, 1589 // i32 Align, <12 x i1> %gaps.mask 1590 // The cost is estimated as extract all elements (of actual members, 1591 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x 1592 // i32> vector. 1593 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( 1594 SubVT, DemandedAllSubElts, 1595 /*Insert*/ false, /*Extract*/ true, CostKind); 1596 Cost += ExtSubCost * Indices.size(); 1597 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1598 /*Insert*/ true, 1599 /*Extract*/ false, CostKind); 1600 } 1601 1602 if (!UseMaskForCond) 1603 return Cost; 1604 1605 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1606 1607 Cost += thisT()->getReplicationShuffleCost( 1608 I8Type, Factor, NumSubElts, 1609 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, 1610 CostKind); 1611 1612 // The Gaps mask is invariant and created outside the loop, therefore the 1613 // cost of creating it is not accounted for here. However if we have both 1614 // a MaskForGaps and some other mask that guards the execution of the 1615 // memory access, we need to account for the cost of And-ing the two masks 1616 // inside the loop. 
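// E.g. %combined.mask = and <12 x i1> %block.mask, %gaps.mask
// (the And below is costed on an <NumElts x i8> stand-in for the i1 mask
// type, matching the replication shuffle above).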
1617 if (UseMaskForGaps) { 1618 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1619 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1620 CostKind); 1621 } 1622 1623 return Cost; 1624 } 1625 1626 /// Get intrinsic cost based on arguments. 1627 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1628 TTI::TargetCostKind CostKind) { 1629 // Check for generically free intrinsics. 1630 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1631 return 0; 1632 1633 // Assume that target intrinsics are cheap. 1634 Intrinsic::ID IID = ICA.getID(); 1635 if (Intrinsic::isTargetIntrinsic(IID)) 1636 return TargetTransformInfo::TCC_Basic; 1637 1638 // VP Intrinsics should have the same cost as their non-vp counterpart. 1639 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp 1640 // counterpart when the vector length argument is smaller than the maximum 1641 // vector length. 1642 // TODO: Support other kinds of VPIntrinsics 1643 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) { 1644 std::optional<unsigned> FOp = 1645 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); 1646 if (FOp) { 1647 if (ICA.getID() == Intrinsic::vp_load) { 1648 Align Alignment; 1649 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1650 Alignment = VPI->getPointerAlignment().valueOrOne(); 1651 unsigned AS = 0; 1652 if (ICA.getArgTypes().size() > 1) 1653 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0])) 1654 AS = PtrTy->getAddressSpace(); 1655 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment, 1656 AS, CostKind); 1657 } 1658 if (ICA.getID() == Intrinsic::vp_store) { 1659 Align Alignment; 1660 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1661 Alignment = VPI->getPointerAlignment().valueOrOne(); 1662 unsigned AS = 0; 1663 if (ICA.getArgTypes().size() >= 2) 1664 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1])) 1665 AS = PtrTy->getAddressSpace(); 1666 return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment, 1667 AS, CostKind); 1668 } 1669 if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) { 1670 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(), 1671 CostKind); 1672 } 1673 if (VPCastIntrinsic::isVPCast(ICA.getID())) { 1674 return thisT()->getCastInstrCost( 1675 *FOp, ICA.getReturnType(), ICA.getArgTypes()[0], 1676 TTI::CastContextHint::None, CostKind); 1677 } 1678 if (VPCmpIntrinsic::isVPCmp(ICA.getID())) { 1679 // We can only handle vp_cmp intrinsics with underlying instructions. 1680 if (ICA.getInst()) { 1681 assert(FOp); 1682 auto *UI = cast<VPCmpIntrinsic>(ICA.getInst()); 1683 return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0], 1684 ICA.getReturnType(), 1685 UI->getPredicate(), CostKind); 1686 } 1687 } 1688 } 1689 1690 std::optional<Intrinsic::ID> FID = 1691 VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID()); 1692 if (FID) { 1693 // Non-vp version will have same arg types except mask and vector 1694 // length. 1695 assert(ICA.getArgTypes().size() >= 2 && 1696 "Expected VPIntrinsic to have Mask and Vector Length args and " 1697 "types"); 1698 ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2); 1699 1700 // VPReduction intrinsics have a start value argument that their non-vp 1701 // counterparts do not have, except for the fadd and fmul non-vp 1702 // counterpart. 
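// E.g. llvm.vp.reduce.add(i32 %start, <4 x i32> %v, %mask, %evl) maps to
// llvm.vector.reduce.add(<4 x i32> %v), dropping the start value, whereas
// llvm.vp.reduce.fadd keeps it because llvm.vector.reduce.fadd also takes
// a start value.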
1703 if (VPReductionIntrinsic::isVPReduction(ICA.getID()) && 1704 *FID != Intrinsic::vector_reduce_fadd && 1705 *FID != Intrinsic::vector_reduce_fmul) 1706 NewTys = NewTys.drop_front(); 1707 1708 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys, 1709 ICA.getFlags()); 1710 return thisT()->getIntrinsicInstrCost(NewICA, CostKind); 1711 } 1712 } 1713 1714 if (ICA.isTypeBasedOnly()) 1715 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1716 1717 Type *RetTy = ICA.getReturnType(); 1718 1719 ElementCount RetVF = 1720 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1721 : ElementCount::getFixed(1)); 1722 const IntrinsicInst *I = ICA.getInst(); 1723 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1724 FastMathFlags FMF = ICA.getFlags(); 1725 switch (IID) { 1726 default: 1727 break; 1728 1729 case Intrinsic::powi: 1730 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) { 1731 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); 1732 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), 1733 ShouldOptForSize)) { 1734 // The cost is modeled on the expansion performed by ExpandPowI in 1735 // SelectionDAGBuilder. 1736 APInt Exponent = RHSC->getValue().abs(); 1737 unsigned ActiveBits = Exponent.getActiveBits(); 1738 unsigned PopCount = Exponent.popcount(); 1739 InstructionCost Cost = (ActiveBits + PopCount - 2) * 1740 thisT()->getArithmeticInstrCost( 1741 Instruction::FMul, RetTy, CostKind); 1742 if (RHSC->isNegative()) 1743 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, 1744 CostKind); 1745 return Cost; 1746 } 1747 } 1748 break; 1749 case Intrinsic::cttz: 1750 // FIXME: If necessary, this should go in target-specific overrides. 1751 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) 1752 return TargetTransformInfo::TCC_Basic; 1753 break; 1754 1755 case Intrinsic::ctlz: 1756 // FIXME: If necessary, this should go in target-specific overrides. 
1757 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) 1758 return TargetTransformInfo::TCC_Basic; 1759 break; 1760 1761 case Intrinsic::memcpy: 1762 return thisT()->getMemcpyCost(ICA.getInst()); 1763 1764 case Intrinsic::masked_scatter: { 1765 const Value *Mask = Args[3]; 1766 bool VarMask = !isa<Constant>(Mask); 1767 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1768 return thisT()->getGatherScatterOpCost(Instruction::Store, 1769 ICA.getArgTypes()[0], Args[1], 1770 VarMask, Alignment, CostKind, I); 1771 } 1772 case Intrinsic::masked_gather: { 1773 const Value *Mask = Args[2]; 1774 bool VarMask = !isa<Constant>(Mask); 1775 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1776 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1777 VarMask, Alignment, CostKind, I); 1778 } 1779 case Intrinsic::experimental_vp_strided_store: { 1780 const Value *Data = Args[0]; 1781 const Value *Ptr = Args[1]; 1782 const Value *Mask = Args[3]; 1783 const Value *EVL = Args[4]; 1784 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1785 Type *EltTy = cast<VectorType>(Data->getType())->getElementType(); 1786 Align Alignment = 1787 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy)); 1788 return thisT()->getStridedMemoryOpCost(Instruction::Store, 1789 Data->getType(), Ptr, VarMask, 1790 Alignment, CostKind, I); 1791 } 1792 case Intrinsic::experimental_vp_strided_load: { 1793 const Value *Ptr = Args[0]; 1794 const Value *Mask = Args[2]; 1795 const Value *EVL = Args[3]; 1796 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1797 Type *EltTy = cast<VectorType>(RetTy)->getElementType(); 1798 Align Alignment = 1799 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy)); 1800 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr, 1801 VarMask, Alignment, CostKind, I); 1802 } 1803 case Intrinsic::stepvector: { 1804 if (isa<ScalableVectorType>(RetTy)) 1805 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1806 // The cost of materialising a constant integer vector. 
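// E.g. a fixed-length llvm.stepvector.v4i32 is simply the constant
// <i32 0, i32 1, i32 2, i32 3>.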
1807 return TargetTransformInfo::TCC_Basic; 1808 } 1809 case Intrinsic::vector_extract: { 1810 // FIXME: Handle case where a scalable vector is extracted from a scalable 1811 // vector 1812 if (isa<ScalableVectorType>(RetTy)) 1813 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1814 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1815 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, 1816 cast<VectorType>(Args[0]->getType()), {}, 1817 CostKind, Index, cast<VectorType>(RetTy)); 1818 } 1819 case Intrinsic::vector_insert: { 1820 // FIXME: Handle case where a scalable vector is inserted into a scalable 1821 // vector 1822 if (isa<ScalableVectorType>(Args[1]->getType())) 1823 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1824 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1825 return thisT()->getShuffleCost( 1826 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {}, 1827 CostKind, Index, cast<VectorType>(Args[1]->getType())); 1828 } 1829 case Intrinsic::vector_reverse: { 1830 return thisT()->getShuffleCost(TTI::SK_Reverse, 1831 cast<VectorType>(Args[0]->getType()), {}, 1832 CostKind, 0, cast<VectorType>(RetTy)); 1833 } 1834 case Intrinsic::vector_splice: { 1835 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1836 return thisT()->getShuffleCost(TTI::SK_Splice, 1837 cast<VectorType>(Args[0]->getType()), {}, 1838 CostKind, Index, cast<VectorType>(RetTy)); 1839 } 1840 case Intrinsic::vector_reduce_add: 1841 case Intrinsic::vector_reduce_mul: 1842 case Intrinsic::vector_reduce_and: 1843 case Intrinsic::vector_reduce_or: 1844 case Intrinsic::vector_reduce_xor: 1845 case Intrinsic::vector_reduce_smax: 1846 case Intrinsic::vector_reduce_smin: 1847 case Intrinsic::vector_reduce_fmax: 1848 case Intrinsic::vector_reduce_fmin: 1849 case Intrinsic::vector_reduce_fmaximum: 1850 case Intrinsic::vector_reduce_fminimum: 1851 case Intrinsic::vector_reduce_umax: 1852 case Intrinsic::vector_reduce_umin: { 1853 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1854 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1855 } 1856 case Intrinsic::vector_reduce_fadd: 1857 case Intrinsic::vector_reduce_fmul: { 1858 IntrinsicCostAttributes Attrs( 1859 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1860 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1861 } 1862 case Intrinsic::fshl: 1863 case Intrinsic::fshr: { 1864 const Value *X = Args[0]; 1865 const Value *Y = Args[1]; 1866 const Value *Z = Args[2]; 1867 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X); 1868 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y); 1869 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z); 1870 const TTI::OperandValueInfo OpInfoBW = 1871 {TTI::OK_UniformConstantValue, 1872 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? 
TTI::OP_PowerOf2
1873 : TTI::OP_None};
1874
1875 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1876 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1877 InstructionCost Cost = 0;
1878 Cost +=
1879 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1880 Cost +=
1881 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1882 Cost += thisT()->getArithmeticInstrCost(
1883 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1884 {OpInfoZ.Kind, TTI::OP_None});
1885 Cost += thisT()->getArithmeticInstrCost(
1886 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1887 {OpInfoZ.Kind, TTI::OP_None});
1888 // Non-constant shift amounts require a modulo.
1889 if (!OpInfoZ.isConstant())
1890 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1891 CostKind, OpInfoZ, OpInfoBW);
1892 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1893 if (X != Y) {
1894 Type *CondTy = RetTy->getWithNewBitWidth(1);
1895 Cost +=
1896 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1897 CmpInst::ICMP_EQ, CostKind);
1898 Cost +=
1899 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1900 CmpInst::ICMP_EQ, CostKind);
1901 }
1902 return Cost;
1903 }
1904 case Intrinsic::get_active_lane_mask: {
1905 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1906 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1907
1908 // If we're not expanding the intrinsic then we assume this is cheap
1909 // to implement.
1910 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1911 return getTypeLegalizationCost(RetTy).first;
1912 }
1913
1914 // Create the expanded types that will be used to calculate the uadd_sat
1915 // operation.
1916 Type *ExpRetTy = VectorType::get(
1917 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1918 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1919 InstructionCost Cost =
1920 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1921 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1922 CmpInst::ICMP_ULT, CostKind);
1923 return Cost;
1924 }
1925 case Intrinsic::experimental_cttz_elts: {
1926 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1927
1928 // If we're not expanding the intrinsic then we assume this is cheap
1929 // to implement.
1930 if (!getTLI()->shouldExpandCttzElements(ArgType))
1931 return getTypeLegalizationCost(RetTy).first;
1932
1933 // TODO: The costs below reflect the expansion code in
1934 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1935 // favour of compile time.
1936
1937 // Find the smallest "sensible" element type to use for the expansion.
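// The expansion modelled below is roughly (N = number of lanes, names
// illustrative):
//   %step = stepvector                 ; <0, 1, ..., N-1>
//   %dist = sub (splat N), %step       ; <N, N-1, ..., 1>
//   %keep = and %dist, (sext %mask)    ; keep distances of set lanes
//   %max  = vector.reduce.umax %keep   ; N - index of first set lane
//   %res  = sub N, %max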
1938 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1939 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1940 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1941 VScaleRange = getVScaleRange(I->getCaller(), 64);
1942
1943 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1944 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1945 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1946
1947 // Create the new vector type & get the vector length
1948 Type *NewVecTy = VectorType::get(
1949 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1950
1951 IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
1952 FMF);
1953 InstructionCost Cost =
1954 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1955
1956 Cost +=
1957 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1958 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1959 Args[0]->getType(),
1960 TTI::CastContextHint::None, CostKind);
1961 Cost +=
1962 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1963
1964 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1965 NewEltTy, NewVecTy, FMF, I, 1);
1966 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1967 Cost +=
1968 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1969
1970 return Cost;
1971 }
1972 case Intrinsic::experimental_vector_match:
1973 return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1974 }
1975
1976 // Assume that we need to scalarize this intrinsic.
1977 // Compute the scalarization overhead based on Args for a vector
1978 // intrinsic.
1979 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1980 if (RetVF.isVector() && !RetVF.isScalable()) {
1981 ScalarizationCost = 0;
1982 if (!RetTy->isVoidTy())
1983 ScalarizationCost += getScalarizationOverhead(
1984 cast<VectorType>(RetTy),
1985 /*Insert*/ true, /*Extract*/ false, CostKind);
1986 ScalarizationCost +=
1987 getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1988 }
1989
1990 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1991 ScalarizationCost);
1992 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1993 }
1994
1995 /// Get intrinsic cost based on argument types.
1996 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1997 /// cost of scalarizing the arguments and the return value will be computed
1998 /// based on types.
1999 InstructionCost
2000 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2001 TTI::TargetCostKind CostKind) {
2002 Intrinsic::ID IID = ICA.getID();
2003 Type *RetTy = ICA.getReturnType();
2004 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2005 FastMathFlags FMF = ICA.getFlags();
2006 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2007 bool SkipScalarizationCost = ICA.skipScalarizationCost();
2008
2009 VectorType *VecOpTy = nullptr;
2010 if (!Tys.empty()) {
2011 // The vector reduction operand is operand 0 except for fadd/fmul.
2012 // Their operand 0 is a scalar start value, so the vector op is operand 1.
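// E.g. llvm.vector.reduce.fadd(float %start, <4 x float> %v) has the vector
// operand at Tys[1], while llvm.vector.reduce.add(<4 x i32> %v) has it at
// Tys[0].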
2013 unsigned VecTyIndex = 0; 2014 if (IID == Intrinsic::vector_reduce_fadd || 2015 IID == Intrinsic::vector_reduce_fmul) 2016 VecTyIndex = 1; 2017 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); 2018 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); 2019 } 2020 2021 // Library call cost - other than size, make it expensive. 2022 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; 2023 unsigned ISD = 0; 2024 switch (IID) { 2025 default: { 2026 // Scalable vectors cannot be scalarized, so return Invalid. 2027 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2028 return isa<ScalableVectorType>(Ty); 2029 })) 2030 return InstructionCost::getInvalid(); 2031 2032 // Assume that we need to scalarize this intrinsic. 2033 InstructionCost ScalarizationCost = 2034 SkipScalarizationCost ? ScalarizationCostPassed : 0; 2035 unsigned ScalarCalls = 1; 2036 Type *ScalarRetTy = RetTy; 2037 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2038 if (!SkipScalarizationCost) 2039 ScalarizationCost = getScalarizationOverhead( 2040 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); 2041 ScalarCalls = std::max(ScalarCalls, 2042 cast<FixedVectorType>(RetVTy)->getNumElements()); 2043 ScalarRetTy = RetTy->getScalarType(); 2044 } 2045 SmallVector<Type *, 4> ScalarTys; 2046 for (Type *Ty : Tys) { 2047 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 2048 if (!SkipScalarizationCost) 2049 ScalarizationCost += getScalarizationOverhead( 2050 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 2051 ScalarCalls = std::max(ScalarCalls, 2052 cast<FixedVectorType>(VTy)->getNumElements()); 2053 Ty = Ty->getScalarType(); 2054 } 2055 ScalarTys.push_back(Ty); 2056 } 2057 if (ScalarCalls == 1) 2058 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 2059 2060 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 2061 InstructionCost ScalarCost = 2062 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 2063 2064 return ScalarCalls * ScalarCost + ScalarizationCost; 2065 } 2066 // Look for intrinsics that can be lowered directly or turned into a scalar 2067 // intrinsic call. 
2068 case Intrinsic::sqrt: 2069 ISD = ISD::FSQRT; 2070 break; 2071 case Intrinsic::sin: 2072 ISD = ISD::FSIN; 2073 break; 2074 case Intrinsic::cos: 2075 ISD = ISD::FCOS; 2076 break; 2077 case Intrinsic::sincos: 2078 ISD = ISD::FSINCOS; 2079 break; 2080 case Intrinsic::tan: 2081 ISD = ISD::FTAN; 2082 break; 2083 case Intrinsic::asin: 2084 ISD = ISD::FASIN; 2085 break; 2086 case Intrinsic::acos: 2087 ISD = ISD::FACOS; 2088 break; 2089 case Intrinsic::atan: 2090 ISD = ISD::FATAN; 2091 break; 2092 case Intrinsic::atan2: 2093 ISD = ISD::FATAN2; 2094 break; 2095 case Intrinsic::sinh: 2096 ISD = ISD::FSINH; 2097 break; 2098 case Intrinsic::cosh: 2099 ISD = ISD::FCOSH; 2100 break; 2101 case Intrinsic::tanh: 2102 ISD = ISD::FTANH; 2103 break; 2104 case Intrinsic::exp: 2105 ISD = ISD::FEXP; 2106 break; 2107 case Intrinsic::exp2: 2108 ISD = ISD::FEXP2; 2109 break; 2110 case Intrinsic::exp10: 2111 ISD = ISD::FEXP10; 2112 break; 2113 case Intrinsic::log: 2114 ISD = ISD::FLOG; 2115 break; 2116 case Intrinsic::log10: 2117 ISD = ISD::FLOG10; 2118 break; 2119 case Intrinsic::log2: 2120 ISD = ISD::FLOG2; 2121 break; 2122 case Intrinsic::fabs: 2123 ISD = ISD::FABS; 2124 break; 2125 case Intrinsic::canonicalize: 2126 ISD = ISD::FCANONICALIZE; 2127 break; 2128 case Intrinsic::minnum: 2129 ISD = ISD::FMINNUM; 2130 break; 2131 case Intrinsic::maxnum: 2132 ISD = ISD::FMAXNUM; 2133 break; 2134 case Intrinsic::minimum: 2135 ISD = ISD::FMINIMUM; 2136 break; 2137 case Intrinsic::maximum: 2138 ISD = ISD::FMAXIMUM; 2139 break; 2140 case Intrinsic::minimumnum: 2141 ISD = ISD::FMINIMUMNUM; 2142 break; 2143 case Intrinsic::maximumnum: 2144 ISD = ISD::FMAXIMUMNUM; 2145 break; 2146 case Intrinsic::copysign: 2147 ISD = ISD::FCOPYSIGN; 2148 break; 2149 case Intrinsic::floor: 2150 ISD = ISD::FFLOOR; 2151 break; 2152 case Intrinsic::ceil: 2153 ISD = ISD::FCEIL; 2154 break; 2155 case Intrinsic::trunc: 2156 ISD = ISD::FTRUNC; 2157 break; 2158 case Intrinsic::nearbyint: 2159 ISD = ISD::FNEARBYINT; 2160 break; 2161 case Intrinsic::rint: 2162 ISD = ISD::FRINT; 2163 break; 2164 case Intrinsic::lrint: 2165 ISD = ISD::LRINT; 2166 break; 2167 case Intrinsic::llrint: 2168 ISD = ISD::LLRINT; 2169 break; 2170 case Intrinsic::round: 2171 ISD = ISD::FROUND; 2172 break; 2173 case Intrinsic::roundeven: 2174 ISD = ISD::FROUNDEVEN; 2175 break; 2176 case Intrinsic::pow: 2177 ISD = ISD::FPOW; 2178 break; 2179 case Intrinsic::fma: 2180 ISD = ISD::FMA; 2181 break; 2182 case Intrinsic::fmuladd: 2183 ISD = ISD::FMA; 2184 break; 2185 case Intrinsic::experimental_constrained_fmuladd: 2186 ISD = ISD::STRICT_FMA; 2187 break; 2188 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 
2189 case Intrinsic::lifetime_start: 2190 case Intrinsic::lifetime_end: 2191 case Intrinsic::sideeffect: 2192 case Intrinsic::pseudoprobe: 2193 case Intrinsic::arithmetic_fence: 2194 return 0; 2195 case Intrinsic::masked_store: { 2196 Type *Ty = Tys[0]; 2197 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2198 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 2199 CostKind); 2200 } 2201 case Intrinsic::masked_load: { 2202 Type *Ty = RetTy; 2203 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2204 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 2205 CostKind); 2206 } 2207 case Intrinsic::vector_reduce_add: 2208 case Intrinsic::vector_reduce_mul: 2209 case Intrinsic::vector_reduce_and: 2210 case Intrinsic::vector_reduce_or: 2211 case Intrinsic::vector_reduce_xor: 2212 return thisT()->getArithmeticReductionCost( 2213 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt, 2214 CostKind); 2215 case Intrinsic::vector_reduce_fadd: 2216 case Intrinsic::vector_reduce_fmul: 2217 return thisT()->getArithmeticReductionCost( 2218 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind); 2219 case Intrinsic::vector_reduce_smax: 2220 case Intrinsic::vector_reduce_smin: 2221 case Intrinsic::vector_reduce_umax: 2222 case Intrinsic::vector_reduce_umin: 2223 case Intrinsic::vector_reduce_fmax: 2224 case Intrinsic::vector_reduce_fmin: 2225 case Intrinsic::vector_reduce_fmaximum: 2226 case Intrinsic::vector_reduce_fminimum: 2227 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID), 2228 VecOpTy, ICA.getFlags(), CostKind); 2229 case Intrinsic::experimental_vector_match: { 2230 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]); 2231 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]); 2232 unsigned SearchSize = NeedleTy->getNumElements(); 2233 2234 // If we're not expanding the intrinsic then we assume this is cheap to 2235 // implement. 2236 EVT SearchVT = getTLI()->getValueType(DL, SearchTy); 2237 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) 2238 return getTypeLegalizationCost(RetTy).first; 2239 2240 // Approximate the cost based on the expansion code in 2241 // SelectionDAGBuilder. 
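// Roughly, per element of the needle: extract the needle element, splat it
// across a search-sized vector (insert + broadcast shuffle), compare it
// with the search vector, and OR the result into an accumulator; finally
// AND with the mask operand.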
2242 InstructionCost Cost = 0; 2243 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy, 2244 CostKind, 1, nullptr, nullptr); 2245 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy, 2246 CostKind, 0, nullptr, nullptr); 2247 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt, 2248 CostKind, 0, nullptr); 2249 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy, 2250 CmpInst::ICMP_EQ, CostKind); 2251 Cost += 2252 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 2253 Cost *= SearchSize; 2254 Cost += 2255 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind); 2256 return Cost; 2257 } 2258 case Intrinsic::abs: 2259 ISD = ISD::ABS; 2260 break; 2261 case Intrinsic::fshl: 2262 ISD = ISD::FSHL; 2263 break; 2264 case Intrinsic::fshr: 2265 ISD = ISD::FSHR; 2266 break; 2267 case Intrinsic::smax: 2268 ISD = ISD::SMAX; 2269 break; 2270 case Intrinsic::smin: 2271 ISD = ISD::SMIN; 2272 break; 2273 case Intrinsic::umax: 2274 ISD = ISD::UMAX; 2275 break; 2276 case Intrinsic::umin: 2277 ISD = ISD::UMIN; 2278 break; 2279 case Intrinsic::sadd_sat: 2280 ISD = ISD::SADDSAT; 2281 break; 2282 case Intrinsic::ssub_sat: 2283 ISD = ISD::SSUBSAT; 2284 break; 2285 case Intrinsic::uadd_sat: 2286 ISD = ISD::UADDSAT; 2287 break; 2288 case Intrinsic::usub_sat: 2289 ISD = ISD::USUBSAT; 2290 break; 2291 case Intrinsic::smul_fix: 2292 ISD = ISD::SMULFIX; 2293 break; 2294 case Intrinsic::umul_fix: 2295 ISD = ISD::UMULFIX; 2296 break; 2297 case Intrinsic::sadd_with_overflow: 2298 ISD = ISD::SADDO; 2299 break; 2300 case Intrinsic::ssub_with_overflow: 2301 ISD = ISD::SSUBO; 2302 break; 2303 case Intrinsic::uadd_with_overflow: 2304 ISD = ISD::UADDO; 2305 break; 2306 case Intrinsic::usub_with_overflow: 2307 ISD = ISD::USUBO; 2308 break; 2309 case Intrinsic::smul_with_overflow: 2310 ISD = ISD::SMULO; 2311 break; 2312 case Intrinsic::umul_with_overflow: 2313 ISD = ISD::UMULO; 2314 break; 2315 case Intrinsic::fptosi_sat: 2316 ISD = ISD::FP_TO_SINT_SAT; 2317 break; 2318 case Intrinsic::fptoui_sat: 2319 ISD = ISD::FP_TO_UINT_SAT; 2320 break; 2321 case Intrinsic::ctpop: 2322 ISD = ISD::CTPOP; 2323 // In case of legalization use TCC_Expensive. This is cheaper than a 2324 // library call but still not a cheap instruction. 2325 SingleCallCost = TargetTransformInfo::TCC_Expensive; 2326 break; 2327 case Intrinsic::ctlz: 2328 ISD = ISD::CTLZ; 2329 break; 2330 case Intrinsic::cttz: 2331 ISD = ISD::CTTZ; 2332 break; 2333 case Intrinsic::bswap: 2334 ISD = ISD::BSWAP; 2335 break; 2336 case Intrinsic::bitreverse: 2337 ISD = ISD::BITREVERSE; 2338 break; 2339 case Intrinsic::ucmp: 2340 ISD = ISD::UCMP; 2341 break; 2342 case Intrinsic::scmp: 2343 ISD = ISD::SCMP; 2344 break; 2345 } 2346 2347 auto *ST = dyn_cast<StructType>(RetTy); 2348 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy; 2349 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy); 2350 2351 const TargetLoweringBase *TLI = getTLI(); 2352 2353 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 2354 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 2355 TLI->isFAbsFree(LT.second)) { 2356 return 0; 2357 } 2358 2359 // The operation is legal. Assume it costs 1. 2360 // If the type is split to multiple registers, assume that there is some 2361 // overhead to this. 2362 // TODO: Once we have extract/insert subvector cost we need to use them. 
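// For illustration: if the return type splits into two legal registers
// (LT.first == 2), the legal-operation cost below comes out as 2 * 2 = 4.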
2363 if (LT.first > 1) 2364 return (LT.first * 2); 2365 else 2366 return (LT.first * 1); 2367 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 2368 // If the operation is custom lowered then assume 2369 // that the code is twice as expensive. 2370 return (LT.first * 2); 2371 } 2372 2373 switch (IID) { 2374 case Intrinsic::fmuladd: { 2375 // If we can't lower fmuladd into an FMA estimate the cost as a floating 2376 // point mul followed by an add. 2377 2378 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 2379 CostKind) + 2380 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 2381 CostKind); 2382 } 2383 case Intrinsic::experimental_constrained_fmuladd: { 2384 IntrinsicCostAttributes FMulAttrs( 2385 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 2386 IntrinsicCostAttributes FAddAttrs( 2387 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 2388 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 2389 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 2390 } 2391 case Intrinsic::smin: 2392 case Intrinsic::smax: 2393 case Intrinsic::umin: 2394 case Intrinsic::umax: { 2395 // minmax(X,Y) = select(icmp(X,Y),X,Y) 2396 Type *CondTy = RetTy->getWithNewBitWidth(1); 2397 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 2398 CmpInst::Predicate Pred = 2399 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 2400 InstructionCost Cost = 0; 2401 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2402 Pred, CostKind); 2403 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2404 Pred, CostKind); 2405 return Cost; 2406 } 2407 case Intrinsic::sadd_with_overflow: 2408 case Intrinsic::ssub_with_overflow: { 2409 Type *SumTy = RetTy->getContainedType(0); 2410 Type *OverflowTy = RetTy->getContainedType(1); 2411 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 2412 ? BinaryOperator::Add 2413 : BinaryOperator::Sub; 2414 2415 // Add: 2416 // Overflow -> (Result < LHS) ^ (RHS < 0) 2417 // Sub: 2418 // Overflow -> (Result < LHS) ^ (RHS > 0) 2419 InstructionCost Cost = 0; 2420 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2421 Cost += 2422 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy, 2423 CmpInst::ICMP_SGT, CostKind); 2424 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 2425 CostKind); 2426 return Cost; 2427 } 2428 case Intrinsic::uadd_with_overflow: 2429 case Intrinsic::usub_with_overflow: { 2430 Type *SumTy = RetTy->getContainedType(0); 2431 Type *OverflowTy = RetTy->getContainedType(1); 2432 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 2433 ? BinaryOperator::Add 2434 : BinaryOperator::Sub; 2435 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 2436 ? CmpInst::ICMP_ULT 2437 : CmpInst::ICMP_UGT; 2438 2439 InstructionCost Cost = 0; 2440 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2441 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, 2442 OverflowTy, Pred, CostKind); 2443 return Cost; 2444 } 2445 case Intrinsic::smul_with_overflow: 2446 case Intrinsic::umul_with_overflow: { 2447 Type *MulTy = RetTy->getContainedType(0); 2448 Type *OverflowTy = RetTy->getContainedType(1); 2449 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 2450 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 2451 bool IsSigned = IID == Intrinsic::smul_with_overflow; 2452 2453 unsigned ExtOp = IsSigned ? 
Instruction::SExt : Instruction::ZExt; 2454 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2455 2456 InstructionCost Cost = 0; 2457 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 2458 Cost += 2459 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2460 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 2461 CCH, CostKind); 2462 Cost += thisT()->getArithmeticInstrCost( 2463 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 2464 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2465 2466 if (IsSigned) 2467 Cost += thisT()->getArithmeticInstrCost( 2468 Instruction::AShr, MulTy, CostKind, 2469 {TTI::OK_AnyValue, TTI::OP_None}, 2470 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2471 2472 Cost += thisT()->getCmpSelInstrCost( 2473 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 2474 return Cost; 2475 } 2476 case Intrinsic::sadd_sat: 2477 case Intrinsic::ssub_sat: { 2478 // Assume a default expansion. 2479 Type *CondTy = RetTy->getWithNewBitWidth(1); 2480 2481 Type *OpTy = StructType::create({RetTy, CondTy}); 2482 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 2483 ? Intrinsic::sadd_with_overflow 2484 : Intrinsic::ssub_with_overflow; 2485 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2486 2487 // SatMax -> Overflow && SumDiff < 0 2488 // SatMin -> Overflow && SumDiff >= 0 2489 InstructionCost Cost = 0; 2490 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2491 nullptr, ScalarizationCostPassed); 2492 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2493 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2494 Pred, CostKind); 2495 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 2496 CondTy, Pred, CostKind); 2497 return Cost; 2498 } 2499 case Intrinsic::uadd_sat: 2500 case Intrinsic::usub_sat: { 2501 Type *CondTy = RetTy->getWithNewBitWidth(1); 2502 2503 Type *OpTy = StructType::create({RetTy, CondTy}); 2504 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 2505 ? Intrinsic::uadd_with_overflow 2506 : Intrinsic::usub_with_overflow; 2507 2508 InstructionCost Cost = 0; 2509 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2510 nullptr, ScalarizationCostPassed); 2511 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2512 Cost += 2513 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2514 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2515 return Cost; 2516 } 2517 case Intrinsic::smul_fix: 2518 case Intrinsic::umul_fix: { 2519 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 2520 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 2521 2522 unsigned ExtOp = 2523 IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; 2524 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2525 2526 InstructionCost Cost = 0; 2527 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 2528 Cost += 2529 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2530 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 2531 CCH, CostKind); 2532 Cost += thisT()->getArithmeticInstrCost( 2533 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 2534 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2535 Cost += thisT()->getArithmeticInstrCost( 2536 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 2537 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2538 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 2539 return Cost; 2540 } 2541 case Intrinsic::abs: { 2542 // abs(X) = select(icmp(X,0),X,sub(0,X)) 2543 Type *CondTy = RetTy->getWithNewBitWidth(1); 2544 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2545 InstructionCost Cost = 0; 2546 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2547 Pred, CostKind); 2548 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2549 Pred, CostKind); 2550 // TODO: Should we add an OperandValueProperties::OP_Zero property? 2551 Cost += thisT()->getArithmeticInstrCost( 2552 BinaryOperator::Sub, RetTy, CostKind, 2553 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2554 return Cost; 2555 } 2556 case Intrinsic::fshl: 2557 case Intrinsic::fshr: { 2558 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 2559 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 2560 Type *CondTy = RetTy->getWithNewBitWidth(1); 2561 InstructionCost Cost = 0; 2562 Cost += 2563 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 2564 Cost += 2565 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); 2566 Cost += 2567 thisT()->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind); 2568 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::LShr, RetTy, 2569 CostKind); 2570 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, 2571 CostKind); 2572 // Shift-by-zero handling. 2573 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2574 CmpInst::ICMP_EQ, CostKind); 2575 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2576 CmpInst::ICMP_EQ, CostKind); 2577 return Cost; 2578 } 2579 case Intrinsic::fptosi_sat: 2580 case Intrinsic::fptoui_sat: { 2581 if (Tys.empty()) 2582 break; 2583 Type *FromTy = Tys[0]; 2584 bool IsSigned = IID == Intrinsic::fptosi_sat; 2585 2586 InstructionCost Cost = 0; 2587 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, 2588 {FromTy, FromTy}); 2589 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); 2590 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, 2591 {FromTy, FromTy}); 2592 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); 2593 Cost += thisT()->getCastInstrCost( 2594 IsSigned ? 
Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, 2595 TTI::CastContextHint::None, CostKind); 2596 if (IsSigned) { 2597 Type *CondTy = RetTy->getWithNewBitWidth(1); 2598 Cost += thisT()->getCmpSelInstrCost( 2599 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2600 Cost += thisT()->getCmpSelInstrCost( 2601 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2602 } 2603 return Cost; 2604 } 2605 case Intrinsic::ucmp: 2606 case Intrinsic::scmp: { 2607 Type *CmpTy = Tys[0]; 2608 Type *CondTy = RetTy->getWithNewBitWidth(1); 2609 InstructionCost Cost = 2610 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy, 2611 CmpIntrinsic::getGTPredicate(IID), 2612 CostKind) + 2613 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy, 2614 CmpIntrinsic::getLTPredicate(IID), 2615 CostKind); 2616 2617 EVT VT = TLI->getValueType(DL, CmpTy, true); 2618 if (TLI->shouldExpandCmpUsingSelects(VT)) { 2619 // x < y ? -1 : (x > y ? 1 : 0) 2620 Cost += 2 * thisT()->getCmpSelInstrCost( 2621 BinaryOperator::Select, RetTy, CondTy, 2622 ICmpInst::BAD_ICMP_PREDICATE, CostKind); 2623 } else { 2624 // zext(x > y) - zext(x < y) 2625 Cost += 2626 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy, 2627 TTI::CastContextHint::None, CostKind); 2628 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, 2629 CostKind); 2630 } 2631 return Cost; 2632 } 2633 default: 2634 break; 2635 } 2636 2637 // Else, assume that we need to scalarize this intrinsic. For math builtins 2638 // this will emit a costly libcall, adding call overhead and spills. Make it 2639 // very expensive. 2640 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2641 // Scalable vectors cannot be scalarized, so return Invalid. 2642 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2643 return isa<ScalableVectorType>(Ty); 2644 })) 2645 return InstructionCost::getInvalid(); 2646 2647 InstructionCost ScalarizationCost = 2648 SkipScalarizationCost 2649 ? ScalarizationCostPassed 2650 : getScalarizationOverhead(RetVTy, /*Insert*/ true, 2651 /*Extract*/ false, CostKind); 2652 2653 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 2654 SmallVector<Type *, 4> ScalarTys; 2655 for (Type *Ty : Tys) { 2656 if (Ty->isVectorTy()) 2657 Ty = Ty->getScalarType(); 2658 ScalarTys.push_back(Ty); 2659 } 2660 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 2661 InstructionCost ScalarCost = 2662 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2663 for (Type *Ty : Tys) { 2664 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 2665 if (!ICA.skipScalarizationCost()) 2666 ScalarizationCost += getScalarizationOverhead( 2667 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 2668 ScalarCalls = std::max(ScalarCalls, 2669 cast<FixedVectorType>(VTy)->getNumElements()); 2670 } 2671 } 2672 return ScalarCalls * ScalarCost + ScalarizationCost; 2673 } 2674 2675 // This is going to be turned into a library call, make it expensive. 2676 return SingleCallCost; 2677 } 2678 2679 /// Compute a cost of the given call instruction. 2680 /// 2681 /// Compute the cost of calling function F with return type RetTy and 2682 /// argument types Tys. F might be nullptr, in this case the cost of an 2683 /// arbitrary call with the specified signature will be returned. 2684 /// This is used, for instance, when we estimate call of a vector 2685 /// counterpart of the given function. 2686 /// \param F Called function, might be nullptr. 
2687 /// \param RetTy Return value types.
2688 /// \param Tys Argument types.
2689 /// \returns The cost of Call instruction.
2690 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2691 ArrayRef<Type *> Tys,
2692 TTI::TargetCostKind CostKind) {
2693 return 10;
2694 }
2695
2696 unsigned getNumberOfParts(Type *Tp) {
2697 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2698 if (!LT.first.isValid())
2699 return 0;
2700 // Try to find actual number of parts for non-power-of-2 elements as
2701 // ceil(num-of-elements/num-of-subtype-elements).
2702 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
2703 FTp && LT.second.isFixedLengthVector() &&
2704 !has_single_bit(FTp->getNumElements())) {
2705 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
2706 EVT(LT.second).getTypeForEVT(Tp->getContext()));
2707 SubTp && SubTp->getElementType() == FTp->getElementType())
2708 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
2709 }
2710 return *LT.first.getValue();
2711 }
2712
2713 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2714 const SCEV *) {
2715 return 0;
2716 }
2717
2718 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2719 /// We're assuming that reduction operations are performed in the following way:
2720 ///
2721 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2722 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
2723 /// \----------------v-------------/ \----------v------------/
2724 /// n/2 elements n/2 elements
2725 /// %red1 = op <n x t> %val, <n x t> %val1
2726 /// After this operation we have a vector %red1 where only the first n/2
2727 /// elements are meaningful, the second n/2 elements are undefined and can be
2728 /// dropped. All other operations are actually working with the vector of
2729 /// length n/2, not n, though the real vector length is still n.
2730 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2731 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
2732 /// \----------------v-------------/ \----------v------------/
2733 /// n/4 elements 3*n/4 elements
2734 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2735 /// length n/2, the resulting vector has length n/4 etc.
2736 ///
2737 /// The cost model should take into account that the actual length of the
2738 /// vector is reduced on each iteration.
2739 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2740 TTI::TargetCostKind CostKind) {
2741 // Targets must implement a default value for the scalable case, since
2742 // we don't know how many lanes the vector has.
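// Worked example for the fixed-width path below, assuming 128-bit legal
// vectors: reducing <8 x i32> costs one extract-subvector shuffle plus an
// add on <4 x i32> to reach the legal width, then Log2(8) - 1 == 2 more
// permute-shuffle + add levels on <4 x i32>, plus a final extractelement.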
2743 if (isa<ScalableVectorType>(Ty)) 2744 return InstructionCost::getInvalid(); 2745 2746 Type *ScalarTy = Ty->getElementType(); 2747 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2748 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2749 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2750 NumVecElts >= 2) { 2751 // Or reduction for i1 is represented as: 2752 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2753 // %res = cmp ne iReduxWidth %val, 0 2754 // And reduction for i1 is represented as: 2755 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2756 // %res = cmp eq iReduxWidth %val, 11111 2757 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2758 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2759 TTI::CastContextHint::None, CostKind) + 2760 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2761 CmpInst::makeCmpResultType(ValTy), 2762 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2763 } 2764 unsigned NumReduxLevels = Log2_32(NumVecElts); 2765 InstructionCost ArithCost = 0; 2766 InstructionCost ShuffleCost = 0; 2767 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2768 unsigned LongVectorCount = 0; 2769 unsigned MVTLen = 2770 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 2771 while (NumVecElts > MVTLen) { 2772 NumVecElts /= 2; 2773 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2774 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, 2775 CostKind, NumVecElts, SubTy); 2776 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2777 Ty = SubTy; 2778 ++LongVectorCount; 2779 } 2780 2781 NumReduxLevels -= LongVectorCount; 2782 2783 // The minimal length of the vector is limited by the real length of vector 2784 // operations performed on the current platform. That's why several final 2785 // reduction operations are performed on the vectors with the same 2786 // architecture-dependent length. 2787 2788 // By default reductions need one shuffle per reduction level. 2789 ShuffleCost += 2790 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2791 {}, CostKind, 0, Ty); 2792 ArithCost += 2793 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2794 return ShuffleCost + ArithCost + 2795 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2796 CostKind, 0, nullptr, nullptr); 2797 } 2798 2799 /// Try to calculate the cost of performing strict (in-order) reductions, 2800 /// which involves doing a sequence of floating point additions in lane 2801 /// order, starting with an initial value. For example, consider a scalar 2802 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2803 /// 2804 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2805 /// 2806 /// %add1 = %InitVal + %v0 2807 /// %add2 = %add1 + %v1 2808 /// %add3 = %add2 + %v2 2809 /// %add4 = %add3 + %v3 2810 /// 2811 /// As a simple estimate we can say the cost of such a reduction is 4 times 2812 /// the cost of a scalar FP addition. We can only estimate the costs for 2813 /// fixed-width vectors here because for scalable vectors we do not know the 2814 /// runtime number of operations. 2815 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, 2816 TTI::TargetCostKind CostKind) { 2817 // Targets must implement a default value for the scalable case, since 2818 // we don't know how many lanes the vector has. 
2819 if (isa<ScalableVectorType>(Ty))
2820 return InstructionCost::getInvalid();
2821
2822 auto *VTy = cast<FixedVectorType>(Ty);
2823 InstructionCost ExtractCost = getScalarizationOverhead(
2824 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2825 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2826 Opcode, VTy->getElementType(), CostKind);
2827 ArithCost *= VTy->getNumElements();
2828
2829 return ExtractCost + ArithCost;
2830 }
2831
2832 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2833 std::optional<FastMathFlags> FMF,
2834 TTI::TargetCostKind CostKind) {
2835 assert(Ty && "Unknown reduction vector type");
2836 if (TTI::requiresOrderedReduction(FMF))
2837 return getOrderedReductionCost(Opcode, Ty, CostKind);
2838 return getTreeReductionCost(Opcode, Ty, CostKind);
2839 }
2840
2841 /// Try to calculate op costs for min/max reduction operations.
2842 /// \param IID Intrinsic ID of the min/max reduction to cost.
2843 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2844 FastMathFlags FMF,
2845 TTI::TargetCostKind CostKind) {
2846 // Targets must implement a default value for the scalable case, since
2847 // we don't know how many lanes the vector has.
2848 if (isa<ScalableVectorType>(Ty))
2849 return InstructionCost::getInvalid();
2850
2851 Type *ScalarTy = Ty->getElementType();
2852 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2853 unsigned NumReduxLevels = Log2_32(NumVecElts);
2854 InstructionCost MinMaxCost = 0;
2855 InstructionCost ShuffleCost = 0;
2856 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2857 unsigned LongVectorCount = 0;
2858 unsigned MVTLen =
2859 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2860 while (NumVecElts > MVTLen) {
2861 NumVecElts /= 2;
2862 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2863
2864 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2865 CostKind, NumVecElts, SubTy);
2866
2867 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2868 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2869 Ty = SubTy;
2870 ++LongVectorCount;
2871 }
2872
2873 NumReduxLevels -= LongVectorCount;
2874
2875 // The minimal length of the vector is limited by the real length of vector
2876 // operations performed on the current platform. That's why several final
2877 // reduction operations are performed on the vectors with the same
2878 // architecture-dependent length.
2879 ShuffleCost +=
2880 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2881 {}, CostKind, 0, Ty);
2882 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2883 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2884 // The last min/max should be in vector registers and we counted it above.
2885 // So we just need a single extractelement.
2886 return ShuffleCost + MinMaxCost +
2887 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2888 CostKind, 0, nullptr, nullptr);
2889 }
2890
2891 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2892 Type *ResTy, VectorType *Ty,
2893 FastMathFlags FMF,
2894 TTI::TargetCostKind CostKind) {
2895 if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
2896 FTy && IsUnsigned && Opcode == Instruction::Add &&
2897 FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
2898 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2899 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
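// E.g. vector_reduce_add(zext <8 x i1> %m to <8 x i32>) is costed as a
// bitcast of <8 x i1> %m to i8 plus a ctpop of that i8; the final
// ZExtOrTrunc to the result type is not costed separately here.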
2900 auto *IntTy = 2901 IntegerType::get(ResTy->getContext(), FTy->getNumElements()); 2902 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF); 2903 return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy, 2904 TTI::CastContextHint::None, CostKind) + 2905 thisT()->getIntrinsicInstrCost(ICA, CostKind); 2906 } 2907 // Without any native support, this is equivalent to the cost of 2908 // vecreduce.opcode(ext(Ty A)). 2909 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2910 InstructionCost RedCost = 2911 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind); 2912 InstructionCost ExtCost = thisT()->getCastInstrCost( 2913 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2914 TTI::CastContextHint::None, CostKind); 2915 2916 return RedCost + ExtCost; 2917 } 2918 2919 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, 2920 VectorType *Ty, 2921 TTI::TargetCostKind CostKind) { 2922 // Without any native support, this is equivalent to the cost of 2923 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or 2924 // vecreduce.add(mul(A, B)). 2925 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2926 InstructionCost RedCost = thisT()->getArithmeticReductionCost( 2927 Instruction::Add, ExtTy, std::nullopt, CostKind); 2928 InstructionCost ExtCost = thisT()->getCastInstrCost( 2929 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2930 TTI::CastContextHint::None, CostKind); 2931 2932 InstructionCost MulCost = 2933 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2934 2935 return RedCost + MulCost + 2 * ExtCost; 2936 } 2937 2938 InstructionCost getVectorSplitCost() { return 1; } 2939 2940 /// @} 2941 }; 2942 2943 /// Concrete BasicTTIImpl that can be used if no further customization 2944 /// is needed. 2945 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2946 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2947 2948 friend class BasicTTIImplBase<BasicTTIImpl>; 2949 2950 const TargetSubtargetInfo *ST; 2951 const TargetLoweringBase *TLI; 2952 2953 const TargetSubtargetInfo *getST() const { return ST; } 2954 const TargetLoweringBase *getTLI() const { return TLI; } 2955 2956 public: 2957 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2958 }; 2959 2960 } // end namespace llvm 2961 2962 #endif // LLVM_CODEGEN_BASICTTIIMPL_H 2963