//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative; for others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
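  // As an illustration, for two 4-element sources an interleave(2) shuffle
  // mask is <0, 4, 1, 5, 2, 6, 3, 7>, and the corresponding deinterleave masks
  // are <0, 2, 4, 6> and <1, 3, 5, 7>; both patterns are special-cased in
  // SK_PermuteSingleSrc below.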
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the element size is < ELEN, then interleave and deinterleave
        // shuffles of 2 vectors can be lowered into the following sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          //   vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          //   vwaddu.vv v10, v8, v9
          //   li a0, -1 (ignored)
          //   vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            //   vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * TLI->getLMULCost(LT.second);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost + TLI->getVRGatherVVCost(LT.second);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. We provide an accurate cost only for splits
      // where the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Example sequence:
    //   vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    //   vslidedown.vi v8, v9, 2
    return LT.first * TLI->getVSlideCost(LT.second);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    //   vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    //   vslideup.vi v8, v9, 2
    return LT.first * TLI->getVSlideCost(LT.second);
  case TTI::SK_Select: {
    // Example sequence:
    //   li a0, 90
    //   vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    //   vmv.s.x v0, a0
    //   vmerge.vvm v8, v9, v8, v0
    return LT.first * 3 * TLI->getLMULCost(LT.second);
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        //   andi a0, a0, 1
        //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        //   vmv.v.x v8, a0
        //   vmsne.vi v0, v8, 0
        return LT.first * TLI->getLMULCost(LT.second) * 3;
      }
      // Example sequence:
      //   vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      //   vmv.v.i v8, 0
      //   vmerge.vim v8, v8, 1, v0
      //   vmv.x.s a0, v8
      //   andi a0, a0, 1
      //   vmv.v.x v8, a0
      //   vmsne.vi v0, v8, 0
      return LT.first * TLI->getLMULCost(LT.second) * 6;
    }

    if (HasScalar) {
      // Example sequence:
      //   vmv.v.x v8, a0
      return LT.first * TLI->getLMULCost(LT.second);
    }

    // Example sequence:
    //   vrgather.vi v9, v8, 0
    return LT.first * TLI->getVRGatherVICost(LT.second);
  }
  case TTI::SK_Splice:
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    return 2 * LT.first * TLI->getVSlideCost(LT.second);
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    //   csrr a0, vlenb
    //   srli a0, a0, 3
    //   addi a0, a0, -1
    //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    //   vid.v v9
    //   vrsub.vx v10, v9, a0
    //   vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices.
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second);
    // Mask vectors additionally require an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store combined with a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat the whole thing as one (legal)
  // memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
// Entries are keyed on (intrinsic ID, legalized MVT) and looked up via
// CostTableLookup in getIntrinsicInstrCost, so only legal vector types need
// to appear here.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::v2f32, 9},
    {Intrinsic::floor, MVT::v4f32, 9},
    {Intrinsic::floor, MVT::v8f32, 9},
    {Intrinsic::floor, MVT::v16f32, 9},
    {Intrinsic::floor, MVT::nxv1f32, 9},
    {Intrinsic::floor, MVT::nxv2f32, 9},
    {Intrinsic::floor, MVT::nxv4f32, 9},
    {Intrinsic::floor, MVT::nxv8f32, 9},
    {Intrinsic::floor, MVT::nxv16f32, 9},
    {Intrinsic::floor, MVT::v2f64, 9},
    {Intrinsic::floor, MVT::v4f64, 9},
    {Intrinsic::floor, MVT::v8f64, 9},
    {Intrinsic::floor, MVT::v16f64, 9},
    {Intrinsic::floor, MVT::nxv1f64, 9},
    {Intrinsic::floor, MVT::nxv2f64, 9},
    {Intrinsic::floor, MVT::nxv4f64, 9},
    {Intrinsic::floor, MVT::nxv8f64, 9},
    {Intrinsic::ceil, MVT::v2f32, 9},
    {Intrinsic::ceil, MVT::v4f32, 9},
    {Intrinsic::ceil, MVT::v8f32, 9},
    {Intrinsic::ceil, MVT::v16f32, 9},
    {Intrinsic::ceil, MVT::nxv1f32, 9},
    {Intrinsic::ceil, MVT::nxv2f32, 9},
    {Intrinsic::ceil, MVT::nxv4f32, 9},
    {Intrinsic::ceil, MVT::nxv8f32, 9},
    {Intrinsic::ceil, MVT::nxv16f32, 9},
    {Intrinsic::ceil, MVT::v2f64, 9},
    {Intrinsic::ceil, MVT::v4f64, 9},
    {Intrinsic::ceil, MVT::v8f64, 9},
    {Intrinsic::ceil, MVT::v16f64, 9},
    {Intrinsic::ceil, MVT::nxv1f64, 9},
    {Intrinsic::ceil, MVT::nxv2f64, 9},
    {Intrinsic::ceil, MVT::nxv4f64, 9},
    {Intrinsic::ceil, MVT::nxv8f64, 9},
    {Intrinsic::trunc, MVT::v2f32, 7},
    {Intrinsic::trunc, MVT::v4f32, 7},
    {Intrinsic::trunc, MVT::v8f32, 7},
    {Intrinsic::trunc, MVT::v16f32, 7},
    {Intrinsic::trunc, MVT::nxv1f32, 7},
    {Intrinsic::trunc, MVT::nxv2f32, 7},
    {Intrinsic::trunc, MVT::nxv4f32, 7},
    {Intrinsic::trunc, MVT::nxv8f32, 7},
    {Intrinsic::trunc, MVT::nxv16f32, 7},
    {Intrinsic::trunc, MVT::v2f64, 7},
    {Intrinsic::trunc, MVT::v4f64, 7},
    {Intrinsic::trunc, MVT::v8f64, 7},
    {Intrinsic::trunc, MVT::v16f64, 7},
    {Intrinsic::trunc, MVT::nxv1f64, 7},
    {Intrinsic::trunc, MVT::nxv2f64, 7},
    {Intrinsic::trunc, MVT::nxv4f64, 7},
    {Intrinsic::trunc, MVT::nxv8f64, 7},
    {Intrinsic::round, MVT::v2f32, 9},
    {Intrinsic::round, MVT::v4f32, 9},
    {Intrinsic::round, MVT::v8f32, 9},
    {Intrinsic::round, MVT::v16f32, 9},
    {Intrinsic::round, MVT::nxv1f32, 9},
    {Intrinsic::round, MVT::nxv2f32, 9},
    {Intrinsic::round, MVT::nxv4f32, 9},
    {Intrinsic::round, MVT::nxv8f32, 9},
    {Intrinsic::round, MVT::nxv16f32, 9},
    {Intrinsic::round, MVT::v2f64, 9},
    {Intrinsic::round, MVT::v4f64, 9},
    {Intrinsic::round, MVT::v8f64, 9},
    {Intrinsic::round, MVT::v16f64, 9},
    {Intrinsic::round, MVT::nxv1f64, 9},
    {Intrinsic::round, MVT::nxv2f64, 9},
    {Intrinsic::round, MVT::nxv4f64, 9},
    {Intrinsic::round, MVT::nxv8f64, 9},
    {Intrinsic::roundeven, MVT::v2f32, 9},
    {Intrinsic::roundeven, MVT::v4f32, 9},
    {Intrinsic::roundeven, MVT::v8f32, 9},
    {Intrinsic::roundeven, MVT::v16f32, 9},
    {Intrinsic::roundeven, MVT::nxv1f32, 9},
    {Intrinsic::roundeven, MVT::nxv2f32, 9},
    {Intrinsic::roundeven, MVT::nxv4f32, 9},
    {Intrinsic::roundeven, MVT::nxv8f32, 9},
    {Intrinsic::roundeven, MVT::nxv16f32, 9},
    {Intrinsic::roundeven, MVT::v2f64, 9},
    {Intrinsic::roundeven, MVT::v4f64, 9},
    {Intrinsic::roundeven, MVT::v8f64, 9},
    {Intrinsic::roundeven, MVT::v16f64, 9},
    {Intrinsic::roundeven, MVT::nxv1f64, 9},
    {Intrinsic::roundeven, MVT::nxv2f64, 9},
    {Intrinsic::roundeven, MVT::nxv4f64, 9},
    {Intrinsic::roundeven, MVT::nxv8f64, 9},
    {Intrinsic::rint, MVT::v2f32, 7},
    {Intrinsic::rint, MVT::v4f32, 7},
    {Intrinsic::rint, MVT::v8f32, 7},
    {Intrinsic::rint, MVT::v16f32, 7},
    {Intrinsic::rint, MVT::nxv1f32, 7},
    {Intrinsic::rint, MVT::nxv2f32, 7},
    {Intrinsic::rint, MVT::nxv4f32, 7},
    {Intrinsic::rint, MVT::nxv8f32, 7},
    {Intrinsic::rint, MVT::nxv16f32, 7},
    {Intrinsic::rint, MVT::v2f64, 7},
    {Intrinsic::rint, MVT::v4f64, 7},
    {Intrinsic::rint, MVT::v8f64, 7},
    {Intrinsic::rint, MVT::v16f64, 7},
    {Intrinsic::rint, MVT::nxv1f64, 7},
    {Intrinsic::rint, MVT::nxv2f64, 7},
    {Intrinsic::rint, MVT::nxv4f64, 7},
    {Intrinsic::rint, MVT::nxv8f64, 7},
    {Intrinsic::lrint, MVT::v2i32, 1},
    {Intrinsic::lrint, MVT::v4i32, 1},
    {Intrinsic::lrint, MVT::v8i32, 1},
    {Intrinsic::lrint, MVT::v16i32, 1},
    {Intrinsic::lrint, MVT::nxv1i32, 1},
    {Intrinsic::lrint, MVT::nxv2i32, 1},
    {Intrinsic::lrint, MVT::nxv4i32, 1},
    {Intrinsic::lrint, MVT::nxv8i32, 1},
    {Intrinsic::lrint, MVT::nxv16i32, 1},
    {Intrinsic::lrint, MVT::v2i64, 1},
    {Intrinsic::lrint, MVT::v4i64, 1},
    {Intrinsic::lrint, MVT::v8i64, 1},
    {Intrinsic::lrint, MVT::v16i64, 1},
    {Intrinsic::lrint, MVT::nxv1i64, 1},
    {Intrinsic::lrint, MVT::nxv2i64, 1},
    {Intrinsic::lrint, MVT::nxv4i64, 1},
    {Intrinsic::lrint, MVT::nxv8i64, 1},
    {Intrinsic::llrint, MVT::v2i64, 1},
    {Intrinsic::llrint, MVT::v4i64, 1},
    {Intrinsic::llrint, MVT::v8i64, 1},
    {Intrinsic::llrint, MVT::v16i64, 1},
    {Intrinsic::llrint, MVT::nxv1i64, 1},
    {Intrinsic::llrint, MVT::nxv2i64, 1},
    {Intrinsic::llrint, MVT::nxv4i64, 1},
    {Intrinsic::llrint, MVT::nxv8i64, 1},
    {Intrinsic::nearbyint, MVT::v2f32, 9},
    {Intrinsic::nearbyint, MVT::v4f32, 9},
    {Intrinsic::nearbyint, MVT::v8f32, 9},
    {Intrinsic::nearbyint, MVT::v16f32, 9},
    {Intrinsic::nearbyint, MVT::nxv1f32, 9},
    {Intrinsic::nearbyint, MVT::nxv2f32, 9},
    {Intrinsic::nearbyint, MVT::nxv4f32, 9},
    {Intrinsic::nearbyint, MVT::nxv8f32, 9},
    {Intrinsic::nearbyint, MVT::nxv16f32, 9},
    {Intrinsic::nearbyint, MVT::v2f64, 9},
    {Intrinsic::nearbyint, MVT::v4f64, 9},
    {Intrinsic::nearbyint, MVT::v8f64, 9},
    {Intrinsic::nearbyint, MVT::v16f64, 9},
    {Intrinsic::nearbyint, MVT::nxv1f64, 9},
    {Intrinsic::nearbyint, MVT::nxv2f64, 9},
    {Intrinsic::nearbyint, MVT::nxv4f64, 9},
    {Intrinsic::nearbyint, MVT::nxv8f64, 9},
    {Intrinsic::bswap, MVT::v2i16, 3},
    {Intrinsic::bswap, MVT::v4i16, 3},
    {Intrinsic::bswap, MVT::v8i16, 3},
    {Intrinsic::bswap, MVT::v16i16, 3},
    {Intrinsic::bswap, MVT::nxv1i16, 3},
    {Intrinsic::bswap, MVT::nxv2i16, 3},
    {Intrinsic::bswap, MVT::nxv4i16, 3},
    {Intrinsic::bswap, MVT::nxv8i16, 3},
    {Intrinsic::bswap, MVT::nxv16i16, 3},
    {Intrinsic::bswap, MVT::v2i32, 12},
    {Intrinsic::bswap, MVT::v4i32, 12},
    {Intrinsic::bswap, MVT::v8i32, 12},
    {Intrinsic::bswap, MVT::v16i32, 12},
    {Intrinsic::bswap, MVT::nxv1i32, 12},
    {Intrinsic::bswap, MVT::nxv2i32, 12},
    {Intrinsic::bswap, MVT::nxv4i32, 12},
    {Intrinsic::bswap, MVT::nxv8i32, 12},
    {Intrinsic::bswap, MVT::nxv16i32, 12},
    {Intrinsic::bswap, MVT::v2i64, 31},
    {Intrinsic::bswap, MVT::v4i64, 31},
    {Intrinsic::bswap, MVT::v8i64, 31},
    {Intrinsic::bswap, MVT::v16i64, 31},
    {Intrinsic::bswap, MVT::nxv1i64, 31},
    {Intrinsic::bswap, MVT::nxv2i64, 31},
    {Intrinsic::bswap, MVT::nxv4i64, 31},
    {Intrinsic::bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_bswap, MVT::v2i16, 3},
    {Intrinsic::vp_bswap, MVT::v4i16, 3},
    {Intrinsic::vp_bswap, MVT::v8i16, 3},
    {Intrinsic::vp_bswap, MVT::v16i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
    {Intrinsic::vp_bswap, MVT::v2i32, 12},
    {Intrinsic::vp_bswap, MVT::v4i32, 12},
    {Intrinsic::vp_bswap, MVT::v8i32, 12},
    {Intrinsic::vp_bswap, MVT::v16i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
    {Intrinsic::vp_bswap, MVT::v2i64, 31},
    {Intrinsic::vp_bswap, MVT::v4i64, 31},
    {Intrinsic::vp_bswap, MVT::v8i64, 31},
    {Intrinsic::vp_bswap, MVT::v16i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_fshl, MVT::v2i8, 7},
    {Intrinsic::vp_fshl, MVT::v4i8, 7},
    {Intrinsic::vp_fshl, MVT::v8i8, 7},
    {Intrinsic::vp_fshl, MVT::v16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshl, MVT::v2i16, 7},
    {Intrinsic::vp_fshl, MVT::v4i16, 7},
    {Intrinsic::vp_fshl, MVT::v8i16, 7},
    {Intrinsic::vp_fshl, MVT::v16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshl, MVT::v2i32, 7},
    {Intrinsic::vp_fshl, MVT::v4i32, 7},
    {Intrinsic::vp_fshl, MVT::v8i32, 7},
    {Intrinsic::vp_fshl, MVT::v16i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshl, MVT::v2i64, 7},
    {Intrinsic::vp_fshl, MVT::v4i64, 7},
    {Intrinsic::vp_fshl, MVT::v8i64, 7},
    {Intrinsic::vp_fshl, MVT::v16i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
    {Intrinsic::vp_fshr, MVT::v2i8, 7},
    {Intrinsic::vp_fshr, MVT::v4i8, 7},
    {Intrinsic::vp_fshr, MVT::v8i8, 7},
    {Intrinsic::vp_fshr, MVT::v16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshr, MVT::v2i16, 7},
    {Intrinsic::vp_fshr, MVT::v4i16, 7},
    {Intrinsic::vp_fshr, MVT::v8i16, 7},
    {Intrinsic::vp_fshr, MVT::v16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshr, MVT::v2i32, 7},
    {Intrinsic::vp_fshr, MVT::v4i32, 7},
    {Intrinsic::vp_fshr, MVT::v8i32, 7},
    {Intrinsic::vp_fshr, MVT::v16i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshr, MVT::v2i64, 7},
    {Intrinsic::vp_fshr, MVT::v4i64, 7},
    {Intrinsic::vp_fshr, MVT::v8i64, 7},
    {Intrinsic::vp_fshr, MVT::v16i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
    {Intrinsic::bitreverse, MVT::v2i8, 17},
    {Intrinsic::bitreverse, MVT::v4i8, 17},
    {Intrinsic::bitreverse, MVT::v8i8, 17},
    {Intrinsic::bitreverse, MVT::v16i8, 17},
    {Intrinsic::bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::bitreverse, MVT::v2i16, 24},
    {Intrinsic::bitreverse, MVT::v4i16, 24},
    {Intrinsic::bitreverse, MVT::v8i16, 24},
    {Intrinsic::bitreverse, MVT::v16i16, 24},
    {Intrinsic::bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::bitreverse, MVT::v2i32, 33},
    {Intrinsic::bitreverse, MVT::v4i32, 33},
    {Intrinsic::bitreverse, MVT::v8i32, 33},
    {Intrinsic::bitreverse, MVT::v16i32, 33},
    {Intrinsic::bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::bitreverse, MVT::v2i64, 52},
    {Intrinsic::bitreverse, MVT::v4i64, 52},
    {Intrinsic::bitreverse, MVT::v8i64, 52},
    {Intrinsic::bitreverse, MVT::v16i64, 52},
    {Intrinsic::bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::ctpop, MVT::v2i8, 12},
    {Intrinsic::ctpop, MVT::v4i8, 12},
    {Intrinsic::ctpop, MVT::v8i8, 12},
    {Intrinsic::ctpop, MVT::v16i8, 12},
    {Intrinsic::ctpop, MVT::nxv1i8, 12},
    {Intrinsic::ctpop, MVT::nxv2i8, 12},
    {Intrinsic::ctpop, MVT::nxv4i8, 12},
    {Intrinsic::ctpop, MVT::nxv8i8, 12},
    {Intrinsic::ctpop, MVT::nxv16i8, 12},
    {Intrinsic::ctpop, MVT::v2i16, 19},
    {Intrinsic::ctpop, MVT::v4i16, 19},
    {Intrinsic::ctpop, MVT::v8i16, 19},
    {Intrinsic::ctpop, MVT::v16i16, 19},
    {Intrinsic::ctpop, MVT::nxv1i16, 19},
    {Intrinsic::ctpop, MVT::nxv2i16, 19},
    {Intrinsic::ctpop, MVT::nxv4i16, 19},
    {Intrinsic::ctpop, MVT::nxv8i16, 19},
    {Intrinsic::ctpop, MVT::nxv16i16, 19},
    {Intrinsic::ctpop, MVT::v2i32, 20},
    {Intrinsic::ctpop, MVT::v4i32, 20},
    {Intrinsic::ctpop, MVT::v8i32, 20},
    {Intrinsic::ctpop, MVT::v16i32, 20},
    {Intrinsic::ctpop, MVT::nxv1i32, 20},
    {Intrinsic::ctpop, MVT::nxv2i32, 20},
    {Intrinsic::ctpop, MVT::nxv4i32, 20},
    {Intrinsic::ctpop, MVT::nxv8i32, 20},
    {Intrinsic::ctpop, MVT::nxv16i32, 20},
    {Intrinsic::ctpop, MVT::v2i64, 21},
    {Intrinsic::ctpop, MVT::v4i64, 21},
    {Intrinsic::ctpop, MVT::v8i64, 21},
    {Intrinsic::ctpop, MVT::v16i64, 21},
    {Intrinsic::ctpop, MVT::nxv1i64, 21},
    {Intrinsic::ctpop, MVT::nxv2i64, 21},
    {Intrinsic::ctpop, MVT::nxv4i64, 21},
    {Intrinsic::ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v2i8, 12},
    {Intrinsic::vp_ctpop, MVT::v4i8, 12},
    {Intrinsic::vp_ctpop, MVT::v8i8, 12},
    {Intrinsic::vp_ctpop, MVT::v16i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
    {Intrinsic::vp_ctpop, MVT::v2i16, 19},
    {Intrinsic::vp_ctpop, MVT::v4i16, 19},
    {Intrinsic::vp_ctpop, MVT::v8i16, 19},
    {Intrinsic::vp_ctpop, MVT::v16i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
    {Intrinsic::vp_ctpop, MVT::v2i32, 20},
    {Intrinsic::vp_ctpop, MVT::v4i32, 20},
    {Intrinsic::vp_ctpop, MVT::v8i32, 20},
    {Intrinsic::vp_ctpop, MVT::v16i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
    {Intrinsic::vp_ctpop, MVT::v2i64, 21},
    {Intrinsic::vp_ctpop, MVT::v4i64, 21},
    {Intrinsic::vp_ctpop, MVT::v8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v16i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctlz, MVT::v2i8, 19},
    {Intrinsic::vp_ctlz, MVT::v4i8, 19},
    {Intrinsic::vp_ctlz, MVT::v8i8, 19},
    {Intrinsic::vp_ctlz, MVT::v16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
    {Intrinsic::vp_ctlz, MVT::v2i16, 28},
    {Intrinsic::vp_ctlz, MVT::v4i16, 28},
    {Intrinsic::vp_ctlz, MVT::v8i16, 28},
    {Intrinsic::vp_ctlz, MVT::v16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
    {Intrinsic::vp_ctlz, MVT::v2i32, 31},
    {Intrinsic::vp_ctlz, MVT::v4i32, 31},
    {Intrinsic::vp_ctlz, MVT::v8i32, 31},
    {Intrinsic::vp_ctlz, MVT::v16i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
    {Intrinsic::vp_ctlz, MVT::v2i64, 35},
    {Intrinsic::vp_ctlz, MVT::v4i64, 35},
    {Intrinsic::vp_ctlz, MVT::v8i64, 35},
    {Intrinsic::vp_ctlz, MVT::v16i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
    {Intrinsic::vp_cttz, MVT::v2i8, 16},
    {Intrinsic::vp_cttz, MVT::v4i8, 16},
    {Intrinsic::vp_cttz, MVT::v8i8, 16},
    {Intrinsic::vp_cttz, MVT::v16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
    {Intrinsic::vp_cttz, MVT::v2i16, 23},
    {Intrinsic::vp_cttz, MVT::v4i16, 23},
    {Intrinsic::vp_cttz, MVT::v8i16, 23},
    {Intrinsic::vp_cttz, MVT::v16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
    {Intrinsic::vp_cttz, MVT::v2i32, 24},
    {Intrinsic::vp_cttz, MVT::v4i32, 24},
    {Intrinsic::vp_cttz, MVT::v8i32, 24},
    {Intrinsic::vp_cttz, MVT::v16i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
    {Intrinsic::vp_cttz, MVT::v2i64, 25},
    {Intrinsic::vp_cttz, MVT::v4i64, 25},
    {Intrinsic::vp_cttz, MVT::v8i64, 25},
    {Intrinsic::vp_cttz, MVT::v16i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
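    // i.e. the 5-instruction vp_rint-style sequence plus an FRM write and an
    // FRM restore around the conversion.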
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    auto LT = getTypeLegalizationCost(RetTy);
    if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                            ICA.getID(), LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELen() ||
        Dst->getScalarSizeInBits() > ST->getELen())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from a mask vector.
        // Instead we use the following instructions:
        //   vmv.v.i v8, 0
        //   vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use a chain of vncvt instructions to truncate to a mask
        // vector, so we cannot use PowDiff to calculate the cost.
        // Instead we use the following instructions:
        //   vand.vi v8, v8, 1
        //   vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of converting from or to a mask vector is different from
        // the other cases, so we cannot use PowDiff to calculate it.
        // For mask vector to fp, we use the following instructions:
        //   vmv.v.i v8, 0
        //   vmerge.vim v8, v8, -1, v0
        //   vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        //   vfncvt.rtz.x.f.w v9, v8
        //   vand.vi v8, v9, 1
        //   vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
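      // e.g. fptosi <N x double> to <N x i8> is lowered to a chain of three
      // narrowing converts, matching abs(PowDiff) == 3.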
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough info here, so we slightly overestimate.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
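  // (One vmv.s.x/vfmv.s.f seeds the start value, and one vmv.x.s/vfmv.f.s
  // reads the scalar result back out.)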
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.v.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory ops cost scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
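  // For example, a memory op on a type that legalizes to an LMUL=4 vector is
  // modeled as roughly four times the cost of the equivalent LMUL=1 op.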
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector())
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Supported natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // implementation, which will calculate the cost as
    //   ScalarizeCost + Num * Cost for fixed vectors, and
    //   InvalidCost for scalable vectors.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
    // Supported natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a load/store sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For an unsupported scalable vector, propagate the invalid cost.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement, we additionally need an addi to compute Index + 1 for
  // the vslideup sequence.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
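    // For example, extracting element 0 of a <4 x i32> value needs only a
    // single vmv.x.s, so no slide cost is charged below.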
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
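  // For example, in 'add <vscale x 2 x i32> %x, splat (i32 3)' the splatted
  // immediate can be folded into vadd.vi, so the lambda above treats it as
  // free, whereas a non-uniform constant vector is costed as a constant-pool
  // load.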
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only (although
  // an alloca instruction, a value, constants and/or constant expressions,
  // PHIs, bitcasts ... whatever is allowed to be used as a pointer may also
  // appear here). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, all the rest are either
  // GEP instructions, PHIs, bitcasts or constants. When the pointers share
  // the same base, we just calculate the cost of each non-Base GEP as an ADD
  // operation if any of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as the sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics; changes as needed would
  // apply to all settings below to improve performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
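  // For instance, a loop whose trip count is not a compile-time constant but
  // has a small known upper bound can still be fully unrolled using that
  // bound.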
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exiting block other than the latch. This acts as an early
  // exit as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing unrolling of small loops can be very useful because of the
  // taken-branch cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in the LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
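  // For example, a 128-bit fixed-width register with 32-bit elements yields a
  // maximum VF of 4; if RegWidth is 0 (no usable vector registers), the clamp
  // below returns 1.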
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give the instruction count first
  // priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}