//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include <cmath>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKind
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
    case RISCV::VFMV_F_S:
    case RISCV::VFMV_S_F:
    case RISCV::VMOR_MM:
    case RISCV::VMXOR_MM:
    case RISCV::VMAND_MM:
    case RISCV::VMANDN_MM:
    case RISCV::VMNAND_MM:
    case RISCV::VCPOP_M:
    case RISCV::VFIRST_M:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

static InstructionCost getIntImmCostImpl(const DataLayout &DL,
                                         const RISCVSubtarget *ST,
                                         const APInt &Imm, Type *Ty,
                                         TTI::TargetCostKind CostKind,
                                         bool FreeZeroes) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
                                    /*CompressionCost=*/false, FreeZeroes);
}

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative; in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::Store: {
    // Use the materialization cost regardless of whether it is the address or
    // the value that is constant, except when the store is misaligned and
    // misaligned accesses are not legal (experience shows constant hoisting
    // can sometimes be harmful in such cases).
    if (Idx == 1 || !Inst)
      return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
                               /*FreeZeroes=*/true);

    StoreInst *ST = cast<StoreInst>(Inst);
    if (!getTLI()->allowsMemoryAccessForAlignment(
            Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
            ST->getPointerAddressSpace(), ST->getAlign()))
      return TTI::TCC_Free;

    return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
                             /*FreeZeroes=*/true);
  }
  case Instruction::Load:
    // If the address is a constant, use the materialization cost.
    return getIntImmCost(Imm, Ty, CostKind);
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
  return ST->hasVInstructions();
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV.
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
  unsigned Size = Mask.size();
  if (!isPowerOf2_32(Size))
    return false;
  for (unsigned I = 0; I != Size; ++I) {
    if (static_cast<unsigned>(Mask[I]) == I)
      continue;
    if (Mask[I] != 0)
      return false;
    if (Size % I != 0)
      return false;
    for (unsigned J = I + 1; J != Size; ++J)
      // Check the pattern is repeated.
      if (static_cast<unsigned>(Mask[J]) != J % I)
        return false;
    SubVectorSize = I;
    return true;
  }
  // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
  return false;
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

/// Try to perform better estimation of the permutation.
/// 1. Split the source/destination vectors into real registers.
/// 2. Do the mask analysis to identify which real registers are
///    permuted. If more than 1 source registers are used for the
///    destination register building, the cost for this destination register
///    is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
///    source register is used, build mask and calculate the cost as a cost
///    of PermuteSingleSrc.
///    Also, for the single register permute we try to identify if the
///    destination register is just a copy of the source register or the
///    copy of the previous destination register (the cost is
///    TTI::TCC_Basic). If the source register is just reused, the cost for
///    this operation is 0.
static InstructionCost
costShuffleViaVRegSplitting(RISCVTTIImpl &TTI, MVT LegalVT,
                            std::optional<unsigned> VLen, VectorType *Tp,
                            ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
  InstructionCost NumOfDests = InstructionCost::getInvalid();
  if (VLen && LegalVT.isFixedLengthVector() && !Mask.empty()) {
    MVT ElemVT = LegalVT.getVectorElementType();
    unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
    LegalVT = TTI.getTypeLegalizationCost(
                     FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
                  .second;
    // Number of destination vectors after legalization:
    NumOfDests = divideCeil(Mask.size(), LegalVT.getVectorNumElements());
  }
  if (!NumOfDests.isValid() || NumOfDests <= 1 ||
      !LegalVT.isFixedLengthVector() ||
      LegalVT.getVectorElementType().getSizeInBits() !=
          Tp->getElementType()->getPrimitiveSizeInBits() ||
      LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
    return InstructionCost::getInvalid();

  unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  // Number of source vectors after legalization:
  unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);

  auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
                                          LegalVT.getVectorNumElements());

  unsigned E = *NumOfDests.getValue();
  unsigned NormalizedVF =
      LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
  unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
  unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
  SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
  assert(NormalizedVF >= Mask.size() &&
         "Normalized mask expected to be not shorter than original mask.");
  copy(Mask, NormalizedMask.begin());
  InstructionCost Cost = 0;
  SmallBitVector ExtractedRegs(2 * NumOfSrcRegs);
  int NumShuffles = 0;
  processShuffleMasks(
      NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
      [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
        if (ExtractedRegs.test(SrcReg)) {
          Cost += TTI.getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
                                     (SrcReg % NumOfSrcRegs) *
                                         SingleOpTy->getNumElements(),
                                     SingleOpTy);
          ExtractedRegs.set(SrcReg);
        }
        if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
          ++NumShuffles;
          Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
                                     RegMask, CostKind, 0, nullptr);
          return;
        }
      },
      [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
        if (ExtractedRegs.test(Idx1)) {
          Cost += TTI.getShuffleCost(
              TTI::SK_ExtractSubvector, Tp, {}, CostKind,
              (Idx1 % NumOfSrcRegs) * SingleOpTy->getNumElements(), SingleOpTy);
          ExtractedRegs.set(Idx1);
        }
        if (ExtractedRegs.test(Idx2)) {
          Cost += TTI.getShuffleCost(
              TTI::SK_ExtractSubvector, Tp, {}, CostKind,
              (Idx2 % NumOfSrcRegs) * SingleOpTy->getNumElements(), SingleOpTy);
          ExtractedRegs.set(Idx2);
        }
        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
                                   CostKind, 0, nullptr);
        NumShuffles += 2;
      });
  // Note: check that we do not emit too many shuffles here to prevent code
  // size explosion.
  // TODO: Investigate whether this can be improved by extra analysis of the
  // masks to check if the code is more profitable.
  if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
      (NumOfDestRegs <= 2 && NumShuffles < 4))
    return Cost;
  return InstructionCost::getInvalid();
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args,
                                             const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (ST->hasVInstructions() && isa<FixedVectorType>(Tp)) {
    InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
        *this, LT.second, ST->getRealVLen(), Tp, Mask, CostKind);
    if (VRegSplittingCost.isValid())
      return VRegSplittingCost;
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves
        // and deinterleaves of 2 vectors can be lowered into the following
        // sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
        int SubVectorSize;
        if (LT.second.getScalarSizeInBits() != 1 &&
            isRepeatedConcatMask(Mask, SubVectorSize)) {
          InstructionCost Cost = 0;
          unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
          // The cost of extraction from a subvector is 0 if the index is 0.
          for (unsigned I = 0; I != NumSlides; ++I) {
            unsigned InsertIndex = SubVectorSize * (1 << I);
            FixedVectorType *SubTp =
                FixedVectorType::get(Tp->getElementType(), InsertIndex);
            FixedVectorType *DestTp =
                FixedVectorType::getDoubleElementsVectorType(SubTp);
            std::pair<InstructionCost, MVT> DestLT =
                getTypeLegalizationCost(DestTp);
            // Add the cost of whole vector register move because the
            // destination vector register group for vslideup cannot overlap
            // the source.
            Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
            Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
                                   CostKind, InsertIndex, SubTp);
          }
          return Cost;
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Providing an accurate cost only for splits
      // where the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
             I < NumSrcRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(
              Mask.slice(I * SubVF,
                         I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
              SubMask.begin(), [&](int I) -> int {
                if (I == PoisonMaskElem)
                  return PoisonMaskElem;
                bool SingleSubVector = I / VF == 0;
                IsSingleVector &= SingleSubVector;
                return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
              });
          if (all_of(enumerate(SubMask), [](auto &&P) {
                return P.value() == PoisonMaskElem ||
                       static_cast<unsigned>(P.value()) == P.index();
              }))
            continue;
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  };

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Extract at zero is always a subregister extract.
    if (Index == 0)
      return TTI::TCC_Free;

    // If we're extracting a subvector of at most m1 size at a sub-register
    // boundary - which unfortunately we need exact vlen to identify - this is
    // a subregister extract at worst and thus won't require a vslidedown.
    // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
    // TODO: Extend for scalable subvector types
    if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
        SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
      const unsigned MinVLen = ST->getRealMinVLen();
      const unsigned MaxVLen = ST->getRealMaxVLen();
      if (MinVLen == MaxVLen &&
          SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
          SubLT.second.getSizeInBits() <= MinVLen)
        return TTI::TCC_Free;
    }

    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first *
               (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                            LT.second, CostKind));
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0

      return LT.first *
             (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                           RISCV::VMV_X_S, RISCV::VMV_V_X,
                                           RISCV::VMSNE_VI},
                                          LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices.
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
    if (LT.second.isFixedLengthVector() &&
        isInt<5>(LT.second.getVectorNumElements() - 1))
      Opcodes[1] = RISCV::VRSUB_VI;
    InstructionCost GatherCost =
        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    // A mask operation additionally requires an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

static bool isM1OrSmaller(MVT VT) {
  RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
  return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
          LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1);
}

InstructionCost RISCVTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
  if (isa<ScalableVectorType>(Ty))
    return InstructionCost::getInvalid();

  // A build_vector (which is m1 sized or smaller) can be done in no
  // worse than one vslide1down.vx per element in the type. We could
  // in theory do an explode_vector in the inverse manner, but our
  // lowering today does not have a first class node for this pattern.
  InstructionCost Cost = BaseT::getScalarizationOverhead(
      Ty, DemandedElts, Insert, Extract, CostKind);
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
    if (Ty->getScalarSizeInBits() == 1) {
      auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
      // Note: Implicit scalar anyextend is assumed to be free since the i1
      // must be stored in a GPR.
      return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
                                      CostKind) +
             getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                              TTI::CastContextHint::None, CostKind, nullptr);
    }

    assert(LT.second.isFixedLengthVector());
    MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
    if (isM1OrSmaller(ContainerVT)) {
      InstructionCost BV =
          cast<FixedVectorType>(Ty)->getNumElements() *
          getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
      if (BV < Cost)
        Cost = BV;
    }
  }
  return Cost;
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    auto *VTy = cast<VectorType>(VecTy);
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isVector()) {
      auto *SubVecTy =
          VectorType::get(VTy->getElementType(),
                          VTy->getElementCount().divideCoefficientBy(Factor));
      if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
          TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
                                            AddressSpace, DL)) {

        // Some processors optimize segment loads/stores as one wide memory op
        // + Factor * LMUL shuffle ops.
        if (ST->hasOptimizedSegmentLoadStore(Factor)) {
          InstructionCost Cost =
              getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
          MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
          Cost += Factor * TLI->getLMULCost(SubVecVT);
          return LT.first * Cost;
        }

        // Otherwise, the cost is proportional to the number of elements (VL *
        // Factor ops).
        InstructionCost MemOpCost =
            getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
                            CostKind, {TTI::OK_AnyValue, TTI::OP_None});
        unsigned NumLoads = getEstimatedVLFor(VTy);
        return NumLoads * MemOpCost;
      }
    }
  }

  // TODO: Return the cost of interleaved accesses for scalable vectors when
  // unable to convert to segment access instructions.
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();

  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
       !isLegalStridedLoadStore(DataTy, Alignment)) ||
      (Opcode != Instruction::Load && Opcode != Instruction::Store))
    return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if (CostKind == TTI::TCK_CodeSize)
    return TTI::TCC_Basic;

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost
RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  // FIXME: This is a property of the default vector convention, not
  // all possible calling conventions. Fixing that will require
  // some TTI API and SLP rework.
  InstructionCost Cost = 0;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (auto *Ty : Tys) {
    if (!Ty->isVectorTy())
      continue;
    Align A = DL.getPrefTypeAlign(Ty);
    Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
            getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
  }
  return Cost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::ctlz, MVT::i8, 19},
    {Intrinsic::ctlz, MVT::i16, 28},
    {Intrinsic::ctlz, MVT::i32, 31},
    {Intrinsic::ctlz, MVT::i64, 35},
    {Intrinsic::cttz, MVT::i8, 16},
    {Intrinsic::cttz, MVT::i16, 23},
    {Intrinsic::cttz, MVT::i32, 24},
    {Intrinsic::cttz, MVT::i64, 25},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    // We can't currently lower half or bfloat vector lrint/llrint.
    if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
        VecTy && VecTy->getElementType()->is16bitFPTy())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
      return LT.first;

    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::umin:
        Op = RISCV::VMINU_VV;
        break;
      case Intrinsic::umax:
        Op = RISCV::VMAXU_VV;
        break;
      case Intrinsic::smin:
        Op = RISCV::VMIN_VV;
        break;
      case Intrinsic::smax:
        Op = RISCV::VMAX_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::sadd_sat:
        Op = RISCV::VSADD_VV;
        break;
      case Intrinsic::ssub_sat:
        Op = RISCV::VSSUB_VV;
        break;
      case Intrinsic::uadd_sat:
        Op = RISCV::VSADDU_VV;
        break;
      case Intrinsic::usub_sat:
        Op = RISCV::VSSUBU_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::fabs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // lui a0, 8
      // addi a0, a0, -1
      // vsetvli a1, zero, e16, m1, ta, ma
      // vand.vx v8, v8, a0
      // f16 with zvfhmin and bf16 with zvfbfmin
      if (LT.second.getVectorElementType() == MVT::bf16 ||
          (LT.second.getVectorElementType() == MVT::f16 &&
           !ST->hasVInstructionsF16()))
        return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
                                                  CostKind) +
               2;
      else
        return LT.first *
               getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      SmallVector<unsigned, 4> ConvOp;
      SmallVector<unsigned, 2> FsqrtOp;
      MVT ConvType = LT.second;
      MVT FsqrtType = LT.second;
      // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
      // will be split.
      if (LT.second.getVectorElementType() == MVT::bf16) {
        if (LT.second == MVT::nxv32bf16) {
          ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
                    RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
          ConvType = MVT::nxv16f16;
          FsqrtType = MVT::nxv16f32;
        } else {
          ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V};
          FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
        }
      } else if (LT.second.getVectorElementType() == MVT::f16 &&
                 !ST->hasVInstructionsF16()) {
        if (LT.second == MVT::nxv32f16) {
          ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
                    RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
          ConvType = MVT::nxv16f16;
          FsqrtType = MVT::nxv16f32;
        } else {
          ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V};
          FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
        }
      } else {
        FsqrtOp = {RISCV::VFSQRT_V};
      }

      return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
                         getRISCVInstructionCost(ConvOp, ConvType, CostKind));
    }
    break;
  }
  case Intrinsic::cttz:
  case Intrinsic::ctlz:
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::cttz:
        Op = RISCV::VCTZ_V;
        break;
      case Intrinsic::ctlz:
        Op = RISCV::VCLZ_V;
        break;
      case Intrinsic::ctpop:
        Op = RISCV::VCPOP_V;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first *
             getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
                                     LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::get_active_lane_mask: {
    if (ST->hasVInstructions()) {
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);

      // vid.v v8 // considered hoisted
      // vsaddu.vx v8, v8, a0
      // vmsltu.vx v0, v8, a1
      return LT.first *
             getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
                                     LT.second, CostKind);
    }
    break;
  }
  // TODO: Add more intrinsics.
  case Intrinsic::stepvector: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal types involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (ST->hasVInstructions())
      return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
             (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
    return 1 + (LT.first - 1);
  }
  case Intrinsic::experimental_cttz_elts: {
    Type *ArgTy = ICA.getArgTypes()[0];
    EVT ArgType = TLI->getValueType(DL, ArgTy, true);
    if (getTLI()->shouldExpandCttzElements(ArgType))
      break;
    InstructionCost Cost = getRISCVInstructionCost(
        RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);

    // If zero_is_poison is false, then we will generate additional
    // cmp + select instructions to convert -1 to EVL.
    Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
    if (ICA.getArgs().size() > 1 &&
        cast<ConstantInt>(ICA.getArgs()[1])->isZero())
      Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
                                 CmpInst::ICMP_SLT, CostKind) +
              getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
                                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return Cost;
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_fneg: {
    std::optional<unsigned> FOp =
        VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
    assert(FOp.has_value());
    return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
    break;
  }
  case Intrinsic::vp_select: {
    Intrinsic::ID IID = ICA.getID();
    std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
    assert(FOp.has_value());
    return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }
  case Intrinsic::vp_merge:
    return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
                              ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
                              CostKind);
  case Intrinsic::experimental_vp_splat: {
    auto LT = getTypeLegalizationCost(RetTy);
    // TODO: Lower i1 experimental_vp_splat
    if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
      return InstructionCost::getInvalid();
    return LT.first *
           getRISCVInstructionCost(LT.second.isFloatingPoint() ?
                                       RISCV::VFMV_V_F : RISCV::VMV_V_X,
                                   LT.second, CostKind);
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy); LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry =
              CostTableLookup(VectorIntrinsicCostTable, ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
  if (!IsVectorType)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  // FIXME: Need to compute legalizing cost for illegal types. The current
  // code handles only legal types and those which can be trivially
  // promoted to legal.
  if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
      Dst->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

  // Handle i1 source and dest cases *before* calling logic in BasicTTI.
  // The shared implementation doesn't model vector widening during
  // legalization and instead assumes scalarization. In order to scalarize an
  // <N x i1> vector, we need to extend/trunc to/from i8. If we don't special
  // case this, we can get an infinite recursion cycle.
  switch (ISD) {
  default:
    break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    if (Src->getScalarSizeInBits() == 1) {
      // We do not use vsext/vzext to extend from a mask vector.
      // Instead we use the following instructions to extend from a mask
      // vector:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0 (repeated per split)
      return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
             DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
                                                   DstLT.second, CostKind) +
             DstLT.first - 1;
    }
    break;
  case ISD::TRUNCATE:
    if (Dst->getScalarSizeInBits() == 1) {
      // We do not use several vncvt instructions to truncate to a mask vector,
      // so we cannot use PowDiff to calculate the cost.
      // Instead we use the following instructions to truncate to a mask
      // vector:
      // vand.vi v8, v8, 1
      // vmsne.vi v0, v8, 0
      return SrcLT.first *
                 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
                                         SrcLT.second, CostKind) +
             SrcLT.first - 1;
    }
    break;
  };

  // Our actual lowering for the case where a wider legal type is available
  // uses promotion to the wider type. This is reflected in the result of
  // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
  // scalarized if the legalized Src and Dst are not equal sized.
  const DataLayout &DL = this->getDataLayout();
  if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
      !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
                           SrcLT.second.getSizeInBits()) ||
      !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
                           DstLT.second.getSizeInBits()))
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  // The split cost is handled by the base getCastInstrCost.
  assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");

  int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
                (int)Log2_32(SrcLT.second.getScalarSizeInBits());
  switch (ISD) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    if ((PowDiff < 1) || (PowDiff > 3))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
    unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
    unsigned Op =
        (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
  }
  case ISD::TRUNCATE:
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND: {
    // Counts of narrow/widen instructions.
    unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
    unsigned DstEltSize = DstLT.second.getScalarSizeInBits();

    unsigned Op = (ISD == ISD::TRUNCATE)    ? RISCV::VNSRL_WI
                  : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
                                            : RISCV::VFNCVT_F_F_W;
    InstructionCost Cost = 0;
    for (; SrcEltSize != DstEltSize;) {
      MVT ElementMVT = (ISD == ISD::TRUNCATE)
                           ? MVT::getIntegerVT(DstEltSize)
                           : MVT::getFloatingPointVT(DstEltSize);
      MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
      DstEltSize =
          (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
      Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
    }
    return Cost;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    unsigned IsSigned = ISD == ISD::FP_TO_SINT;
    unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
    unsigned FWCVT =
        IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
    unsigned FNCVT =
        IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();
    InstructionCost Cost = 0;
    if ((SrcEltSize == 16) &&
        (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
      // If the target only supports zvfhmin, or this is an fp16-to-i64
      // conversion, pre-widen to f32 and then convert f32 to integer.
      VectorType *VecF32Ty =
          VectorType::get(Type::getFloatTy(Dst->getContext()),
                          cast<VectorType>(Dst)->getElementCount());
      std::pair<InstructionCost, MVT> VecF32LT =
          getTypeLegalizationCost(VecF32Ty);
      Cost +=
          VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
                                                   VecF32LT.second, CostKind);
      Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
      return Cost;
    }
    if (DstEltSize == SrcEltSize)
      Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
    else if (DstEltSize > SrcEltSize)
      Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
    else { // (SrcEltSize > DstEltSize)
      // First do a narrowing conversion to an integer half the size, then
      // truncate if needed.
      MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
      MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
      Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
      if ((SrcEltSize / 2) > DstEltSize) {
        Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
        Cost +=
            getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
      }
    }
    return Cost;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: {
    unsigned IsSigned = ISD == ISD::SINT_TO_FP;
    unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
    unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
    unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();

    InstructionCost Cost = 0;
    if ((DstEltSize == 16) &&
        (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
      // If the target only supports Zvfhmin, or this is an i64-to-fp16
      // conversion, convert to f32 first and then narrow the f32 to f16.
      VectorType *VecF32Ty =
          VectorType::get(Type::getFloatTy(Dst->getContext()),
                          cast<VectorType>(Dst)->getElementCount());
      std::pair<InstructionCost, MVT> VecF32LT =
          getTypeLegalizationCost(VecF32Ty);
      Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
      Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
                                                       DstLT.second, CostKind);
      return Cost;
    }

    if (DstEltSize == SrcEltSize)
      Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
    else if (DstEltSize > SrcEltSize) {
      if ((DstEltSize / 2) > SrcEltSize) {
        VectorType *VecTy =
            VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
                            cast<VectorType>(Dst)->getElementCount());
        unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
        Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
      }
      Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
    } else
      Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
    return Cost;
  }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
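  // (ELEN is the widest vector element width the target supports, e.g. 64 with
  // the full V extension and 32 with Zve32*.)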
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1)) {
    // SelectionDAGBuilder does the following transforms:
    // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
    // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
    if (IID == Intrinsic::umax || IID == Intrinsic::smin)
      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
    else
      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
  }

  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
    SmallVector<unsigned, 3> Opcodes;
    InstructionCost ExtraCost = 0;
    switch (IID) {
    case Intrinsic::maximum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
                   RISCV::VFMV_F_S};
        // Cost of materializing the canonical NaN + branch:
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DstTy->getScalarSizeInBits();
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;

    case Intrinsic::minimum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
                   RISCV::VFMV_F_S};
        // Cost of materializing the canonical NaN + branch:
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;
    }
    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }

  // An IR reduction is composed of one RVV reduction instruction and a vmv.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (IID) {
  default:
    llvm_unreachable("Unsupported intrinsic");
  case Intrinsic::smax:
    SplitOp = RISCV::VMAX_VV;
    Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::smin:
    SplitOp = RISCV::VMIN_VV;
    Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umax:
    SplitOp = RISCV::VMAXU_VV;
    Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umin:
    SplitOp = RISCV::VMINU_VV;
    Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::maxnum:
    SplitOp = RISCV::VFMAX_VV;
    Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
    break;
  case Intrinsic::minnum:
    SplitOp = RISCV::VFMIN_VV;
    Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a splitting cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ?
                       (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  Type *ElementTy = Ty->getElementType();
  if (ElementTy->isIntegerTy(1)) {
    // Example sequences:
    // vfirst.m a0, v0
    // seqz a0, a0
    if (LT.second == MVT::v1i1)
      return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);

    if (ISD == ISD::AND) {
      // Example sequences:
      // vmand.mm v8, v9, v8 ; needed every time type is split
      // vmnot.m v8, v0 ; alias for vmnand
      // vcpop.m a0, v8
      // seqz a0, a0

      // See the discussion: https://github.com/llvm/llvm-project/pull/119160
      // For LMUL <= 8, there is no splitting,
      //   the sequences are vmnot, vcpop and seqz.
      // When LMUL > 8 and split = 1,
      //   the sequences are vmnand, vcpop and seqz.
      // When LMUL > 8 and split > 1,
      //   the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
      return ((LT.first > 2) ?
                  (LT.first - 2) : 0) *
                 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
    } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
      // Example sequences:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vmxor.mm v8, v0, v8 ; needed every time type is split
      // vcpop.m a0, v8
      // andi a0, a0, 1
      return (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
    } else {
      assert(ISD == ISD::OR);
      // Example sequences:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vmor.mm v8, v9, v8 ; needed every time type is split
      // vcpop.m a0, v0
      // snez a0, a0
      return (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
    }
  }

  // An IR reduction of or/and is composed of one vmv and one RVV reduction
  // instruction; the others are composed of two vmv and one RVV reduction
  // instruction.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (ISD) {
  case ISD::ADD:
    SplitOp = RISCV::VADD_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
    break;
  case ISD::OR:
    SplitOp = RISCV::VOR_VV;
    Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::XOR:
    SplitOp = RISCV::VXOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::AND:
    SplitOp = RISCV::VAND_VV;
    Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
    break;
  case ISD::FADD:
    // We can't promote f16/bf16 fadd reductions.
    if ((LT.second.getVectorElementType() == MVT::f16 &&
         !ST->hasVInstructionsF16()) ||
        LT.second.getVectorElementType() == MVT::bf16)
      return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
    if (TTI::requiresOrderedReduction(FMF)) {
      Opcodes.push_back(RISCV::VFMV_S_F);
      for (unsigned i = 0; i < LT.first.getValue(); i++)
        Opcodes.push_back(RISCV::VFREDOSUM_VS);
      Opcodes.push_back(RISCV::VFMV_F_S);
      return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    }
    SplitOp = RISCV::VFADD_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a splitting cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (IsUnsigned && Opcode == Instruction::Add &&
      LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
    // Represent vector_reduce_add(ZExt(<n x i1>)) as
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
    return LT.first *
           getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
  }

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.v.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

  InstructionCost BaseCost = [&]() {
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    // Our actual lowering for the case where a wider legal type is available
    // uses a VL-predicated load on the wider type. This is reflected in
    // the result of getTypeLegalizationCost, but BasicTTI assumes the
    // widened cases are scalarized.
    const DataLayout &DL = this->getDataLayout();
    if (Src->isVectorTy() && LT.second.isVector() &&
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits()))
      return Cost;

    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);
  }();

  // Assume the memory op cost scales with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
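  // For example, an LMUL=4 access touches four vector registers, so the base
  // cost below is scaled by the factor getLMULCost returns for the legalized
  // type.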
  if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info, I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info, I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info, I);

  auto GetConstantMatCost =
      [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform())
      // We return 0 because we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(ValTy, CostKind);
  };

  InstructionCost ConstantMatCost;
  if (Op1Info.isConstant())
    ConstantMatCost += GetConstantMatCost(Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += GetConstantMatCost(Op2Info);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return ConstantMatCost +
               LT.first *
                   getRISCVInstructionCost(
                       {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                       LT.second, CostKind);
      }
      // vselect and max/min are supported natively.
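      // A single vmerge.vvm per legalized part covers the vector select,
      // roughly:
      // vmerge.vvm v8, v9, v8, v0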
      return ConstantMatCost +
             LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
                                                CostKind);
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                         InterimVT, CostKind) +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                            LT.second, CostKind);
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return ConstantMatCost +
           LT.first * getRISCVInstructionCost(
                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
                          LT.second, CostKind);
  }

  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
      CmpInst::isIntPredicate(VecPred)) {
    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE,
    // provided they incur the same cost across all implementations.
    return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
                                                                LT.second,
                                                                CostKind);
  }

  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
      CmpInst::isFPPredicate(VecPred)) {

    // Use VMXOR_MM and VMXNOR_MM to generate an all-true/false mask.
    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
      return ConstantMatCost +
             getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);

    // If we do not support the input floating point vector type, use the base
    // implementation, which computes:
    //   ScalarizeCost + Num * Cost for a fixed vector,
    //   InvalidCost for a scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       Op1Info, Op2Info, I);

    // Assuming vector fp compare and mask instructions are all the same cost
    // until a need arises to differentiate them.
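    // For instance, FCMP_ONE below is modeled as two compares plus a mask
    // combine, the unordered inequalities as one compare plus a mask invert,
    // and the ordered single-condition predicates as a single compare.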
    switch (VecPred) {
    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
      return ConstantMatCost +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
                            LT.second, CostKind);

    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
                                         LT.second, CostKind);

    case CmpInst::FCMP_OEQ: // vmfeq.vv
    case CmpInst::FCMP_OGT: // vmflt.vv
    case CmpInst::FCMP_OGE: // vmfle.vv
    case CmpInst::FCMP_OLT: // vmflt.vv
    case CmpInst::FCMP_OLE: // vmfle.vv
    case CmpInst::FCMP_UNE: // vmfne.vv
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
    default:
      break;
    }
  }

  // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
  // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which
  // generates a conditional branch + mv. The cost of the scalar
  // (icmp + select) will therefore be (0 + select instruction cost).
  if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
      ValTy->isIntegerTy() && !I->user_empty()) {
    if (all_of(I->users(), [&](const User *U) {
          return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
                 U->getType()->isIntegerTy() &&
                 !isa<ConstantData>(U->getOperand(1)) &&
                 !isa<ConstantData>(U->getOperand(2));
        }))
      return 0;
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a loads/stores sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
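    // E.g. with a variable index into an N-element vector, extractelement is
    // modeled as N element stores plus one scalar load, and insertelement as
    // N (store + load) pairs plus one extra store.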
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement, we also need an addi to compute index+1 for the
  // vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // If exact VLEN is known, we will insert/extract into the appropriate
    // subvector with no additional subvector insert/extract cost.
    if (auto VLEN = ST->getRealVLen()) {
      unsigned EltSize = LT.second.getScalarSizeInBits();
      unsigned M1Max = *VLEN / EltSize;
      Index = Index % M1Max;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // When the vector needs to be split into multiple register groups and the
  // index exceeds a single vector register group, we need to insert/extract
  // the element via the stack.
  if (LT.first > 1 &&
      ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
                          LT.second.isScalableVector()))) {
    Type *ScalarType = Val->getScalarType();
    Align VecAlign = DL.getPrefTypeAlign(Val);
    Align SclAlign = DL.getPrefTypeAlign(ScalarType);
    // Extra addi for unknown index.
    InstructionCost IdxCost = Index == -1U ? 1 : 0;

    // Store all split vectors into stack and load the target element.
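    // (Roughly: extract = vector store + scalar load; insert = vector store +
    // scalar store + vector reload; plus an addi when the index is unknown.)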
    if (Opcode == Instruction::ExtractElement)
      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
             getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
                             CostKind) +
             IdxCost;

    // Store all split vectors into stack, store the target element, and load
    // the vectors back.
    return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
           getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
           getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
                           CostKind) +
           IdxCost;
  }

  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // f16 with only Zvfhmin, and bf16, will be promoted to f32.
  // FIXME: nxv32[b]f16 will be custom lowered and split.
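  // E.g. with only Zvfhmin, a vector f16 fadd is costed below as an fpext of
  // each operand to f32, the f32 operation, and an fptrunc of the result.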
  unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  InstructionCost CastCost = 0;
  if ((LT.second.getVectorElementType() == MVT::f16 ||
       LT.second.getVectorElementType() == MVT::bf16) &&
      TLI->getOperationAction(ISDOpcode, LT.second) ==
          TargetLoweringBase::LegalizeAction::Promote) {
    MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
    Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
    Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    // Add the cost of extending the arguments.
    CastCost += LT.first * Args.size() *
                getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
                                 TTI::CastContextHint::None, CostKind);
    // Add the cost of truncating the result.
    CastCost +=
        LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
                                    TTI::CastContextHint::None, CostKind);
    // Compute the cost of the op in the promoted type.
    LT.second = PromotedVT;
  }

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  unsigned Op;
  switch (ISDOpcode) {
  case ISD::ADD:
  case ISD::SUB:
    Op = RISCV::VADD_VV;
    break;
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    Op = RISCV::VSLL_VV;
    break;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
    break;
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
    Op = RISCV::VMUL_VV;
    break;
  case ISD::SDIV:
  case ISD::UDIV:
    Op = RISCV::VDIV_VV;
    break;
  case ISD::SREM:
  case ISD::UREM:
    Op = RISCV::VREM_VV;
    break;
  case ISD::FADD:
  case ISD::FSUB:
    Op = RISCV::VFADD_VV;
    break;
  case ISD::FMUL:
    Op = RISCV::VFMUL_VV;
    break;
  case ISD::FDIV:
    Op = RISCV::VFDIV_VV;
    break;
  case ISD::FNEG:
    Op = RISCV::VFSGNJN_VV;
    break;
  default:
    // Assuming all other instructions have the same cost until a need arises
    // to differentiate them.
    return CastCost + ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }

  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
  // ops are twice as expensive as integer ops. Do the same for vectors so
  // scalar floating point ops aren't cheaper than their vector equivalents.
  if (Ty->isFPOrFPVectorTy())
    InstrCost *= 2;
  return CastCost + ConstantMatCost + LT.first * InstrCost;
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
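// For example, in a unit-stride chain of GEPs off one base where each constant
// offset is legal in the addressing mode, the non-base GEPs are free below,
// while a non-base GEP with a non-constant index is costed as a scalar ADD.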
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we only take GEP instructions into account (although
  // other things may appear here: an alloca instruction, a value, constants
  // and/or constant expressions, PHIs, bitcasts ... whatever is allowed to be
  // used as a pointer). Typically, if Base is not a GEP instruction and all
  // the pointers are relative to the same base address, the rest are either
  // GEP instructions, PHIs, bitcasts or constants. When the base is the same,
  // we just calculate the cost of each non-Base GEP as an ADD operation if
  // any of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None}, {});
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics, with changes as needed,
  // would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;

  // Forcing unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  if (Ty->isVectorTy()) {
    // f16 with only Zvfhmin, and bf16, will be promoted to f32.
    Type *EltTy = cast<VectorType>(Ty)->getElementType();
    if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
        EltTy->isBFloatTy())
      Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
                           cast<VectorType>(Ty));

    TypeSize Size = DL.getTypeSizeInBits(Ty);
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in the LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

TTI::AddressingModeKind
RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
                                         ScalarEvolution *SE) const {
  if (ST->hasVendorXCVmem() && !ST->is64Bit())
    return TTI::AMK_PostIndexed;

  return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is "instruction count first priority".
  // If we need to emit adds inside the loop to add up base registers, then
  // we need at least one extra temporary register.
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;

  // FIXME: If it is an i8 vector and the element count exceeds 256, we should
  // scalarize these types with LMUL >= maximum fixed-length LMUL.
  if (VTy->getElementType()->isIntegerTy(8))
    if (VTy->getElementCount().getFixedValue() > 256)
      return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
             ST->getMaxLMULForFixedLengthVectors();
  return true;
}

bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;
  return true;
}

/// See if \p I should be considered for address type promotion. We check if
/// \p I is a sext with the right type that is used in memory accesses. If it
/// is used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign extended the same initial value.
/// A getelementptr is considered "complex" if it has more than 2 operands.
bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP as we
      // expect some computation to be merged if they are done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}

bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::ICmp:
  case Instruction::FCmp:
    return true;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Select:
    return Operand == 1;
  default:
    return false;
  }
}

bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
    return false;

  if (canSplatOperand(I->getOpcode(), Operand))
    return true;

  auto *II = dyn_cast<IntrinsicInst>(I);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::fma:
  case Intrinsic::vp_fma:
  case Intrinsic::fmuladd:
  case Intrinsic::vp_fmuladd:
    return Operand == 0 || Operand == 1;
  case Intrinsic::vp_shl:
  case Intrinsic::vp_lshr:
  case Intrinsic::vp_ashr:
  case Intrinsic::vp_udiv:
  case Intrinsic::vp_sdiv:
  case Intrinsic::vp_urem:
  case Intrinsic::vp_srem:
  case Intrinsic::ssub_sat:
  case Intrinsic::vp_ssub_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::vp_usub_sat:
  case Intrinsic::vp_select:
    return Operand == 1;
  // These intrinsics are commutative.
  case Intrinsic::vp_add:
  case Intrinsic::vp_mul:
  case Intrinsic::vp_and:
  case Intrinsic::vp_or:
  case Intrinsic::vp_xor:
  case Intrinsic::vp_fadd:
  case Intrinsic::vp_fmul:
  case Intrinsic::vp_icmp:
  case Intrinsic::vp_fcmp:
  case Intrinsic::smin:
  case Intrinsic::vp_smin:
  case Intrinsic::umin:
  case Intrinsic::vp_umin:
  case Intrinsic::smax:
  case Intrinsic::vp_smax:
  case Intrinsic::umax:
  case Intrinsic::vp_umax:
  case Intrinsic::sadd_sat:
  case Intrinsic::vp_sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::vp_uadd_sat:
  // These intrinsics have 'vr' versions.
  case Intrinsic::vp_sub:
  case Intrinsic::vp_fsub:
  case Intrinsic::vp_fdiv:
    return Operand == 0 || Operand == 1;
  default:
    return false;
  }
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// splats of scalars can fold into vector instructions.
bool RISCVTTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
    return false;

  // Don't sink splat operands unless the target prefers to sink them. Some
  // targets require S2V transfer buffers and we can run out of them copying
  // the same value repeatedly.
  // FIXME: It could still be worth doing if it would improve vector register
  // pressure and prevent a vector spill.
  if (!ST->sinkSplatOperands())
    return false;

  for (auto OpIdx : enumerate(I->operands())) {
    if (!canSplatOperand(I, OpIdx.index()))
      continue;

    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand.
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    // We are looking for a splat that can be sunk.
    if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                             m_Undef(), m_ZeroMask())))
      continue;

    // Don't sink i1 splats.
    if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across
    // GPR and vector registers.
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!canSplatOperand(Insn, U.getOperandNo()))
        return false;
    }

    Ops.push_back(&Op->getOperandUse(0));
    Ops.push_back(&OpIdx.value());
  }
  return true;
}

RISCVTTIImpl::TTI::MemCmpExpansionOptions
RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  // TODO: Enable expansion when unaligned access is not supported after we fix
  // issues in ExpandMemcmp.
  if (!ST->enableUnalignedScalarMem())
    return Options;

  if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
    return Options;

  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  if (ST->is64Bit()) {
    Options.LoadSizes = {8, 4, 2, 1};
    Options.AllowedTailExpansions = {3, 5, 6};
  } else {
    Options.LoadSizes = {4, 2, 1};
    Options.AllowedTailExpansions = {3};
  }
  return Options;
}